From c2b93e0699723700f886ce17bb65ffd771195a6d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 May 2013 11:28:31 -0400 Subject: cifs: only set ops for inodes in I_NEW state It's generally not safe to reset the inode ops once they've been set. In the case where the inode was originally thought to be a directory and then later found to be a DFS referral, this can lead to an oops when we try to trigger an inode op on it after changing the ops to the blank referral operations. Cc: Reported-and-Tested-by: Sachin Prabhu Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index fc3025199cb3..20efd81266c6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -171,7 +171,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) inode->i_flags |= S_AUTOMOUNT; - cifs_set_ops(inode); + if (inode->i_state & I_NEW) + cifs_set_ops(inode); } void -- cgit v1.2.2 From 3fab70c165795431f00ddf9be8b84ddd07bd1f8f Mon Sep 17 00:00:00 2001 From: Lingzhu Xiang Date: Fri, 10 May 2013 18:29:21 +0800 Subject: efivarfs: Never return ENOENT from firmware again Previously in 1fa7e69 efi_status_to_err() translated firmware status EFI_NOT_FOUND to -EIO instead of -ENOENT for efivarfs operations to avoid confusion. After refactoring in e14ab23, it is also used in other places where the translation may be unnecessary. So move the translation to efivarfs specific code. Also return EOF for reading zero-length files, which is what users would expect. Cc: Josh Boyer Cc: Jeremy Kerr Cc: Lee, Chun-Yi Cc: Andy Whitcroft Signed-off-by: Lingzhu Xiang Signed-off-by: Matt Fleming --- fs/efivarfs/file.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index bfb531564319..8dd524f32284 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -44,8 +44,11 @@ static ssize_t efivarfs_file_write(struct file *file, bytes = efivar_entry_set_get_size(var, attributes, &datasize, data, &set); - if (!set && bytes) + if (!set && bytes) { + if (bytes == -ENOENT) + bytes = -EIO; goto out; + } if (bytes == -ENOENT) { drop_nlink(inode); @@ -76,7 +79,14 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, int err; err = efivar_entry_size(var, &datasize); - if (err) + + /* + * efivarfs represents uncommitted variables with + * zero-length files. Reading them should return EOF. + */ + if (err == -ENOENT) + return 0; + else if (err) return err; data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); -- cgit v1.2.2 From de82b923012ff8790bcfff381eb3ca9069d00f49 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 14 May 2013 11:25:39 -0400 Subject: fuse: allocate for_background dio requests based on io->async state Commit 8b41e671 introduced explicit background checking for fuse_req structures with BUG_ON() checks for the appropriate type of request in in the associated send functions. Commit bcba24cc introduced the ability to send dio requests as background requests but does not update the request allocation based on the type of I/O request. As a result, a BUG_ON() triggers in the fuse_request_send_background() background path if an async I/O is sent. Allocate a request based on the async state of the fuse_io_priv to avoid the BUG. Signed-off-by: Brian Foster Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d1c9b85b3f58..fe191325fefa 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1278,7 +1278,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, iov_iter_init(&ii, iov, nr_segs, count, 0); - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + if (io->async) + req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii)); + else + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1314,7 +1317,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, break; if (count) { fuse_put_request(fc, req); - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + if (io->async) + req = fuse_get_req_for_background(fc, + fuse_iter_npages(&ii)); + else + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) break; } -- cgit v1.2.2 From c420276a532a10ef59849adc2681f45306166b89 Mon Sep 17 00:00:00 2001 From: Jim Schutt Date: Wed, 15 May 2013 13:03:35 -0500 Subject: ceph: add cpu_to_le32() calls when encoding a reconnect capability In his review, Alex Elder mentioned that he hadn't checked that num_fcntl_locks and num_flock_locks were properly decoded on the server side, from a le32 over-the-wire type to a cpu type. I checked, and AFAICS it is done; those interested can consult Locker::_do_cap_update() in src/mds/Locker.cc and src/include/encoding.h in the Ceph server code (git://github.com/ceph/ceph). I also checked the server side for flock_len decoding, and I believe that also happens correctly, by virtue of having been declared __le32 in struct ceph_mds_cap_reconnect, in src/include/ceph_fs.h. Cc: stable@vger.kernel.org # 3.4+ Signed-off-by: Jim Schutt Reviewed-by: Alex Elder --- fs/ceph/locks.c | 7 +++++-- fs/ceph/mds_client.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 202dd3d68be0..a80ed18d64ff 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -206,10 +206,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, int err = 0; int seen_fcntl = 0; int seen_flock = 0; + __le32 nlocks; dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); - err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32)); + nlocks = cpu_to_le32(num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); if (err) goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { @@ -229,7 +231,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, goto fail; } - err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32)); + nlocks = cpu_to_le32(num_flock_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); if (err) goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4f22671a5bd4..d9ca15255477 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2485,7 +2485,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, lock_flocks(); ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - rec.v2.flock_len = (2*sizeof(u32) + + rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + (num_fcntl_locks+num_flock_locks) * sizeof(struct ceph_filelock)); unlock_flocks(); -- cgit v1.2.2 From 39be95e9c8c0b5668c9f8806ffe29bf9f4bc0f40 Mon Sep 17 00:00:00 2001 From: Jim Schutt Date: Wed, 15 May 2013 13:03:35 -0500 Subject: ceph: ceph_pagelist_append might sleep while atomic Ceph's encode_caps_cb() worked hard to not call __page_cache_alloc() while holding a lock, but it's spoiled because ceph_pagelist_addpage() always calls kmap(), which might sleep. Here's the result: [13439.295457] ceph: mds0 reconnect start [13439.300572] BUG: sleeping function called from invalid context at include/linux/highmem.h:58 [13439.309243] in_atomic(): 1, irqs_disabled(): 0, pid: 12059, name: kworker/1:1 . . . [13439.376225] Call Trace: [13439.378757] [] __might_sleep+0xfc/0x110 [13439.384353] [] ceph_pagelist_append+0x120/0x1b0 [libceph] [13439.391491] [] ceph_encode_locks+0x89/0x190 [ceph] [13439.398035] [] ? _raw_spin_lock+0x49/0x50 [13439.403775] [] ? lock_flocks+0x15/0x20 [13439.409277] [] encode_caps_cb+0x41f/0x4a0 [ceph] [13439.415622] [] ? igrab+0x28/0x70 [13439.420610] [] ? iterate_session_caps+0xe8/0x250 [ceph] [13439.427584] [] iterate_session_caps+0x115/0x250 [ceph] [13439.434499] [] ? set_request_path_attr+0x2d0/0x2d0 [ceph] [13439.441646] [] send_mds_reconnect+0x238/0x450 [ceph] [13439.448363] [] ? ceph_mdsmap_decode+0x5e2/0x770 [ceph] [13439.455250] [] check_new_map+0x352/0x500 [ceph] [13439.461534] [] ceph_mdsc_handle_map+0x1bd/0x260 [ceph] [13439.468432] [] ? mutex_unlock+0xe/0x10 [13439.473934] [] extra_mon_dispatch+0x22/0x30 [ceph] [13439.480464] [] dispatch+0xbc/0x110 [libceph] [13439.486492] [] process_message+0x1ad/0x1d0 [libceph] [13439.493190] [] ? read_partial_message+0x3e8/0x520 [libceph] . . . [13439.587132] ceph: mds0 reconnect success [13490.720032] ceph: mds0 caps stale [13501.235257] ceph: mds0 recovery completed [13501.300419] ceph: mds0 caps renewed Fix it up by encoding locks into a buffer first, and when the number of encoded locks is stable, copy that into a ceph_pagelist. [elder@inktank.com: abbreviated the stack info a bit.] Cc: stable@vger.kernel.org # 3.4+ Signed-off-by: Jim Schutt Reviewed-by: Alex Elder --- fs/ceph/locks.c | 76 ++++++++++++++++++++++++++++++++-------------------- fs/ceph/mds_client.c | 65 +++++++++++++++++++++++--------------------- fs/ceph/super.h | 9 +++++-- 3 files changed, 89 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index a80ed18d64ff..ebbf680378e2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -191,29 +191,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) } /** - * Encode the flock and fcntl locks for the given inode into the pagelist. - * Format is: #fcntl locks, sequential fcntl locks, #flock locks, - * sequential flock locks. - * Must be called with lock_flocks() already held. - * If we encounter more of a specific lock type than expected, - * we return the value 1. + * Encode the flock and fcntl locks for the given inode into the ceph_filelock + * array. Must be called with lock_flocks() already held. + * If we encounter more of a specific lock type than expected, return -ENOSPC. */ -int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, - int num_fcntl_locks, int num_flock_locks) +int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct ceph_filelock cephlock; int err = 0; int seen_fcntl = 0; int seen_flock = 0; - __le32 nlocks; + int l = 0; dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); - nlocks = cpu_to_le32(num_fcntl_locks); - err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); - if (err) - goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_POSIX) { ++seen_fcntl; @@ -221,20 +215,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &cephlock); + err = lock_to_ceph_filelock(lock, &flocks[l]); if (err) goto fail; - err = ceph_pagelist_append(pagelist, &cephlock, - sizeof(struct ceph_filelock)); + ++l; } - if (err) - goto fail; } - - nlocks = cpu_to_le32(num_flock_locks); - err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); - if (err) - goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_FLOCK) { ++seen_flock; @@ -242,19 +228,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &cephlock); + err = lock_to_ceph_filelock(lock, &flocks[l]); if (err) goto fail; - err = ceph_pagelist_append(pagelist, &cephlock, - sizeof(struct ceph_filelock)); + ++l; } - if (err) - goto fail; } fail: return err; } +/** + * Copy the encoded flock and fcntl locks into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Returns zero on success. + */ +int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + int err = 0; + __le32 nlocks; + + nlocks = cpu_to_le32(num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + + nlocks = cpu_to_le32(num_flock_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, + &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); +out_fail: + return err; +} + /* * Given a pointer to a lock, convert it to a ceph filelock */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d9ca15255477..4d2920304be8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2478,39 +2478,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->flock) { int num_fcntl_locks, num_flock_locks; - struct ceph_pagelist_cursor trunc_point; - - ceph_pagelist_set_cursor(pagelist, &trunc_point); - do { - lock_flocks(); - ceph_count_locks(inode, &num_fcntl_locks, - &num_flock_locks); - rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); - unlock_flocks(); - - /* pre-alloc pagelist */ - ceph_pagelist_truncate(pagelist, &trunc_point); - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_pagelist_reserve(pagelist, - rec.v2.flock_len); - - /* encode locks */ - if (!err) { - lock_flocks(); - err = ceph_encode_locks(inode, - pagelist, - num_fcntl_locks, - num_flock_locks); - unlock_flocks(); - } - } while (err == -ENOSPC); + struct ceph_filelock *flocks; + +encode_again: + lock_flocks(); + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + unlock_flocks(); + flocks = kmalloc((num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + lock_flocks(); + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + unlock_flocks(); + if (err) { + kfree(flocks); + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + /* + * number of encoded locks is stable, so copy to pagelist + */ + rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, + num_flock_locks); + kfree(flocks); } else { err = ceph_pagelist_append(pagelist, &rec, reclen); } - out_free: kfree(path); out_dput: diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 8696be2ff679..7ccfdb4aea2e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -822,8 +822,13 @@ extern const struct export_operations ceph_export_ops; extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); -extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, - int p_locks, int f_locks); +extern int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, + int num_flock_locks); +extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks); extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); /* debugfs.c */ -- cgit v1.2.2 From 60b62978bc5e903cd487de34972fb30f76c74a2e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 30 Apr 2013 17:29:29 +0000 Subject: btrfs: annotate quota tree for lockdep Quota tree has been missing from lockdep annotations, though no warning has been seen in the wild. There's currently one entry that does not belong there, BTRFS_ORPHAN_OBJECTID. No such tree exists, it's probably a copy & paste mistake, the id is defined among tree ids. Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 6 +++--- fs/btrfs/disk-io.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 63c328a9ce95..2720d5555883 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -88,12 +88,12 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - /* holds quota configuration and tracking */ #define BTRFS_QUOTA_TREE_OBJECTID 8ULL +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4e9ebe1f1827..72b17276c255 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -152,7 +152,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, - { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" }, { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, -- cgit v1.2.2 From a52f4cd2b1a863a42c1cb268b1cddad451cdfede Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 1 May 2013 16:23:41 +0000 Subject: Btrfs: fix off-by-one in fiemap lock_extent/unlock_extent expect an exclusive end. Tested-by: David Sterba Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2ac518f90e4..3e6e410002e5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3989,7 +3989,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, @@ -4076,7 +4076,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, out_free: free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state, GFP_NOFS); return ret; } -- cgit v1.2.2 From 03b71c6ca6286625d8f1ed44aabab9b5bf5dac10 Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Mon, 6 May 2013 17:40:18 +0000 Subject: btrfs: don't stop searching after encountering the wrong item The search ioctl skips items that are too large for a result buffer, but inline items of a certain size occuring before any search result is found would trigger an overflow and stop the search entirely. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=57641 Cc: stable@vger.kernel.org Signed-off-by: Gabriel de Perthuis Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de4a2fcfb24..0f81d67cdc8d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1801,7 +1801,11 @@ static noinline int copy_to_sk(struct btrfs_root *root, item_off = btrfs_item_ptr_offset(leaf, i); item_len = btrfs_item_size_nr(leaf, i); - if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) + continue; + + if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE) item_len = 0; if (sizeof(sh) + item_len + *sk_offset > @@ -1810,10 +1814,6 @@ static noinline int copy_to_sk(struct btrfs_root *root, goto overflow; } - btrfs_item_key_to_cpu(leaf, key, i); - if (!key_in_sk(key, sk)) - continue; - sh.objectid = key->objectid; sh.offset = key->offset; sh.type = key->type; -- cgit v1.2.2 From 69a85bd87cc81bcbd36730d4a1214c12fdb8a548 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 8 May 2013 13:30:11 -0400 Subject: Btrfs: don't null pointer deref on abort I'm sorry, theres no excuse for this sort of work. We need to use root->leafsize since eb may be NULL. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 72b17276c255..e8b29da30154 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3808,7 +3808,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, while (start <= end) { eb = btrfs_find_tree_block(root, start, root->leafsize); - start += eb->len; + start += root->leafsize; if (!eb) continue; wait_on_extent_buffer_writeback(eb); -- cgit v1.2.2 From 73e1e61fb85ab206854b6d87ff31733628bb8d72 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 8 May 2013 16:44:57 -0400 Subject: Btrfs: remove warn on in free space cache writeout This catches block groups that are too large to properly cache. We deal with this case fine, so the warning just confuses users. Remove the warning. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ecca6c7375a6..6a8bb9c79674 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -920,10 +920,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Make sure we can fit our crcs into the first page */ if (io_ctl.check_crcs && - (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { - WARN_ON(1); + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) goto out_nospc; - } io_ctl_set_generation(&io_ctl, trans->transid); -- cgit v1.2.2 From b1c79e0947e0c190f865e2eb7b84a0fea0021cec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 9 May 2013 13:49:30 -0400 Subject: Btrfs: handle running extent ops with skinny metadata Chris hit a bug where we weren't finding extent records when running extent ops. This is because we use the delayed_ref_head when running the extent op, which means we can't use the ->type checks to see if we are metadata. We also lose the level of the metadata we are working on. So to fix this we can just check the ->is_data section of the extent_op, and we can store the level of the buffer we were modifying in the extent_op. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 4 +++- fs/btrfs/ctree.h | 2 +- fs/btrfs/delayed-ref.h | 1 + fs/btrfs/extent-tree.c | 15 +++++++-------- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index de6de8e60b46..02fae7f7e42c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -951,10 +951,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { + int level = btrfs_header_level(buf); + ret = btrfs_set_disk_extent_flags(trans, root, buf->start, buf->len, - new_flags, 0); + new_flags, level, 0); if (ret) return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2720d5555883..d6dd49b51ba8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3075,7 +3075,7 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data); + int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index f75fcaf79aeb..70b962cc177d 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -60,6 +60,7 @@ struct btrfs_delayed_ref_node { struct btrfs_delayed_extent_op { struct btrfs_disk_key key; u64 flags_to_set; + int level; unsigned int update_key:1; unsigned int update_flags:1; unsigned int is_data:1; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2305b5c5cf00..c4c94b30c729 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2070,8 +2070,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; - int metadata = (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY); + int metadata = !extent_op->is_data; if (trans->aborted) return 0; @@ -2086,11 +2085,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, key.objectid = node->bytenr; if (metadata) { - struct btrfs_delayed_tree_ref *tree_ref; - - tree_ref = btrfs_delayed_node_to_tree_ref(node); key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = tree_ref->level; + key.offset = extent_op->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = node->num_bytes; @@ -2719,7 +2715,7 @@ out: int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data) + int level, int is_data) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2732,6 +2728,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->update_key = 0; extent_op->is_data = is_data ? 1 : 0; + extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, num_bytes, extent_op); @@ -6763,6 +6760,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, extent_op->update_key = 1; extent_op->update_flags = 1; extent_op->is_data = 0; + extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, ins.objectid, @@ -6934,7 +6932,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, - eb->len, flag, 0); + eb->len, flag, + btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } -- cgit v1.2.2 From 49688107527a24b0ed3780576257a1225902180b Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 7 May 2013 17:28:03 +0000 Subject: Btrfs: don't allow device replace on RAID5/RAID6 This is not yet supported and causes crashes. One sad user reported that it destroyed his filesystem. One failure is in __btrfs_map_block+0xc1f calling kmalloc(0). 0x5f21f is in __btrfs_map_block (fs/btrfs/volumes.c:4923). 4918 num_stripes = map->num_stripes; 4919 max_errors = nr_parity_stripes(map); 4920 4921 raid_map = kmalloc(sizeof(u64) * num_stripes, 4922 GFP_NOFS); 4923 if (!raid_map) { 4924 ret = -ENOMEM; 4925 goto out; 4926 } 4927 There might be more issues. Until this is really tested, don't allow users to start the procedure on RAID5/RAID6 filesystems. Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik --- fs/btrfs/dev-replace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7ba7b3900cb8..65241f32d3f8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -313,6 +313,11 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; + if (btrfs_fs_incompat(fs_info, RAID56)) { + pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); + return -EINVAL; + } + switch (args->start.cont_reading_from_srcdev_mode) { case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: -- cgit v1.2.2 From c16c2e2e51c2f0951fffa73c343b8fcb641108ba Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 8 May 2013 08:10:25 +0000 Subject: Btrfs: fix possible memory leak in the find_parent_nodes() In the find_parent_nodes(), if read_tree_block() fails, we can not return directly, we should free some allocated memory otherwise memory leak happens. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik --- fs/btrfs/backref.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b4fb41558111..290e347b6db3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -918,7 +918,8 @@ again: ref->parent, bsz, 0); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); - return -EIO; + ret = -EIO; + goto out; } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); -- cgit v1.2.2 From 379cde741b220091d2124fb500b178b90ad7f460 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 8 May 2013 08:56:09 +0000 Subject: Btrfs: fix possible memory leak in replace_path() In replace_path(), if read_tree_block() fails, we cannot return directly, we should free some allocated memory otherwise memory leak happens. Similar to Wang's "Btrfs: fix possible memory leak in the find_parent_nodes()" patch, the current commit fixes an issue that is related to the "Btrfs: fix all callers of read_tree_block" commit. Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 704a1b8d2a2b..5c5b8bb44ee5 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1773,7 +1773,7 @@ again: if (!eb || !extent_buffer_uptodate(eb)) { ret = (!eb) ? -ENOMEM : -EIO; free_extent_buffer(eb); - return ret; + break; } btrfs_tree_lock(eb); if (cow) { -- cgit v1.2.2 From 8250dabedb633e162bce89f2aacf5e65fa9e6464 Mon Sep 17 00:00:00 2001 From: Andreas Philipp Date: Sat, 11 May 2013 11:13:03 +0000 Subject: Correct allowed raid levels on balance. Raid5 with 3 devices is well defined while the old logic allowed raid5 only with a minimum of 4 devices when converting the block group profile via btrfs balance. Creating a raid5 with just three devices using mkfs.btrfs worked always as expected. This is now fixed and the whole logic is rewritten. Signed-off-by: Andreas Philipp Signed-off-by: Josef Bacik --- fs/btrfs/volumes.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a191bac31d85..062e93069c5e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3120,14 +3120,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; if (num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (num_devices < 4) + else if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - else - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6); - + if (num_devices > 2) + allowed |= BTRFS_BLOCK_GROUP_RAID5; + if (num_devices > 3) + allowed |= (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID6); if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || (bctl->data.target & ~allowed))) { -- cgit v1.2.2 From 7cfa9e51d2948ae90e7599cc114dcce2c7c2b1fc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 13 May 2013 13:55:08 +0000 Subject: Btrfs: don't abort the current transaction if there is no enough space for inode cache The filesystem with inode cache was forced to be read-only when we umounted it. Steps to reproduce: # mkfs.btrfs -f ${DEV} # mount -o inode_cache ${DEV} ${MNT} # dd if=/dev/zero of=${MNT}/file1 bs=1M count=8192 # btrfs fi syn ${MNT} # dd if=${MNT}/file1 of=/dev/null bs=1M # rm -f ${MNT}/file1 # btrfs fi syn ${MNT} # umount ${MNT} It is because there was no enough space to do inode cache truncation, and then we aborted the current transaction. But no space error is not a serious problem when we write out the inode cache, and it is safe that we just skip this step if we meet this problem. So we need not abort the current transaction. Reported-by: Tsutomu Itoh Signed-off-by: Miao Xie Tested-by: Tsutomu Itoh Signed-off-by: Josef Bacik --- fs/btrfs/inode-map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d26f67a59e36..9818d4a3f829 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -468,7 +468,8 @@ again: if (i_size_read(inode) > 0) { ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret != -ENOSPC) + btrfs_abort_transaction(trans, root, ret); goto out_put; } } -- cgit v1.2.2 From 7b61cd92242542944fc27024900c495a6a7b3396 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 13 May 2013 13:55:09 +0000 Subject: Btrfs: don't use global block reservation for inode cache truncation It is very likely that there are lots of subvolumes/snapshots in the filesystem, so if we use global block reservation to do inode cache truncation, we may hog all the free space that is reserved in global rsv. So it is better that we do the free space reservation for inode cache truncation by ourselves. Cc: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 +++++ fs/btrfs/free-space-cache.c | 39 +++++++++++++++++++-------------------- fs/btrfs/free-space-cache.h | 2 ++ fs/btrfs/inode-map.c | 5 +++-- fs/btrfs/relocation.c | 5 +++++ 5 files changed, 34 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c4c94b30c729..162a66bfbffd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3106,6 +3106,11 @@ again: WARN_ON(ret); if (i_size_read(inode) > 0) { + ret = btrfs_check_trunc_cache_free_space(root, + &root->fs_info->global_block_rsv); + if (ret) + goto out_put; + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6a8bb9c79674..e53009657f0e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -197,30 +197,32 @@ int create_free_space_inode(struct btrfs_root *root, block_group->key.objectid); } -int btrfs_truncate_free_space_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) { - struct btrfs_block_rsv *rsv; u64 needed_bytes; - loff_t oldsize; - int ret = 0; - - rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + int ret; /* 1 for slack space, 1 for updating the inode */ needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + btrfs_calc_trans_metadata_size(root, 1); - spin_lock(&trans->block_rsv->lock); - if (trans->block_rsv->reserved < needed_bytes) { - spin_unlock(&trans->block_rsv->lock); - trans->block_rsv = rsv; - return -ENOSPC; - } - spin_unlock(&trans->block_rsv->lock); + spin_lock(&rsv->lock); + if (rsv->reserved < needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&rsv->lock); + return 0; +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + loff_t oldsize; + int ret = 0; oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -232,9 +234,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - trans->block_rsv = rsv; btrfs_abort_transaction(trans, root, ret); return ret; } @@ -242,7 +242,6 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, ret = btrfs_update_inode(trans, root, inode); if (ret) btrfs_abort_transaction(trans, root, ret); - trans->block_rsv = rsv; return ret; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 4dc17d8809c7..8b7f19f44961 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -54,6 +54,8 @@ int create_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_path *path, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 9818d4a3f829..2c66ddbbe670 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -429,11 +429,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root, num_bytes = trans->bytes_reserved; /* * 1 item for inode item insertion if need - * 3 items for inode item update (in the worst case) + * 4 items for inode item update (in the worst case) + * 1 items for slack space if we need do truncation * 1 item for free space object * 3 items for pre-allocation */ - trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10); ret = btrfs_block_rsv_add(root, trans->block_rsv, trans->bytes_reserved, BTRFS_RESERVE_NO_FLUSH); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5c5b8bb44ee5..395b82031a42 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3350,6 +3350,11 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, } truncate: + ret = btrfs_check_trunc_cache_free_space(root, + &fs_info->global_block_rsv); + if (ret) + goto out; + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; -- cgit v1.2.2 From b586b32374909863311b7c916c7c0c709141e35a Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 13 May 2013 13:55:10 +0000 Subject: Btrfs: optimize the error handle of use_block_rsv() cc: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 65 ++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 162a66bfbffd..5e2c0bf84075 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6656,48 +6656,39 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); - if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - /* - * If we couldn't reserve metadata bytes try and use some from - * the global reserve. - */ - if (ret && block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - return ERR_PTR(ret); - } else if (ret) { - return ERR_PTR(ret); - } - return block_rsv; - } + if (unlikely(block_rsv->size == 0)) + goto try_reserve; ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; - if (ret && !block_rsv->failfast) { - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL * 10, - /*DEFAULT_RATELIMIT_BURST*/ 1); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG - "btrfs: block rsv returned %d\n", ret); - } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - return block_rsv; - } else if (ret && block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } - } - return ERR_PTR(-ENOSPC); + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "btrfs: block rsv returned %d\n", ret); + } +try_reserve: + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); + if (!ret) + return block_rsv; + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + return ERR_PTR(ret); } static void unuse_block_rsv(struct btrfs_fs_info *fs_info, -- cgit v1.2.2 From 5881cfc924c8143dbc3e1f343516fc6527eb8311 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 13 May 2013 13:55:11 +0000 Subject: Btrfs: don't steal the reserved space from the global reserve if their space type is different If the type of the space we need is different with the global reserve, we can not steal the space from the global reserve, because we can not allocate the space from the free space cache that the global reserve points to. Cc: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5e2c0bf84075..54e63b273a64 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6681,9 +6681,11 @@ try_reserve: return block_rsv; /* * If we couldn't reserve metadata bytes try and use some from - * the global reserve. + * the global reserve if its space type is the same as the global + * reservation. */ - if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL) { + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { ret = block_rsv_use_bytes(global_rsv, blocksize); if (!ret) return global_rsv; -- cgit v1.2.2 From d88033dbf4c23279b012725876f1e164e09644ff Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 13 May 2013 13:55:12 +0000 Subject: Btrfs: update the global reserve if it is empty Before applying this patch, we reserved the space for the global reserve by the minimum unit if we found it is empty, it was unreasonable and inefficient, because if the global reserve space was depleted, it implied that the size of the global reserve was too small. In this case, we shoud update the global reserve and fill it. Cc: Tsutomu Itoh Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 54e63b273a64..42f5e6196021 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6653,12 +6653,13 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; int ret; + bool global_updated = false; block_rsv = get_block_rsv(trans, root); if (unlikely(block_rsv->size == 0)) goto try_reserve; - +again: ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; @@ -6666,6 +6667,12 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (block_rsv->failfast) return ERR_PTR(ret); + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + update_global_block_rsv(root->fs_info); + goto again; + } + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, -- cgit v1.2.2 From b9aa55bed1c1a3a329da31884b643c62d57ebb21 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 14 May 2013 02:12:15 +0000 Subject: Btrfs: return errno if possible when we fail to allocate memory We need to set return value explicitly, otherwise we'll lose the error value. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1669c3b4be2f..99a9c25d36a6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -714,8 +714,10 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(); - if (!em) + if (!em) { + ret = -ENOMEM; goto out_free_reserve; + } em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -922,8 +924,10 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, } em = alloc_extent_map(); - if (!em) + if (!em) { + ret = -ENOMEM; goto out_reserve; + } em->start = start; em->orig_start = em->start; ram_size = ins.offset; -- cgit v1.2.2 From 89042e5ad23d50449691141334f30d53d6271266 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:15 +0000 Subject: Btrfs: fix accessing a freed tree root inode_tree_del() will move the tree root into the dead root list, and then the tree will be destroyed by the cleaner. So if we remove the delayed node which is cached in the inode after inode_tree_del(), we may access a freed tree root. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 99a9c25d36a6..790eceb48fb0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4727,6 +4727,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); no_delete: + btrfs_remove_delayed_node(inode); clear_inode(inode); return; } @@ -7982,7 +7983,6 @@ void btrfs_destroy_inode(struct inode *inode) inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: - btrfs_remove_delayed_node(inode); call_rcu(&inode->i_rcu, btrfs_i_callback); } -- cgit v1.2.2 From e1409cef85894f96f4bddc6633d64d1c5275e2a3 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:16 +0000 Subject: Btrfs: fix unprotected root node of the subvolume's inode rb-tree The root node of the rb-tree may be changed, so we should get it under the lock. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 790eceb48fb0..19eef3e852b0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4843,14 +4843,13 @@ static void inode_tree_add(struct inode *inode) struct rb_node **p; struct rb_node *parent; u64 ino = btrfs_ino(inode); -again: - p = &root->inode_tree.rb_node; - parent = NULL; if (inode_unhashed(inode)) return; - +again: + parent = NULL; spin_lock(&root->inode_lock); + p = &root->inode_tree.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_inode, rb_node); -- cgit v1.2.2 From 061594ef171a5ba52b5786688ae766907b0bda2b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:17 +0000 Subject: Btrfs: pause the space balance when remounting to R/O Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a4807ced23cc..f0857e092a3c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1263,6 +1263,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); ret = btrfs_commit_super(root); if (ret) -- cgit v1.2.2 From 314297c2a3fbcbda992507f70cd04cc82084e434 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:18 +0000 Subject: Btrfs: remove BUG_ON() in btrfs_read_fs_tree_no_radix() We have checked if ->node is NULL or not, so it is unnecessary to use BUG_ON() to check again. Remove it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e8b29da30154..9b9f28664b4f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1513,7 +1513,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, } root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); /* -ENOMEM */ out: if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; -- cgit v1.2.2 From b216cbfb52c08300c203abf06ea9519d15d10045 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:21 +0000 Subject: Btrfs: don't invoke btrfs_invalidate_inodes() in the spin lock context btrfs_invalidate_inodes() may sleep, so we should not invoke it in the spin lock context. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9b9f28664b4f..1b03f8393a69 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3658,8 +3658,11 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, ordered_operations); list_del_init(&btrfs_inode->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->fs_info->ordered_extent_lock); } spin_unlock(&root->fs_info->ordered_extent_lock); @@ -3781,8 +3784,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) list_del_init(&btrfs_inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, &btrfs_inode->runtime_flags); + spin_unlock(&root->fs_info->delalloc_lock); btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->fs_info->delalloc_lock); } spin_unlock(&root->fs_info->delalloc_lock); -- cgit v1.2.2 From 17a5adccf3fd01added91f3bf9aa7ee9aa28843b Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 15 May 2013 11:38:55 -0400 Subject: btrfs: do away with non-whole_page extent I/O end_bio_extent_readpage computes whole_page based on bv_offset and bv_len, without taking into account that blk_update_request may modify them when some of the blocks to be read into a page produce a read error. This would cause the read to unlock only part of the file range associated with the page, which would in turn leave the entire page locked, which would not only keep the process blocked instead of returning -EIO to it, but also prevent any further access to the file. It turns out that btrfs always issues whole-page reads and writes. The special handling of non-whole_page appears to be a mistake or a left-over from a time when this wasn't the case. Indeed, end_bio_extent_writepage distinguished between whole_page and non-whole_page writes but behaved identically in both cases! I've replaced the whole_page computations with warnings, just to be sure that we're not issuing partial page reads or writes. The warnings should probably just go away some time. Signed-off-by: Alexandre Oliva Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 85 +++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e6e410002e5..ca4355ddea06 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1947,28 +1947,6 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) SetPageUptodate(page); } -/* - * helper function to unlock a page if all the extents in the tree - * for that page are unlocked - */ -static void check_page_locked(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) - unlock_page(page); -} - -/* - * helper function to end page writeback if all the extents - * in the tree for that page are done with writeback - */ -static void check_page_writeback(struct extent_io_tree *tree, - struct page *page) -{ - end_page_writeback(page); -} - /* * When IO fails, either with EIO or csum verification fails, we * try other mirrors that might have a good copy of the data. This @@ -2398,19 +2376,24 @@ static void end_bio_extent_writepage(struct bio *bio, int err) struct extent_io_tree *tree; u64 start; u64 end; - int whole_page; do { struct page *page = bvec->bv_page; tree = &BTRFS_I(page->mapping->host)->io_tree; - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) + printk("%s page write in btrfs with offset %u and length %u\n", + bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE + ? KERN_ERR "partial" : KERN_INFO "incomplete", + bvec->bv_offset, bvec->bv_len); - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); @@ -2418,10 +2401,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) if (end_extent_writepage(page, err, start, end)) continue; - if (whole_page) - end_page_writeback(page); - else - check_page_writeback(tree, page); + end_page_writeback(page); } while (bvec >= bio->bi_io_vec); bio_put(bio); @@ -2446,7 +2426,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_io_tree *tree; u64 start; u64 end; - int whole_page; int mirror; int ret; @@ -2463,13 +2442,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err) (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) + printk("%s page read in btrfs with offset %u and length %u\n", + bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE + ? KERN_ERR "partial" : KERN_INFO "incomplete", + bvec->bv_offset, bvec->bv_len); - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; if (++bvec <= bvec_end) prefetchw(&bvec->bv_page->flags); @@ -2528,23 +2513,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); - if (whole_page) { - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); + if (uptodate) { + SetPageUptodate(page); } else { - if (uptodate) { - check_page_uptodate(tree, page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - check_page_locked(tree, page); + ClearPageUptodate(page); + SetPageError(page); } + unlock_page(page); } while (bvec <= bvec_end); bio_put(bio); -- cgit v1.2.2 From 3a6cad9009c85e29e83aafc8ac00b1dd5067fc5f Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 16 May 2013 14:48:19 +0000 Subject: Btrfs: explicitly use global_block_rsv for quota_tree The quota_tree was set up to use the empty_block_rsv before which would be problematic when the filesystem is filled up and ENOSPC happens during internal operations while the quota tree is updated and COWed (when the btrfs_qgroup_info_item items) are written. In fact, use_block_rsv() which is used in btrfs_cow_block() falls back to the global_block_rsv in this case. But just in order to make it more clear what is happening, change it to explicitly use the global_block_rsv. Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 42f5e6196021..df472ab1b5ac 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4564,6 +4564,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + if (fs_info->quota_root) + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; update_global_block_rsv(fs_info); -- cgit v1.2.2 From 655b09fe540b73edeaabfb4c2d700be51a1f8bce Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 May 2013 14:06:51 -0400 Subject: Btrfs: make sure roots are assigned before freeing their nodes If we fail to load the chunk tree we'll call free_root_pointers, except we may not have assigned the roots for the dev_root/extent_root/csum_root yet, so we could NULL pointer deref at this point. Just add checks to make sure these roots are set to keep us from panicing. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1b03f8393a69..4bdb0528688a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1987,30 +1987,33 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) { free_extent_buffer(info->tree_root->node); free_extent_buffer(info->tree_root->commit_root); - free_extent_buffer(info->dev_root->node); - free_extent_buffer(info->dev_root->commit_root); - free_extent_buffer(info->extent_root->node); - free_extent_buffer(info->extent_root->commit_root); - free_extent_buffer(info->csum_root->node); - free_extent_buffer(info->csum_root->commit_root); - if (info->quota_root) { - free_extent_buffer(info->quota_root->node); - free_extent_buffer(info->quota_root->commit_root); - } - info->tree_root->node = NULL; info->tree_root->commit_root = NULL; - info->dev_root->node = NULL; - info->dev_root->commit_root = NULL; - info->extent_root->node = NULL; - info->extent_root->commit_root = NULL; - info->csum_root->node = NULL; - info->csum_root->commit_root = NULL; + + if (info->dev_root) { + free_extent_buffer(info->dev_root->node); + free_extent_buffer(info->dev_root->commit_root); + info->dev_root->node = NULL; + info->dev_root->commit_root = NULL; + } + if (info->extent_root) { + free_extent_buffer(info->extent_root->node); + free_extent_buffer(info->extent_root->commit_root); + info->extent_root->node = NULL; + info->extent_root->commit_root = NULL; + } + if (info->csum_root) { + free_extent_buffer(info->csum_root->node); + free_extent_buffer(info->csum_root->commit_root); + info->csum_root->node = NULL; + info->csum_root->commit_root = NULL; + } if (info->quota_root) { + free_extent_buffer(info->quota_root->node); + free_extent_buffer(info->quota_root->commit_root); info->quota_root->node = NULL; info->quota_root->commit_root = NULL; } - if (chunk_root) { free_extent_buffer(info->chunk_root->node); free_extent_buffer(info->chunk_root->commit_root); -- cgit v1.2.2 From 9be3395bcd4ad4af76476ac38152b4cafa6b6159 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 17 May 2013 18:30:14 -0400 Subject: Btrfs: use a btrfs bioset instead of abusing bio internals Btrfs has been pointer tagging bi_private and using bi_bdev to store the stripe index and mirror number of failed IOs. As bios bubble back up through the call chain, we use these to decide if and how to retry our IOs. They are also used to count IO failures on a per device basis. Recently a bio tracepoint was added lead to crashes because we were abusing bi_bdev. This commit adds a btrfs bioset, and creates explicit fields for the mirror number and stripe index. The plan is to extend this structure for all of the fields currently in struct btrfs_bio, which will mean one less kmalloc in our IO path. Signed-off-by: Chris Mason Reported-by: Tejun Heo --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 49 +++++++++++++++++++++++++++++------ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/inode.c | 64 +++++++++++++++++++++++++++++++--------------- fs/btrfs/raid56.c | 2 +- fs/btrfs/scrub.c | 10 ++++---- fs/btrfs/volumes.c | 41 +++++------------------------ fs/btrfs/volumes.h | 20 +++++++++++++++ 9 files changed, 120 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 18af6f48781a..1431a6965017 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1700,7 +1700,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, unsigned int j; DECLARE_COMPLETION_ONSTACK(complete); - bio = bio_alloc(GFP_NOFS, num_pages - i); + bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); if (!bio) { printk(KERN_INFO "btrfsic: bio_alloc() for %u pages failed!\n", diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4e9ebe1f1827..ca0ea9928210 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3128,7 +3128,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) * caller */ device->flush_bio = NULL; - bio = bio_alloc(GFP_NOFS, 0); + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); if (!bio) return -ENOMEM; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2ac518f90e4..fe1d6c3424a5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -23,6 +23,7 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; +static struct bio_set *btrfs_bioset; #ifdef CONFIG_BTRFS_DEBUG static LIST_HEAD(buffers); @@ -125,10 +126,20 @@ int __init extent_io_init(void) SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; + + btrfs_bioset = bioset_create(BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio)); + if (!btrfs_bioset) + goto free_buffer_cache; return 0; +free_buffer_cache: + kmem_cache_destroy(extent_buffer_cache); + extent_buffer_cache = NULL; + free_state_cache: kmem_cache_destroy(extent_state_cache); + extent_state_cache = NULL; return -ENOMEM; } @@ -145,6 +156,8 @@ void extent_io_exit(void) kmem_cache_destroy(extent_state_cache); if (extent_buffer_cache) kmem_cache_destroy(extent_buffer_cache); + if (btrfs_bioset) + bioset_free(btrfs_bioset); } void extent_io_tree_init(struct extent_io_tree *tree, @@ -2046,7 +2059,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) return 0; - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; bio->bi_private = &compl; @@ -2336,7 +2349,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { free_io_failure(inode, failrec, 0); return -EIO; @@ -2457,10 +2470,11 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct page *page = bvec->bv_page; struct extent_state *cached = NULL; struct extent_state *state; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%ld\n", (u64)bio->bi_sector, err, - (long int)bio->bi_bdev); + "mirror=%lu\n", (u64)bio->bi_sector, err, + io_bio->mirror_num); tree = &BTRFS_I(page->mapping->host)->io_tree; start = page_offset(page) + bvec->bv_offset; @@ -2485,7 +2499,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } spin_unlock(&tree->lock); - mirror = (int)(unsigned long)bio->bi_bdev; + mirror = io_bio->mirror_num; if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state, mirror); @@ -2550,17 +2564,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err) bio_put(bio); } +/* + * this allocates from the btrfs_bioset. We're returning a bio right now + * but you can call btrfs_io_bio for the appropriate container_of magic + */ struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags) { struct bio *bio; - bio = bio_alloc(gfp_flags, nr_vecs); + bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); + while (!bio && (nr_vecs /= 2)) { + bio = bio_alloc_bioset(gfp_flags, + nr_vecs, btrfs_bioset); + } } if (bio) { @@ -2571,6 +2591,19 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) +{ + return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); +} + + +/* this also allocates from the btrfs_bioset */ +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); +} + + static int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num, unsigned long bio_flags) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a2c03a175009..41fb81e7ec53 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -336,6 +336,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); struct btrfs_fs_info; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1669c3b4be2f..d59dddfb1f96 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6927,7 +6927,11 @@ struct btrfs_dio_private { /* IO errors */ int errors; + /* orig_bio is our btrfs_io_bio */ struct bio *orig_bio; + + /* dio_bio came from fs/direct-io.c */ + struct bio *dio_bio; }; static void btrfs_endio_direct_read(struct bio *bio, int err) @@ -6937,6 +6941,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct bio_vec *bvec = bio->bi_io_vec; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; + struct bio *dio_bio; u64 start; start = dip->logical_offset; @@ -6976,14 +6981,15 @@ failed: unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); - bio->bi_private = dip->private; + dio_bio = dip->dio_bio; kfree(dip); /* If we had a csum failure make sure to clear the uptodate flag */ if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); } static void btrfs_endio_direct_write(struct bio *bio, int err) @@ -6994,6 +7000,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_ordered_extent *ordered = NULL; u64 ordered_offset = dip->logical_offset; u64 ordered_bytes = dip->bytes; + struct bio *dio_bio; int ret; if (err) @@ -7021,14 +7028,15 @@ out_test: goto again; } out_done: - bio->bi_private = dip->private; + dio_bio = dip->dio_bio; kfree(dip); /* If we had an error make sure to clear the uptodate flag */ if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); } static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, @@ -7064,10 +7072,10 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (!atomic_dec_and_test(&dip->pending_bios)) goto out; - if (dip->errors) + if (dip->errors) { bio_io_error(dip->orig_bio); - else { - set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); + } else { + set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); bio_endio(dip->orig_bio, 0); } out: @@ -7242,25 +7250,34 @@ out_err: return 0; } -static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, - loff_t file_offset) +static void btrfs_submit_direct(int rw, struct bio *dio_bio, + struct inode *inode, loff_t file_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec = dio_bio->bi_io_vec; + struct bio *io_bio; int skip_sum; int write = rw & REQ_WRITE; int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); + + if (!io_bio) { + ret = -ENOMEM; + goto free_ordered; + } + dip = kmalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_ordered; + goto free_io_bio; } - dip->private = bio->bi_private; + dip->private = dio_bio->bi_private; + io_bio->bi_private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; @@ -7268,22 +7285,27 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, do { dip->bytes += bvec->bv_len; bvec++; - } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); + } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1)); - dip->disk_bytenr = (u64)bio->bi_sector << 9; - bio->bi_private = dip; + dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; + io_bio->bi_private = dip; dip->errors = 0; - dip->orig_bio = bio; + dip->orig_bio = io_bio; + dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); if (write) - bio->bi_end_io = btrfs_endio_direct_write; + io_bio->bi_end_io = btrfs_endio_direct_write; else - bio->bi_end_io = btrfs_endio_direct_read; + io_bio->bi_end_io = btrfs_endio_direct_read; ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; + +free_io_bio: + bio_put(io_bio); + free_ordered: /* * If this is a write, we need to clean up the reserved space and kill @@ -7299,7 +7321,7 @@ free_ordered: btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } - bio_endio(bio, ret); + bio_endio(dio_bio, ret); } static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0740621daf6c..0525e1389f5b 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1050,7 +1050,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); if (!bio) return -ENOMEM; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f489e24659a4..79bd479317cb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1296,7 +1296,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!page->page); - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { page->io_error = 1; sblock->no_io_error_seen = 0; @@ -1431,7 +1431,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; bio->bi_bdev = page_bad->dev->bdev; @@ -1522,7 +1522,7 @@ again: sbio->dev = wr_ctx->tgtdev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); if (!bio) { mutex_unlock(&wr_ctx->wr_lock); return -ENOMEM; @@ -1930,7 +1930,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); + bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -3307,7 +3307,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a191bac31d85..317afc7a2af4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5019,42 +5019,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void *merge_stripe_index_into_bio_private(void *bi_private, - unsigned int stripe_index) -{ - /* - * with single, dup, RAID0, RAID1 and RAID10, stripe_index is - * at most 1. - * The alternative solution (instead of stealing bits from the - * pointer) would be to allocate an intermediate structure - * that contains the old private pointer plus the stripe_index. - */ - BUG_ON((((uintptr_t)bi_private) & 3) != 0); - BUG_ON(stripe_index > 3); - return (void *)(((uintptr_t)bi_private) | stripe_index); -} - -static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private) -{ - return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3)); -} - -static unsigned int extract_stripe_index_from_bio_private(void *bi_private) -{ - return (unsigned int)((uintptr_t)bi_private) & 3; -} - static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private); + struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; if (err) { atomic_inc(&bbio->error); if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = - extract_stripe_index_from_bio_private( - bio->bi_private); + btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); @@ -5084,8 +5058,7 @@ static void btrfs_end_bio(struct bio *bio, int err) } bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio->bi_bdev = (struct block_device *) - (unsigned long)bbio->mirror_num; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio */ @@ -5211,8 +5184,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, struct btrfs_device *dev = bbio->stripes[dev_nr].dev; bio->bi_private = bbio; - bio->bi_private = merge_stripe_index_into_bio_private( - bio->bi_private, (unsigned int)dev_nr); + btrfs_io_bio(bio)->stripe_index = dev_nr; bio->bi_end_io = btrfs_end_bio; bio->bi_sector = physical >> 9; #ifdef DEBUG @@ -5273,8 +5245,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) if (atomic_dec_and_test(&bbio->stripes_pending)) { bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio->bi_bdev = (struct block_device *) - (unsigned long)bbio->mirror_num; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_sector = logical >> 9; kfree(bbio); bio_endio(bio, -EIO); @@ -5352,7 +5323,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); + bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 845ccbb0d2e3..f6247e2a47f7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -152,6 +152,26 @@ struct btrfs_fs_devices { int rotating; }; +/* + * we need the mirror number and stripe index to be passed around + * the call chain while we are processing end_io (especially errors). + * Really, what we need is a btrfs_bio structure that has this info + * and is properly sized with its stripe array, but we're not there + * quite yet. We have our own btrfs bioset, and all of the bios + * we allocate are actually btrfs_io_bios. We'll cram as much of + * struct btrfs_bio as we can into this over time. + */ +struct btrfs_io_bio { + unsigned long mirror_num; + unsigned long stripe_index; + struct bio bio; +}; + +static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_io_bio, bio); +} + struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; -- cgit v1.2.2 From 3634a6327815d39dd93e5c44a602daae91c66297 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 17 May 2013 09:30:32 -0400 Subject: fuse: truncate pagecache range on hole punch fuse supports hole punch via the fallocate() FALLOC_FL_PUNCH_HOLE interface. When a hole punch is passed through, the page cache is not cleared and thus allows reading stale data from the cache. This is easily demonstrable (using FOPEN_KEEP_CACHE) by reading a smallish random data file into cache, punching a hole and creating a copy of the file. Drop caches or remount and observe that the original file no longer matches the file copied after the hole punch. The original file contains a zeroed range and the latter file contains stale data. Protect against writepage requests in progress and punch out the associated page cache range after a successful client fs hole punch. Signed-off-by: Brian Foster Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index fe191325fefa..a200a2d80377 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -16,6 +16,7 @@ #include #include #include +#include static const struct file_operations fuse_direct_io_file_operations; @@ -2453,6 +2454,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct fuse_file *ff = file->private_data; + struct inode *inode = file->f_inode; struct fuse_conn *fc = ff->fc; struct fuse_req *req; struct fuse_fallocate_in inarg = { @@ -2466,9 +2468,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (fc->no_fallocate) return -EOPNOTSUPP; + if (mode & FALLOC_FL_PUNCH_HOLE) { + mutex_lock(&inode->i_mutex); + fuse_set_nowrite(inode); + } + req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } req->in.h.opcode = FUSE_FALLOCATE; req->in.h.nodeid = ff->nodeid; @@ -2483,6 +2492,15 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, } fuse_put_request(fc, req); +out: + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (!err) + truncate_pagecache_range(inode, offset, + offset + length - 1); + fuse_release_nowrite(inode); + mutex_unlock(&inode->i_mutex); + } + return err; } -- cgit v1.2.2 From bee6c307800bbb26ba1a855b1841c2f0c4b7622a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 17 May 2013 15:27:34 -0400 Subject: fuse: update inode size and invalidate attributes on fallocate An fallocate request without FALLOC_FL_KEEP_SIZE set can extend the size of a file. Update the inode size after a successful fallocate. Also invalidate the inode attributes after a successful fallocate to ensure we pick up the latest attribute values (i.e., i_blocks). Signed-off-by: Brian Foster Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a200a2d80377..d9f467907791 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2492,11 +2492,20 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, } fuse_put_request(fc, req); + if (err) + goto out; + + /* we could have extended the file */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) + fuse_write_update_size(inode, offset + length); + + if (mode & FALLOC_FL_PUNCH_HOLE) + truncate_pagecache_range(inode, offset, offset + length - 1); + + fuse_invalidate_attr(inode); + out: if (mode & FALLOC_FL_PUNCH_HOLE) { - if (!err) - truncate_pagecache_range(inode, offset, - offset + length - 1); fuse_release_nowrite(inode); mutex_unlock(&inode->i_mutex); } -- cgit v1.2.2 From 774d5f14ee1ecac55f42a84ff35eb00b896b00b6 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 20 May 2013 14:13:50 -0400 Subject: NFSv4.1 Fix a pNFS session draining deadlock On a CB_RECALL the callback service thread flushes the inode using filemap_flush prior to scheduling the state manager thread to return the delegation. When pNFS is used and I/O has not yet gone to the data server servicing the inode, a LAYOUTGET can preceed the I/O. Unlike the async filemap_flush call, the LAYOUTGET must proceed to completion. If the state manager starts to recover data while the inode flush is sending the LAYOUTGET, a deadlock occurs as the callback service thread holds the single callback session slot until the flushing is done which blocks the state manager thread, and the state manager thread has set the session draining bit which puts the inode flush LAYOUTGET RPC to sleep on the forechannel slot table waitq. Separate the draining of the back channel from the draining of the fore channel by moving the NFS4_SESSION_DRAINING bit from session scope into the fore and back slot tables. Drain the back channel first allowing the LAYOUTGET call to proceed (and fail) so the callback service thread frees the callback slot. Then proceed with draining the forechannel. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 +- fs/nfs/callback_xdr.c | 2 +- fs/nfs/nfs4proc.c | 2 +- fs/nfs/nfs4session.c | 4 ++-- fs/nfs/nfs4session.h | 13 ++++++++----- fs/nfs/nfs4state.c | 15 +++++++-------- 6 files changed, 20 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index a13d26ede254..0bc27684ebfa 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -414,7 +414,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ - if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { spin_unlock(&tbl->slot_tbl_lock); status = htonl(NFS4ERR_DELAY); /* Return NFS4ERR_BADSESSION if we're draining the session diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 59461c957d9d..a35582c9d444 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -763,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) * A single slot, so highest used slotid is either 0 or -1 */ tbl->highest_used_slotid = NFS4_NO_SLOT; - nfs4_session_drain_complete(session, tbl); + nfs4_slot_tbl_drain_complete(tbl); spin_unlock(&tbl->slot_tbl_lock); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8fbc10054115..4e2fe714d5c2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -572,7 +572,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, task->tk_timeout = 0; spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) && !args->sa_privileged) { /* The state manager will wait until the slot table is empty */ dprintk("%s session is draining\n", __func__); diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index ebda5f4a031b..c4e225e4a9af 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -73,7 +73,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) tbl->highest_used_slotid = new_max; else { tbl->highest_used_slotid = NFS4_NO_SLOT; - nfs4_session_drain_complete(tbl->session, tbl); + nfs4_slot_tbl_drain_complete(tbl); } } dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, @@ -226,7 +226,7 @@ static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) struct nfs4_slot *slot = pslot; struct nfs4_slot_table *tbl = slot->table; - if (nfs4_session_draining(tbl->session) && !args->sa_privileged) + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) return false; slot->generation = tbl->generation; args->sa_slot = slot; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 6f3cb39386d4..ff7d9f0f8a65 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -25,6 +25,10 @@ struct nfs4_slot { }; /* Sessions */ +enum nfs4_slot_tbl_state { + NFS4_SLOT_TBL_DRAINING, +}; + #define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) struct nfs4_slot_table { struct nfs4_session *session; /* Parent session */ @@ -43,6 +47,7 @@ struct nfs4_slot_table { unsigned long generation; /* Generation counter for target_highest_slotid */ struct completion complete; + unsigned long slot_tbl_state; }; /* @@ -68,7 +73,6 @@ struct nfs4_session { enum nfs4_session_state { NFS4_SESSION_INITING, - NFS4_SESSION_DRAINING, }; #if defined(CONFIG_NFS_V4_1) @@ -88,12 +92,11 @@ extern void nfs4_destroy_session(struct nfs4_session *session); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); -extern void nfs4_session_drain_complete(struct nfs4_session *session, - struct nfs4_slot_table *tbl); +extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); -static inline bool nfs4_session_draining(struct nfs4_session *session) +static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl) { - return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); + return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); } bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 300d17d85c0e..1fab140764c4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -241,7 +241,7 @@ static void nfs4_end_drain_session(struct nfs_client *clp) if (ses == NULL) return; tbl = &ses->fc_slot_table; - if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { spin_lock(&tbl->slot_tbl_lock); nfs41_wake_slot_table(tbl); spin_unlock(&tbl->slot_tbl_lock); @@ -251,15 +251,15 @@ static void nfs4_end_drain_session(struct nfs_client *clp) /* * Signal state manager thread if session fore channel is drained */ -void nfs4_session_drain_complete(struct nfs4_session *session, - struct nfs4_slot_table *tbl) +void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) { - if (nfs4_session_draining(session)) + if (nfs4_slot_tbl_draining(tbl)) complete(&tbl->complete); } -static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) +static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) { + set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); spin_lock(&tbl->slot_tbl_lock); if (tbl->highest_used_slotid != NFS4_NO_SLOT) { INIT_COMPLETION(tbl->complete); @@ -275,13 +275,12 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) struct nfs4_session *ses = clp->cl_session; int ret = 0; - set_bit(NFS4_SESSION_DRAINING, &ses->session_state); /* back channel */ - ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table); + ret = nfs4_drain_slot_tbl(&ses->bc_slot_table); if (ret) return ret; /* fore channel */ - return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); + return nfs4_drain_slot_tbl(&ses->fc_slot_table); } static void nfs41_finish_session_reset(struct nfs_client *clp) -- cgit v1.2.2 From 83c168bf8017212a9d502536f9dcd0b54d24e330 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 15 May 2013 22:00:10 -0400 Subject: NFS: Fix SETCLIENTID fallback if GSS is not available Commit 79d852bf "NFS: Retry SETCLIENTID with AUTH_SYS instead of AUTH_NONE" did not take into account commit 23631227 "NFSv4: Fix the fallback to AUTH_NULL if krb5i is not available". Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 947b0c908aa9..4cbad5d6b276 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -203,7 +203,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_NULL); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) goto error; -- cgit v1.2.2 From 37f715774e2dd9ae521334dbbc3af63becd47adb Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 10 May 2013 11:59:18 -0400 Subject: GFS2: two minor quota fixups This patch fixes two regression problems that Abhi found in the GFS2 quota code. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c7c840e916f8..c253b13722e8 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -121,7 +121,7 @@ static u64 qd2index(struct gfs2_quota_data *qd) { struct kqid qid = qd->qd_id; return (2 * (u64)from_kqid(&init_user_ns, qid)) + - (qid.type == USRQUOTA) ? 0 : 1; + ((qid.type == USRQUOTA) ? 0 : 1); } static u64 qd2offset(struct gfs2_quota_data *qd) @@ -721,7 +721,7 @@ get_a_page: goto unlock_out; } - gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_trans_add_data(ip->i_gl, bh); kaddr = kmap_atomic(page); if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) -- cgit v1.2.2 From af21ca8ed50f01c5278c5ded6dad6f05e8a5d2e4 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 14 May 2013 13:04:29 -0400 Subject: GFS2: Use single-block reservations for directories This patch changes the multi-block allocation code, such that directory inodes only get a single block reserved in the bitmap. That way, the bitmaps are more tightly packed together, and there are fewer spans of free blocks for in-use block reservations. This means it takes less time to find a free span of blocks in the bitmap, which speeds things up. This increases the performance of some workloads by almost 2X. In Nate's mockup.py script (which does (1) create dir, (2) create dir in dir, (3) create file in that dir) the test executes in 23 steps rather than 43 steps, a 47% performance improvement. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 0c5a575b513e..5232525934ae 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1401,9 +1401,14 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, u32 extlen; u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved; int ret; + struct inode *inode = &ip->i_inode; - extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested); - extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); + if (S_ISDIR(inode->i_mode)) + extlen = 1; + else { + extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested); + extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); + } if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) return; -- cgit v1.2.2 From 75f96ce6e754eee30d9ec788ad91c6ec21d0a46b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 14 May 2013 10:02:48 -0700 Subject: GFS2: fix DLM depends to fix build errors Fix build errors by correcting DLM dependencies in GFS2. Build errors happen when CONFIG_GFS2_FS_LOCKING_DLM=y and CONFIG_DLM=m: fs/built-in.o: In function `gfs2_lock': file.c:(.text+0xc7abd): undefined reference to `dlm_posix_get' file.c:(.text+0xc7ad0): undefined reference to `dlm_posix_unlock' file.c:(.text+0xc7ad9): undefined reference to `dlm_posix_lock' fs/built-in.o: In function `gdlm_unmount': lock_dlm.c:(.text+0xd6e5b): undefined reference to `dlm_release_lockspace' fs/built-in.o: In function `sync_unlock': lock_dlm.c:(.text+0xd6e9e): undefined reference to `dlm_unlock' fs/built-in.o: In function `sync_lock': lock_dlm.c:(.text+0xd6fb6): undefined reference to `dlm_lock' fs/built-in.o: In function `gdlm_put_lock': lock_dlm.c:(.text+0xd7238): undefined reference to `dlm_unlock' fs/built-in.o: In function `gdlm_mount': lock_dlm.c:(.text+0xd753e): undefined reference to `dlm_new_lockspace' lock_dlm.c:(.text+0xd79d3): undefined reference to `dlm_release_lockspace' fs/built-in.o: In function `gdlm_lock': lock_dlm.c:(.text+0xd8179): undefined reference to `dlm_lock' fs/built-in.o: In function `gdlm_cancel': lock_dlm.c:(.text+0xd6b22): undefined reference to `dlm_unlock' Signed-off-by: Randy Dunlap Signed-off-by: Steven Whitehouse --- fs/gfs2/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index eb08c9e43c2a..5a376ab81feb 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -26,7 +26,7 @@ config GFS2_FS config GFS2_FS_LOCKING_DLM bool "GFS2 DLM locking" depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ - HOTPLUG && DLM && CONFIGFS_FS && SYSFS + HOTPLUG && CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) help Multiple node locking module for GFS2 -- cgit v1.2.2 From e97e548ba8baffe051146d1f5e897dec48da20e0 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 21 May 2013 12:49:07 +0100 Subject: GFS2: Fix typo in gfs2_log_end_write loop There was a missing _all in this loop iterator Signed-off-by: Steven Whitehouse --- fs/gfs2/lops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index c5fa758fd844..68b4c8f1fce8 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -212,7 +212,7 @@ static void gfs2_end_log_write(struct bio *bio, int error) fs_err(sdp, "Error %d writing to log\n", error); } - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) gfs2_end_log_write_bh(sdp, bvec, error); -- cgit v1.2.2 From 62106e96279f66c52e2123782f9420af9dbe8cbe Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 May 2013 11:28:31 -0400 Subject: cifs: only set ops for inodes in I_NEW state It's generally not safe to reset the inode ops once they've been set. In the case where the inode was originally thought to be a directory and then later found to be a DFS referral, this can lead to an oops when we try to trigger an inode op on it after changing the ops to the blank referral operations. Cc: Reported-and-Tested-by: Sachin Prabhu Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Steve French --- fs/cifs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index fc3025199cb3..20efd81266c6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -171,7 +171,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) inode->i_flags |= S_AUTOMOUNT; - cifs_set_ops(inode); + if (inode->i_state & I_NEW) + cifs_set_ops(inode); } void -- cgit v1.2.2 From 166faf21bd14bc5c5295a44874bf7f3930c30b20 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 24 May 2013 07:40:04 -0400 Subject: cifs: fix potential buffer overrun when composing a new options string Consider the case where we have a very short ip= string in the original mount options, and when we chase a referral we end up with a very long IPv6 address. Be sure to allow for that possibility when estimating the size of the string to allocate. Cc: Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 8e33ec65847b..8e5a1817496d 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "cifsglob.h" #include "cifsproto.h" #include "cifsfs.h" @@ -150,7 +151,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata, * assuming that we have 'unc=' and 'ip=' in * the original sb_mountdata */ - md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12; + md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12 + + INET6_ADDRSTRLEN; mountdata = kzalloc(md_len+1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; -- cgit v1.2.2 From 539673fff76af73c3aee96e0de10edcc97d84db3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 24 May 2013 07:40:04 -0400 Subject: cifs: allow sec=none mounts to work against servers that don't support extended security In the case of sec=none, we're not sending a username or password, so there's little benefit to mandating NTLMSSP auth. Allow it to use unencapsulated auth in that case. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 99eeaa17ee00..0a7fdc31f253 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1061,6 +1061,7 @@ static int cifs_parse_security_flavors(char *value, #endif case Opt_sec_none: vol->nullauth = 1; + vol->secFlg |= CIFSSEC_MAY_NTLM; break; default: cifs_dbg(VFS, "bad security option: %s\n", value); -- cgit v1.2.2 From 37d4f99b55d46d9f71f4769faf74c95adb2c1daf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 24 May 2013 07:40:05 -0400 Subject: cifs: fix error handling when calling cifs_parse_devname When we allowed separate unc= and prefixpath= mount options, we could ignore EINVAL errors from cifs_parse_devname. Now that they are deprecated, we need to check for that as well and fail the mount if it's malformed. Also fix a later error message that refers to the unc= option. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0a7fdc31f253..5b97e56ddbca 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1258,14 +1258,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->backupuid_specified = false; /* no backup intent for a user */ vol->backupgid_specified = false; /* no backup intent for a group */ - /* - * For now, we ignore -EINVAL errors under the assumption that the - * unc= and prefixpath= options will be usable. - */ - if (cifs_parse_devname(devname, vol) == -ENOMEM) { - printk(KERN_ERR "CIFS: Unable to allocate memory to parse " - "device string.\n"); - goto out_nomem; + switch (cifs_parse_devname(devname, vol)) { + case 0: + break; + case -ENOMEM: + cifs_dbg(VFS, "Unable to allocate memory for devname.\n"); + goto cifs_parse_mount_err; + case -EINVAL: + cifs_dbg(VFS, "Malformed UNC in devname.\n"); + goto cifs_parse_mount_err; + default: + cifs_dbg(VFS, "Unknown error parsing devname.\n"); + goto cifs_parse_mount_err; } while ((data = strsep(&options, separator)) != NULL) { @@ -1827,7 +1831,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } #endif if (!vol->UNC) { - cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string or in unc= option!\n"); + cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n"); goto cifs_parse_mount_err; } -- cgit v1.2.2 From 9c9c29e1af2ff0459087876e3800078555794b60 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 24 May 2013 07:40:05 -0400 Subject: cifs: stop printing the unc= option in /proc/mounts Since we no longer recognize that option, stop printing it out. The devicename is now the canonical source for this info. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 72e4efee1389..3752b9f6d9e4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -372,9 +372,6 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_show_security(s, tcon->ses->server); cifs_show_cache_flavor(s, cifs_sb); - seq_printf(s, ",unc="); - seq_escape(s, tcon->treeName, " \t\n\\"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_printf(s, ",multiuser"); else if (tcon->ses->user_name) -- cgit v1.2.2 From d9deef0a3f38bcc1c155e8d9a8b522404e5e648c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 24 May 2013 07:40:06 -0400 Subject: cifs: fix composing of mount options for DFS referrals With the change to ignore the unc= and prefixpath= mount options, there is no longer any need to add them to the options string when mounting. By the same token, we now need to build a device name that includes the prefixpath when mounting. To make things neater, the delimiters on the devicename are changed to '/' since that's preferred when mounting anyway. v2: fix some comments and don't bother looking at whether there is a prepath in the ref->node_name when deciding whether to pass a prepath to cifs_build_devname. v3: rebase on top of potential buffer overrun fix for stable Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 141 +++++++++++++++++++++++++------------------------ fs/cifs/dns_resolve.c | 4 +- 2 files changed, 73 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 8e5a1817496d..58df174deb10 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -49,58 +49,74 @@ void cifs_dfs_release_automount_timer(void) } /** - * cifs_get_share_name - extracts share name from UNC - * @node_name: pointer to UNC string + * cifs_build_devname - build a devicename from a UNC and optional prepath + * @nodename: pointer to UNC string + * @prepath: pointer to prefixpath (or NULL if there isn't one) * - * Extracts sharename form full UNC. - * i.e. strips from UNC trailing path that is not part of share - * name and fixup missing '\' in the beginning of DFS node refferal - * if necessary. - * Returns pointer to share name on success or ERR_PTR on error. - * Caller is responsible for freeing returned string. + * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer + * big enough to hold the final thing. Copy the UNC from the nodename, and + * concatenate the prepath onto the end of it if there is one. + * + * Returns pointer to the built string, or a ERR_PTR. Caller is responsible + * for freeing the returned string. */ -static char *cifs_get_share_name(const char *node_name) +static char * +cifs_build_devname(char *nodename, const char *prepath) { - int len; - char *UNC; - char *pSep; - - len = strlen(node_name); - UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, - GFP_KERNEL); - if (!UNC) - return ERR_PTR(-ENOMEM); + size_t pplen; + size_t unclen; + char *dev; + char *pos; + + /* skip over any preceding delimiters */ + nodename += strspn(nodename, "\\"); + if (!*nodename) + return ERR_PTR(-EINVAL); - /* get share name and server name */ - if (node_name[1] != '\\') { - UNC[0] = '\\'; - strncpy(UNC+1, node_name, len); - len++; - UNC[len] = 0; - } else { - strncpy(UNC, node_name, len); - UNC[len] = 0; - } + /* get length of UNC and set pos to last char */ + unclen = strlen(nodename); + pos = nodename + unclen - 1; - /* find server name end */ - pSep = memchr(UNC+2, '\\', len-2); - if (!pSep) { - cifs_dbg(VFS, "%s: no server name end in node name: %s\n", - __func__, node_name); - kfree(UNC); - return ERR_PTR(-EINVAL); + /* trim off any trailing delimiters */ + while (*pos == '\\') { + --pos; + --unclen; } - /* find sharename end */ - pSep++; - pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC)); - if (pSep) { - /* trim path up to sharename end - * now we have share name in UNC */ - *pSep = 0; + /* allocate a buffer: + * +2 for preceding "//" + * +1 for delimiter between UNC and prepath + * +1 for trailing NULL + */ + pplen = prepath ? strlen(prepath) : 0; + dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + pos = dev; + /* add the initial "//" */ + *pos = '/'; + ++pos; + *pos = '/'; + ++pos; + + /* copy in the UNC portion from referral */ + memcpy(pos, nodename, unclen); + pos += unclen; + + /* copy the prefixpath remainder (if there is one) */ + if (pplen) { + *pos = '/'; + ++pos; + memcpy(pos, prepath, pplen); + pos += pplen; } - return UNC; + /* NULL terminator */ + *pos = '\0'; + + convert_delimiter(dev, '/'); + return dev; } @@ -124,6 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, { int rc; char *mountdata = NULL; + const char *prepath = NULL; int md_len; char *tkn_e; char *srvIP = NULL; @@ -133,7 +150,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata, if (sb_mountdata == NULL) return ERR_PTR(-EINVAL); - *devname = cifs_get_share_name(ref->node_name); + if (strlen(fullpath) - ref->path_consumed) + prepath = fullpath + ref->path_consumed; + + *devname = cifs_build_devname(ref->node_name, prepath); if (IS_ERR(*devname)) { rc = PTR_ERR(*devname); *devname = NULL; @@ -147,13 +167,14 @@ char *cifs_compose_mount_options(const char *sb_mountdata, goto compose_mount_options_err; } - /* md_len = strlen(...) + 12 for 'sep+prefixpath=' - * assuming that we have 'unc=' and 'ip=' in - * the original sb_mountdata + /* + * In most cases, we'll be building a shorter string than the original, + * but we do have to assume that the address in the ip= option may be + * much longer than the original. Add the max length of an address + * string to the length of the original string to allow for worst case. */ - md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12 + - INET6_ADDRSTRLEN; - mountdata = kzalloc(md_len+1, GFP_KERNEL); + md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN; + mountdata = kzalloc(md_len + 1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; goto compose_mount_options_err; @@ -197,26 +218,6 @@ char *cifs_compose_mount_options(const char *sb_mountdata, strncat(mountdata, &sep, 1); strcat(mountdata, "ip="); strcat(mountdata, srvIP); - strncat(mountdata, &sep, 1); - strcat(mountdata, "unc="); - strcat(mountdata, *devname); - - /* find & copy prefixpath */ - tkn_e = strchr(ref->node_name + 2, '\\'); - if (tkn_e == NULL) { - /* invalid unc, missing share name*/ - rc = -EINVAL; - goto compose_mount_options_err; - } - - tkn_e = strchr(tkn_e + 1, '\\'); - if (tkn_e || (strlen(fullpath) - ref->path_consumed)) { - strncat(mountdata, &sep, 1); - strcat(mountdata, "prefixpath="); - if (tkn_e) - strcat(mountdata, tkn_e + 1); - strcat(mountdata, fullpath + ref->path_consumed); - } /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/ /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/ diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index e7512e497611..7ede7306599f 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -34,7 +34,7 @@ /** * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. - * @unc: UNC path specifying the server + * @unc: UNC path specifying the server (with '/' as delimiter) * @ip_addr: Where to return the IP address. * * The IP address will be returned in string form, and the caller is @@ -64,7 +64,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) hostname = unc + 2; /* Search for server name delimiter */ - sep = memchr(hostname, '\\', len); + sep = memchr(hostname, '/', len); if (sep) len = sep - hostname; else -- cgit v1.2.2 From e9b376671910d105c5e61103111b96209c729529 Mon Sep 17 00:00:00 2001 From: Vahram Martirosyan Date: Fri, 24 May 2013 13:57:12 +0500 Subject: jfs: Several bugs in jfs_freeze() and jfs_unfreeze() The mentioned functions do not pay attention to the error codes returned by the functions updateSuper(), lmLogInit() and lmLogShutdown(). It brings to system crash later when writing to log. The patch adds corresponding code to check and return the error codes and to print correct error messages in case of errors. Found by Linux File System Verification project (linuxtesting.org). Signed-off-by: Vahram Martirosyan Reviewed-by: Gu Zheng Signed-off-by: Dave Kleikamp --- fs/jfs/super.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2003e830ed1c..788e0a9c1fb0 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -611,11 +611,28 @@ static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; + int rc = 0; if (!(sb->s_flags & MS_RDONLY)) { txQuiesce(sb); - lmLogShutdown(log); - updateSuper(sb, FM_CLEAN); + rc = lmLogShutdown(log); + if (rc) { + jfs_error(sb, "jfs_freeze: lmLogShutdown failed"); + + /* let operations fail rather than hang */ + txResume(sb); + + return rc; + } + rc = updateSuper(sb, FM_CLEAN); + if (rc) { + jfs_err("jfs_freeze: updateSuper failed\n"); + /* + * Don't fail here. Everything succeeded except + * marking the superblock clean, so there's really + * no harm in leaving it frozen for now. + */ + } } return 0; } @@ -627,13 +644,18 @@ static int jfs_unfreeze(struct super_block *sb) int rc = 0; if (!(sb->s_flags & MS_RDONLY)) { - updateSuper(sb, FM_MOUNT); - if ((rc = lmLogInit(log))) - jfs_err("jfs_unlock failed with return code %d", rc); - else - txResume(sb); + rc = updateSuper(sb, FM_MOUNT); + if (rc) { + jfs_error(sb, "jfs_unfreeze: updateSuper failed"); + goto out; + } + rc = lmLogInit(log); + if (rc) + jfs_error(sb, "jfs_unfreeze: lmLogInit failed"); +out: + txResume(sb); } - return 0; + return rc; } static struct dentry *jfs_do_mount(struct file_system_type *fs_type, -- cgit v1.2.2 From 95bbb82f60c80808e5a49d8233c2de8451901531 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 23 May 2013 16:14:19 +0800 Subject: fs/jfs: Add check if journaling to disk has been disabled in lbmRead() Signed-off-by: Gu Zheng Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_logmgr.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index c57499dca89c..360d27c48887 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2009,7 +2009,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - submit_bio(READ_SYNC, bio); + /*check if journaling to disk has been disabled*/ + if (log->no_integrity) { + bio->bi_size = 0; + lbmIODone(bio, 0); + } else { + submit_bio(READ_SYNC, bio); + } wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); -- cgit v1.2.2 From 480d7467e4aaa3dc38088baf56bc3eb3599f5d26 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:08 +1000 Subject: xfs: fix sub-page blocksize data integrity writes FSX on 512 byte block size filesystems has been failing for some time with corrupted data. The fault dates back to the change in the writeback data integrity algorithm that uses a mark-and-sweep approach to avoid data writeback livelocks. Unfortunately, a side effect of this mark-and-sweep approach is that each page will only be written once for a data integrity sync, and there is a condition in writeback in XFS where a page may require two writeback attempts to be fully written. As a result of the high level change, we now only get a partial page writeback during the integrity sync because the first pass through writeback clears the mark left on the page index to tell writeback that the page needs writeback.... The cause is writing a partial page in the clustering code. This can happen when a mapping boundary falls in the middle of a page - we end up writing back the first part of the page that the mapping covers, but then never revisit the page to have the remainder mapped and written. The fix is simple - if the mapping boundary falls inside a page, then simple abort clustering without touching the page. This means that the next ->writepage entry that write_cache_pages() will make is the page we aborted on, and xfs_vm_writepage() will map all sections of the page correctly. This behaviour is also optimal for non-data integrity writes, as it results in contiguous sequential writeback of the file rather than missing small holes and having to write them a "random" writes in a future pass. With this fix, all the fsx tests in xfstests now pass on a 512 byte block size filesystem on a 4k page machine. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 49b137cbbcc836ef231866c137d24f42c42bb483) --- fs/xfs/xfs_aops.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2b2691b73428..41a695048be7 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -725,6 +725,25 @@ xfs_convert_page( (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, i_size_read(inode)); + /* + * If the current map does not span the entire page we are about to try + * to write, then give up. The only way we can write a page that spans + * multiple mappings in a single writeback iteration is via the + * xfs_vm_writepage() function. Data integrity writeback requires the + * entire page to be written in a single attempt, otherwise the part of + * the page we don't write here doesn't get written as part of the data + * integrity sync. + * + * For normal writeback, we also don't attempt to write partial pages + * here as it simply means that write_cache_pages() will see it under + * writeback and ignore the page until some point in the future, at + * which time this will be the only page in the file that needs + * writeback. Hence for more optimal IO patterns, we should always + * avoid partial page writeback due to multiple mappings on a page here. + */ + if (!xfs_imap_valid(inode, imap, end_offset)) + goto fail_unlock_page; + len = 1 << inode->i_blkbits; p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), PAGE_CACHE_SIZE); -- cgit v1.2.2 From 7031d0e1c46e2b1c869458233dd216cb72af41b2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:09 +1000 Subject: xfs: fix rounding in xfs_free_file_space The offset passed into xfs_free_file_space() needs to be rounded down to a certain size, but the rounding mask is built by a 32 bit variable. Hence the mask will always mask off the upper 32 bits of the offset and lead to incorrect writeback and invalidation ranges. This is not actually exposed as a bug because we writeback and invalidate from the rounded offset to the end of the file, and hence the offset we are actually punching a hole out of will always be covered by the code. This needs fixing, however, if we ever want to use exact ranges for writeback/invalidation here... Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 28ca489c63e9aceed8801d2f82d731b3c9aa50f5) --- fs/xfs/xfs_vnodeops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 1501f4fa51a6..0176bb21f09a 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1453,7 +1453,7 @@ xfs_free_file_space( xfs_mount_t *mp; int nimap; uint resblks; - uint rounding; + xfs_off_t rounding; int rt; xfs_fileoff_t startoffset_fsb; xfs_trans_t *tp; @@ -1482,7 +1482,7 @@ xfs_free_file_space( inode_dio_wait(VFS_I(ip)); } - rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, -1); -- cgit v1.2.2 From 509e708a8929c5b75a16c985c03db5329e09cad4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:10 +1000 Subject: xfs: Don't reference the EFI after it is freed Checking the EFI for whether it is being released from recovery after we've already released the known active reference is a mistake worthy of a brown paper bag. Fix the (now) obvious use after free that it can cause. Reported-by: Dave Jones Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 52c24ad39ff02d7bd73c92eb0c926fb44984a41d) --- fs/xfs/xfs_extfree_item.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index c0f375087efc..452920a3f03f 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -305,11 +305,12 @@ xfs_efi_release(xfs_efi_log_item_t *efip, { ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { - __xfs_efi_release(efip); - /* recovery needs us to drop the EFI reference, too */ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) __xfs_efi_release(efip); + + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ } } -- cgit v1.2.2 From b17cb364dbbbf65add79f1610599d01bcb6851f9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:12 +1000 Subject: xfs: fix missing KM_NOFS tags to keep lockdep happy There are several places where we use KM_SLEEP allocation contexts and use the fact that they are called from transaction context to add KM_NOFS where appropriate. Unfortunately, there are several places where the code makes this assumption but can be called from outside transaction context but with filesystem locks held. These places need explicit KM_NOFS annotations to avoid lockdep complaining about reclaim contexts. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit ac14876cf9255175bf3bdad645bf8aa2b8fb2d7c) --- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_da_btree.c | 6 ++++-- fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_log_cil.c | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 82b70bda9f47..0d2554299688 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1649,7 +1649,7 @@ xfs_alloc_buftarg( { xfs_buftarg_t *btp; - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9b26a99ebfe9..41ea7e14a7b6 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2464,7 +2464,8 @@ xfs_buf_map_from_irec( ASSERT(nirecs >= 1); if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP); + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), + KM_SLEEP | KM_NOFS); if (!map) return ENOMEM; *mapp = map; @@ -2520,7 +2521,8 @@ xfs_dabuf_map( * Optimize the one-block case. */ if (nfsb != 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP); + irecs = kmem_zalloc(sizeof(irec) * nfsb, + KM_SLEEP | KM_NOFS); nirecs = nfsb; error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 721ba2fe8e54..da71a1819d78 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1336,7 +1336,7 @@ xfs_dir2_leaf_getdents( mp->m_sb.sb_blocksize); map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + (length * sizeof(struct xfs_bmbt_irec)), - KM_SLEEP); + KM_SLEEP | KM_NOFS); map_info->map_size = length; /* diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index e3d0b85d852b..d0833b54e55d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -139,7 +139,7 @@ xlog_cil_prepare_log_vecs( new_lv = kmem_zalloc(sizeof(*new_lv) + niovecs * sizeof(struct xfs_log_iovec), - KM_SLEEP); + KM_SLEEP|KM_NOFS); /* The allocated iovec region lies beyond the log vector. */ new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; -- cgit v1.2.2 From 7ced60cae46cb37273a03c196e6f473b089bd8e1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:13 +1000 Subject: xfs: xfs_da3_node_read_verify() doesn't handle XFS_ATTR3_LEAF_MAGIC Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 72916fb8cbcf0c2928f56cdc2fbe8c7bf5517758) --- fs/xfs/xfs_da_btree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 41ea7e14a7b6..0b8b2a13cd24 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -270,6 +270,7 @@ xfs_da3_node_read_verify( break; return; case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: bp->b_ops = &xfs_attr3_leaf_buf_ops; bp->b_ops->verify_read(bp); return; -- cgit v1.2.2 From cf257abf02709dba3cc745d950f144ce49432b4f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:14 +1000 Subject: xfs: xfs_attr_shortform_allfit() does not handle attr3 format. xfstests generic/117 fails with: XFS: Assertion failed: leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) indicating a function that does not handle the attr3 format correctly. Fix it. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit b38958d715316031fe9ea0cc6c22043072a55f49) --- fs/xfs/xfs_attr_leaf.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 08d5457c948e..8eeb88fb3201 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -931,20 +931,22 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) */ int xfs_attr_shortform_allfit( - struct xfs_buf *bp, - struct xfs_inode *dp) + struct xfs_buf *bp, + struct xfs_inode *dp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; xfs_attr_leaf_name_local_t *name_loc; - int bytes, i; + struct xfs_attr3_icleaf_hdr leafhdr; + int bytes; + int i; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); - entry = &leaf->entries[0]; bytes = sizeof(struct xfs_attr_sf_hdr); - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + for (i = 0; i < leafhdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) @@ -954,15 +956,15 @@ xfs_attr_shortform_allfit( return(0); if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); - bytes += sizeof(struct xfs_attr_sf_entry)-1 + bytes += sizeof(struct xfs_attr_sf_entry) - 1 + name_loc->namelen + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) - return(-1); - return(xfs_attr_shortform_bytesfit(dp, bytes)); + return -1; + return xfs_attr_shortform_bytesfit(dp, bytes); } /* -- cgit v1.2.2 From 7ae077802c9f12959a81fa1a16c1ec2842dbae05 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:16 +1000 Subject: xfs: remote attribute lookups require the value length When reading a remote attribute, to correctly calculate the length of the data buffer for CRC enable filesystems, we need to know the length of the attribute data. We get this information when we look up the attribute, but we don't store it in the args structure along with the other remote attr information we get from the lookup. Add this information to the args structure so we can use it appropriately. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit e461fcb194172b3f709e0b478d2ac1bdac7ab9a3) --- fs/xfs/xfs_attr_leaf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 8eeb88fb3201..0bce1b348580 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2332,9 +2332,10 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; + args->valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); + args->valuelen); return XFS_ERROR(EEXIST); } } -- cgit v1.2.2 From c15cddd900e867c5adfb3c79596479dc5975f743 Mon Sep 17 00:00:00 2001 From: Paul Taysom Date: Thu, 23 May 2013 14:31:43 -0700 Subject: ecryptfs: fixed msync to flush data When msync is called on a memory mapped file, that data is not flushed to the disk. In Linux, msync calls fsync for the file. For ecryptfs, fsync just calls the lower level file system's fsync. Changed the ecryptfs fsync code to call filemap_write_and_wait before calling the lower level fsync. Addresses the problem described in http://crbug.com/239536 Signed-off-by: Paul Taysom Signed-off-by: Tyler Hicks Cc: stable@vger.kernel.org # v3.6+ --- fs/ecryptfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 201f0a0d6b0a..16f509d6fa49 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -295,6 +295,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { + filemap_write_and_wait(file->f_mapping); return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } -- cgit v1.2.2 From 7b92d03c3239f43e5b86c9cc9630f026d36ee995 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 24 May 2013 15:55:08 -0700 Subject: fat: fix possible overflow for fat_clusters Intermediate value of fat_clusters can be overflowed on 32bits arch. Reported-by: Krzysztof Strasburger Signed-off-by: OGAWA Hirofumi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index dfce656ddb33..5d4513cb1b3c 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1229,6 +1229,19 @@ static int fat_read_root(struct inode *inode) return 0; } +static unsigned long calc_fat_clusters(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + /* Divide first to avoid overflow */ + if (sbi->fat_bits != 12) { + unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits; + return ent_per_sec * sbi->fat_length; + } + + return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; +} + /* * Read the super block of an MS-DOS FS. */ @@ -1434,7 +1447,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->dirty = b->fat16.state & FAT_STATE_DIRTY; /* check that FAT table does not overflow */ - fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; + fat_clusters = calc_fat_clusters(sb); total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); if (total_clusters > MAX_FAT(sb)) { if (!silent) -- cgit v1.2.2 From afe1bb73f8ed588ab6268c27c5a447fe0484e48f Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 24 May 2013 15:55:12 -0700 Subject: ocfs2: unlock rw lock if inode lock failed In ocfs2_file_aio_write(), it does ocfs2_rw_lock() first and then ocfs2_inode_lock(). But if ocfs2_inode_lock() failed, it goes to out_sems without unlocking rw lock. This will cause a bug in ocfs2_lock_res_free() when testing res->l_ex_holders, which is increased in __ocfs2_cluster_lock() and decreased in __ocfs2_cluster_unlock(). Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Cc: Li Zefan Cc: "Duyongfeng (B)" Acked-by: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8a7509f9e6f5..ff54014a24ec 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2288,7 +2288,7 @@ relock: ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { mlog_errno(ret); - goto out_sems; + goto out; } ocfs2_inode_unlock(inode, 1); -- cgit v1.2.2 From fb09c3733a94b5f1dba50359d09c9e217c763fb9 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 24 May 2013 15:55:16 -0700 Subject: hfs: avoid crash in hfs_bnode_create Commit 634725a92938 ("hfs: cleanup HFS+ prints") removed the BUG_ON in hfs_bnode_create in hfsplus. This patch removes it from the hfs version and avoids an fsfuzzer crash. Signed-off-by: Jeff Mahoney Acked-by: Jeff Mahoney Signed-off-by: Jiri Slaby Cc: Vyacheslav Dubeyko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/bnode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index f3b1a15ccd59..d3fa6bd9503e 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -415,7 +415,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, num); spin_unlock(&tree->hash_lock); - BUG_ON(node); + if (node) { + pr_crit("new node %u already hashed?\n", num); + WARN_ON(1); + return node; + } node = __hfs_bnode_create(tree, num); if (!node) return ERR_PTR(-ENOMEM); -- cgit v1.2.2 From 6900807c6b95dcb004902302b8ac5dbfbf6feb89 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 24 May 2013 15:55:24 -0700 Subject: aio: fix io_getevents documentation In reviewing man pages, I noticed that io_getevents is documented to update the timeout that gets passed into the library call. This doesn't happen in kernel space or in the library (even though it's documented to do so in both places). Unless there is objection, I'd like to fix the comments/docs to match the code (I will also update the man page upon consensus). Signed-off-by: Jeff Moyer Signed-off-by: Benjamin LaHaise Acked-by: Cyril Hrubis Acked-by: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index c5b1a8c10411..3fcdd73b6f1b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1299,8 +1299,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, * < min_nr if the timeout specified by timeout has elapsed * before sufficient events are available, where timeout == NULL * specifies an infinite timeout. Note that the timeout pointed to by - * timeout is relative and will be updated if not NULL and the - * operation blocks. Will fail with -ENOSYS if not implemented. + * timeout is relative. Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, -- cgit v1.2.2 From 136e8770cd5d1fe38b3c613100dd6dc4db6d4fa6 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 24 May 2013 15:55:29 -0700 Subject: nilfs2: fix issue of nilfs_set_page_dirty() for page at EOF boundary nilfs2: fix issue of nilfs_set_page_dirty for page at EOF boundary DESCRIPTION: There are use-cases when NILFS2 file system (formatted with block size lesser than 4 KB) can be remounted in RO mode because of encountering of "broken bmap" issue. The issue was reported by Anthony Doggett : "The machine I've been trialling nilfs on is running Debian Testing, Linux version 3.2.0-4-686-pae (debian-kernel@lists.debian.org) (gcc version 4.6.3 (Debian 4.6.3-14) ) #1 SMP Debian 3.2.35-2), but I've also reproduced it (identically) with Debian Unstable amd64 and Debian Experimental (using the 3.8-trunk kernel). The problematic partitions were formatted with "mkfs.nilfs2 -b 1024 -B 8192"." SYMPTOMS: (1) System log contains error messages likewise: [63102.496756] nilfs_direct_assign: invalid pointer: 0 [63102.496786] NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28) [63102.496798] [63102.524403] Remounting filesystem read-only (2) The NILFS2 file system is remounted in RO mode. REPRODUSING PATH: (1) Create volume group with name "unencrypted" by means of vgcreate utility. (2) Run script (prepared by Anthony Doggett ): ----------------[BEGIN SCRIPT]-------------------- VG=unencrypted lvcreate --size 2G --name ntest $VG mkfs.nilfs2 -b 1024 -B 8192 /dev/mapper/$VG-ntest mkdir /var/tmp/n mkdir /var/tmp/n/ntest mount /dev/mapper/$VG-ntest /var/tmp/n/ntest mkdir /var/tmp/n/ntest/thedir cd /var/tmp/n/ntest/thedir sleep 2 date darcs init sleep 2 dmesg|tail -n 5 date darcs whatsnew || true date sleep 2 dmesg|tail -n 5 ----------------[END SCRIPT]-------------------- REPRODUCIBILITY: 100% INVESTIGATION: As it was discovered, the issue takes place during segment construction after executing such sequence of user-space operations: open("_darcs/index", O_RDWR|O_CREAT|O_NOCTTY, 0666) = 7 fstat(7, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0 ftruncate(7, 60) The error message "NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28)" takes place because of trying to get block number for third block of the file with logical offset #3072 bytes. As it is possible to see from above output, the file has 60 bytes of the whole size. So, it is enough one block (1 KB in size) allocation for the whole file. Trying to operate with several blocks instead of one takes place because of discovering several dirty buffers for this file in nilfs_segctor_scan_file() method. The root cause of this issue is in nilfs_set_page_dirty function which is called just before writing to an mmapped page. When nilfs_page_mkwrite function handles a page at EOF boundary, it fills hole blocks only inside EOF through __block_page_mkwrite(). The __block_page_mkwrite() function calls set_page_dirty() after filling hole blocks, thus nilfs_set_page_dirty function (= a_ops->set_page_dirty) is called. However, the current implementation of nilfs_set_page_dirty() wrongly marks all buffers dirty even for page at EOF boundary. As a result, buffers outside EOF are inconsistently marked dirty and queued for write even though they are not mapped with nilfs_get_block function. FIX: This modifies nilfs_set_page_dirty() not to mark hole blocks dirty. Thanks to Vyacheslav Dubeyko for his effort on analysis and proposals for this issue. Signed-off-by: Ryusuke Konishi Reported-by: Anthony Doggett Reported-by: Vyacheslav Dubeyko Cc: Vyacheslav Dubeyko Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/inode.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 689fb608648e..bccfec8343c5 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -219,13 +219,32 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) static int nilfs_set_page_dirty(struct page *page) { - int ret = __set_page_dirty_buffers(page); + int ret = __set_page_dirty_nobuffers(page); - if (ret) { + if (page_has_buffers(page)) { struct inode *inode = page->mapping->host; - unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + unsigned nr_dirty = 0; + struct buffer_head *bh, *head; - nilfs_set_file_dirty(inode, nr_dirty); + /* + * This page is locked by callers, and no other thread + * concurrently marks its buffers dirty since they are + * only dirtied through routines in fs/buffer.c in + * which call sites of mark_buffer_dirty are protected + * by page lock. + */ + bh = head = page_buffers(page); + do { + /* Do not mark hole blocks dirty */ + if (buffer_dirty(bh) || !buffer_mapped(bh)) + continue; + + set_buffer_dirty(bh); + nr_dirty++; + } while (bh = bh->b_this_page, bh != head); + + if (nr_dirty) + nilfs_set_file_dirty(inode, nr_dirty); } return ret; } -- cgit v1.2.2 From b4ca2b4b577c3530e34dcfaafccb2cc680ce95d1 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 24 May 2013 15:55:34 -0700 Subject: ocfs2: goto out_unlock if ocfs2_get_clusters_nocache() failed in ocfs2_fiemap() Last time we found there is lock/unlock bug in ocfs2_file_aio_write, and then we did a thorough search for all lock resources in ocfs2_inode_info, including rw, inode and open lockres and found this bug. My kernel version is 3.0.13, and it is also in the lastest version 3.9. In ocfs2_fiemap, once ocfs2_get_clusters_nocache failed, it should goto out_unlock instead of out, because we need release buffer head, up read alloc sem and unlock inode. Signed-off-by: Joseph Qi Reviewed-by: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Acked-by: Sunil Mushran Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/extent_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 1c39efb71bab..2487116d0d33 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -790,7 +790,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, &hole_size, &rec, &is_last); if (ret) { mlog_errno(ret); - goto out; + goto out_unlock; } if (rec.e_blkno == 0ULL) { -- cgit v1.2.2 From 03e04f048d2774aabd126fbad84729d4ba9dc40a Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Fri, 24 May 2013 15:55:38 -0700 Subject: aio: fix kioctx not being freed after cancellation at exit time The recent changes overhauling fs/aio.c introduced a bug that results in the kioctx not being freed when outstanding kiocbs are cancelled at exit_aio() time. Specifically, a kiocb that is cancelled has its completion events discarded by batch_complete_aio(), which then fails to wake up the process stuck in free_ioctx(). Fix this by modifying the wait_event() condition in free_ioctx() appropriately. This patch was tested with the cancel operation in the thread based code posted yesterday. [akpm@linux-foundation.org: fix build] Signed-off-by: Benjamin LaHaise Signed-off-by: Kent Overstreet Cc: Kent Overstreet Cc: Josh Boyer Cc: Zach Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 3fcdd73b6f1b..7fe5bdee1630 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -307,7 +307,9 @@ static void free_ioctx(struct kioctx *ctx) kunmap_atomic(ring); while (atomic_read(&ctx->reqs_active) > 0) { - wait_event(ctx->wait, head != ctx->tail); + wait_event(ctx->wait, + head != ctx->tail || + atomic_read(&ctx->reqs_active) <= 0); avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; -- cgit v1.2.2 From 15ef0298deb3929eb6ad6d2334fd2059fd53807c Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 17 May 2013 02:12:03 +0400 Subject: posix-timers: Show clock ID in proc file Expand information about posix-timers in /proc//timers by adding info about clock, with which the timer was created. I.e. in the forth line of timer info after "notify:" line go "ClockID: ". Signed-off-by: Pavel Tikhomirov Cc: Michael Kerrisk Cc: Matthew Helsley Cc: Pavel Emelyanov Link: http://lkml.kernel.org/r/1368742323-46949-2-git-send-email-snorcht@gmail.com Signed-off-by: Thomas Gleixner --- fs/proc/base.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index dd51e50001fe..c3834dad09b3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2118,6 +2118,7 @@ static int show_timer(struct seq_file *m, void *v) nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? "tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); + seq_printf(m, "ClockID: %d\n", timer->it_clock); return 0; } -- cgit v1.2.2 From f448badd34700ae728a32ba024249626d49c10e1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 29 May 2013 15:36:40 -0400 Subject: NFSv4: Fix a thinko in nfs4_try_open_cached We need to pass the full open mode flags to nfs_may_open() when doing a delegated open. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4e2fe714d5c2..d7ba5616989c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1078,7 +1078,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs4_state *state = opendata->state; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; - int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC); + int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; nfs4_stateid stateid; int ret = -EAGAIN; -- cgit v1.2.2 From eb54d43707c69340581940e1fcaecb4d7d17b814 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 May 2013 14:37:56 -0400 Subject: NFS: Fix security flavor negotiation with legacy binary mounts Darrick J. Wong reports: > I have a kvm-based testing setup that netboots VMs over NFS, the > client end of which seems to have broken somehow in 3.10-rc1. The > server's exports file looks like this: > > /storage/mtr/x64 192.168.122.0/24(ro,sync,no_root_squash,no_subtree_check) > > On the client end (inside the VM), the initrd runs the following > command to try to mount the rootfs over NFS: > > # mount -o nolock -o ro -o retrans=10 192.168.122.1:/storage/mtr/x64/ /root > > (Note: This is the busybox mount command.) > > The mount fails with -EINVAL. Commit 4580a92d44 "NFS: Use server-recommended security flavor by default (NFSv3)" introduced a behavior regression for NFS mounts done via a legacy binary mount(2) call. Ensure that a default security flavor is specified for legacy binary mount requests, since they do not invoke nfs_select_flavor() in the kernel. Busybox uses klibc's nfsmount command, which performs NFS mounts using the legacy binary mount data format. /sbin/mount.nfs is not affected by this regression. Reported-by: Darrick J. Wong Signed-off-by: Chuck Lever Tested-by: Darrick J. Wong Acked-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a366107a7331..2d7525fbcf25 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1942,6 +1942,7 @@ static int nfs23_validate_mount_data(void *options, args->namlen = data->namlen; args->bsize = data->bsize; + args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->flags & NFS_MOUNT_SECFLAVOUR) args->auth_flavors[0] = data->pseudoflavor; if (!args->nfs_server.hostname) @@ -2637,6 +2638,7 @@ static int nfs4_validate_mount_data(void *options, goto out_no_address; args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); + args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->auth_flavourlen) { if (data->auth_flavourlen > 1) goto out_inval_auth; -- cgit v1.2.2 From 08fb39051f5581df45ae2a20c6cf2d0c4cddf7c2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:00 +1000 Subject: xfs: avoid nesting transactions in xfs_qm_scall_setqlim() Lockdep reports: ============================================= [ INFO: possible recursive locking detected ] 3.9.0+ #3 Not tainted --------------------------------------------- setquota/28368 is trying to acquire lock: (sb_internal){++++.?}, at: [] xfs_trans_alloc+0x26/0x50 but task is already holding lock: (sb_internal){++++.?}, at: [] xfs_trans_alloc+0x26/0x50 from xfs_qm_scall_setqlim()->xfs_dqread() when a dquot needs to be allocated. xfs_qm_scall_setqlim() is starting a transaction and then not passing it into xfs_qm_dqet() and so it starts it's own transaction when allocating the dquot. Splat! Fix this by not allocating the dquot in xfs_qm_scall_setqlim() inside the setqlim transaction. This requires getting the dquot first (and allocating it if necessary) then dropping and relocking the dquot before joining it to the setqlim transaction. Reported-by: Michael L. Semon Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit f648167f3ac79018c210112508c732ea9bf67c7b) --- fs/xfs/xfs_qm_syscalls.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index c41190cad6e9..6cdf6ffc36a1 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -489,31 +489,36 @@ xfs_qm_scall_setqlim( if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) return 0; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), - 0, 0, XFS_DEFAULT_LOG_COUNT); - if (error) { - xfs_trans_cancel(tp, 0); - return (error); - } - /* * We don't want to race with a quotaoff so take the quotaoff lock. - * (We don't hold an inode lock, so there's nothing else to stop - * a quotaoff from happening). (XXXThis doesn't currently happen - * because we take the vfslock before calling xfs_qm_sysent). + * We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening. */ mutex_lock(&q->qi_quotaofflock); /* - * Get the dquot (locked), and join it to the transaction. - * Allocate the dquot if this doesn't exist. + * Get the dquot (locked) before we start, as we need to do a + * transaction to allocate it if it doesn't exist. Once we have the + * dquot, unlock it so we can start the next transaction safely. We hold + * a reference to the dquot, so it's safe to do this unlock/lock without + * it being reclaimed in the mean time. */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + if (error) { ASSERT(error != ENOENT); goto out_unlock; } + xfs_dqunlock(dqp); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_rele; + } + + xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -621,9 +626,10 @@ xfs_qm_scall_setqlim( xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp, 0); - xfs_qm_dqrele(dqp); - out_unlock: +out_rele: + xfs_qm_dqrele(dqp); +out_unlock: mutex_unlock(&q->qi_quotaofflock); return error; } -- cgit v1.2.2 From 2962f5a5dcc56f69cbf62121a7be67cc15d6940b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:25 +1000 Subject: xfs: kill suid/sgid through the truncate path. XFS has failed to kill suid/sgid bits correctly when truncating files of non-zero size since commit c4ed4243 ("xfs: split xfs_setattr") introduced in the 3.1 kernel. Fix it. Fix it. cc: stable kernel Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 56c19e89b38618390addfc743d822f99519055c6) --- fs/xfs/xfs_iops.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d82efaa2ac73..ca9ecaa81112 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -455,6 +455,28 @@ xfs_vn_getattr( return 0; } +static void +xfs_setattr_mode( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; + + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -606,18 +628,8 @@ xfs_setattr_nonsize( /* * Change file access modes. */ - if (mask & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; - } + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); /* * Change file access or modified times. @@ -714,9 +726,8 @@ xfs_setattr_size( return XFS_ERROR(error); ASSERT(S_ISREG(ip->i_d.di_mode)); - ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| - ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); if (!(flags & XFS_ATTR_NOLOCK)) { lock_flags |= XFS_IOLOCK_EXCL; @@ -860,6 +871,12 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } + /* + * Change file access modes. + */ + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); + if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; -- cgit v1.2.2 From 7d2ffe80aa000a149246b3745968634192eb5358 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:23 +1000 Subject: xfs: fix split buffer vector log recovery support A long time ago in a galaxy far away.... .. the was a commit made to fix some ilinux specific "fragmented buffer" log recovery problem: http://oss.sgi.com/cgi-bin/gitweb.cgi?p=archive/xfs-import.git;a=commitdiff;h=b29c0bece51da72fb3ff3b61391a391ea54e1603 That problem occurred when a contiguous dirty region of a buffer was split across across two pages of an unmapped buffer. It's been a long time since that has been done in XFS, and the changes to log the entire inode buffers for CRC enabled filesystems has re-introduced that corner case. And, of course, it turns out that the above commit didn't actually fix anything - it just ensured that log recovery is guaranteed to fail when this situation occurs. And now for the gory details. xfstest xfs/085 is failing with this assert: XFS (vdb): bad number of regions (0) in inode log format XFS: Assertion failed: 0, file: fs/xfs/xfs_log_recover.c, line: 1583 Largely undocumented factoid #1: Log recovery depends on all log buffer format items starting with this format: struct foo_log_format { __uint16_t type; __uint16_t size; .... As recoery uses the size field and assumptions about 32 bit alignment in decoding format items. So don't pay much attention to the fact log recovery thinks that it decoding an inode log format item - it just uses them to determine what the size of the item is. But why would it see a log format item with a zero size? Well, luckily enough xfs_logprint uses the same code and gives the same error, so with a bit of gdb magic, it turns out that it isn't a log format that is being decoded. What logprint tells us is this: Oper (130): tid: a0375e1a len: 28 clientid: TRANS flags: none BUF: #regs: 2 start blkno: 144 (0x90) len: 16 bmap size: 2 flags: 0x4000 Oper (131): tid: a0375e1a len: 4096 clientid: TRANS flags: none BUF DATA ---------------------------------------------------------------------------- Oper (132): tid: a0375e1a len: 4096 clientid: TRANS flags: none xfs_logprint: unknown log operation type (4e49) ********************************************************************** * ERROR: data block=2 * ********************************************************************** That we've got a buffer format item (oper 130) that has two regions; the format item itself and one dirty region. The subsequent region after the buffer format item and it's data is them what we are tripping over, and the first bytes of it at an inode magic number. Not a log opheader like there is supposed to be. That means there's a problem with the buffer format item. It's dirty data region is 4096 bytes, and it contains - you guessed it - initialised inodes. But inode buffers are 8k, not 4k, and we log them in their entirety. So something is wrong here. The buffer format item contains: (gdb) p /x *(struct xfs_buf_log_format *)in_f $22 = {blf_type = 0x123c, blf_size = 0x2, blf_flags = 0x4000, blf_len = 0x10, blf_blkno = 0x90, blf_map_size = 0x2, blf_data_map = {0xffffffff, 0xffffffff, .... }} Two regions, and a signle dirty contiguous region of 64 bits. 64 * 128 = 8k, so this should be followed by a single 8k region of data. And the blf_flags tell us that the type of buffer is a XFS_BLFT_DINO_BUF. It contains inodes. And because it doesn't have the XFS_BLF_INODE_BUF flag set, that means it's an inode allocation buffer. So, it should be followed by 8k of inode data. But we know that the next region has a header of: (gdb) p /x *ohead $25 = {oh_tid = 0x1a5e37a0, oh_len = 0x100000, oh_clientid = 0x69, oh_flags = 0x0, oh_res2 = 0x0} and so be32_to_cpu(oh_len) = 0x1000 = 4096 bytes. It's simply not long enough to hold all the logged data. There must be another region. There is - there's a following opheader for another 4k of data that contains the other half of the inode cluster data - the one we assert fail on because it's not a log format header. So why is the second part of the data not being accounted to the correct buffer log format structure? It took a little more work with gdb to work out that the buffer log format structure was both expecting it to be there but hadn't accounted for it. It was at that point I went to the kernel code, as clearly this wasn't a bug in xfs_logprint and the kernel was writing bad stuff to the log. First port of call was the buffer item formatting code, and the discontiguous memory/contiguous dirty region handling code immediately stood out. I've wondered for a long time why the code had this comment in it: vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; /* * You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary * split here * nvecs++; */ vecp++; And it didn't account for the extra vector pointer. The case being handled here is that a contiguous dirty region lies across a boundary that cannot be memcpy()d across, and so has to be split into two separate operations for xlog_write() to perform. What this code assumes is that what is written to the log is two consecutive blocks of data that are accounted in the buf log format item as the same contiguous dirty region and so will get decoded as such by the log recovery code. The thing is, xlog_write() knows nothing about this, and so just does it's normal thing of adding an opheader for each vector. That means the 8k region gets written to the log as two separate regions of 4k each, but because nvecs has not been incremented, the buf log format item accounts for only one of them. Hence when we come to log recovery, we process the first 4k region and then expect to come across a new item that starts with a log format structure of some kind that tells us whenteh next data is going to be. Instead, we hit raw buffer data and things go bad real quick. So, the commit from 2002 that commented out nvecs++ is just plain wrong. It breaks log recovery completely, and it would seem the only reason this hasn't been since then is that we don't log large contigous regions of multi-page unmapped buffers very often. Never would be a closer estimate, at least until the CRC code came along.... So, lets fix that by restoring the nvecs accounting for the extra region when we hit this case..... .... and there's the problemin log recovery it is apparently working around: XFS: Assertion failed: i == item->ri_total, file: fs/xfs/xfs_log_recover.c, line: 2135 Yup, xlog_recover_do_reg_buffer() doesn't handle contigous dirty regions being broken up into multiple regions by the log formatting code. That's an easy fix, though - if the number of contiguous dirty bits exceeds the length of the region being copied out of the log, only account for the number of dirty bits that region covers, and then loop again and copy more from the next region. It's a 2 line fix. Now xfstests xfs/085 passes, we have one less piece of mystery code, and one more important piece of knowledge about how to structure new log format items.. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit 709da6a61aaf12181a8eea8443919ae5fc1b731d) --- fs/xfs/xfs_buf_item.c | 7 +------ fs/xfs/xfs_log_recover.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cf263476d6b4..4ec431777048 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -262,12 +262,7 @@ xfs_buf_item_format_segment( vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* - * You would think we need to bump the nvecs here too, but we do not - * this number is used by recovery, and it gets confused by the boundary - * split here - * nvecs++; - */ + nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 93f03ec17eec..d9e4d3c3991a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2096,6 +2096,17 @@ xlog_recover_do_reg_buffer( ASSERT(BBTOB(bp->b_io_length) >= ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + /* * Do a sanity check if this is a dquot buffer. Just checking * the first dquot in the buffer should do. XXXThis is -- cgit v1.2.2 From 1de09d1ae48152e56399aba0bfd984fb0ddae6b0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:20 +1000 Subject: xfs: fix incorrect remote symlink block count When CRCs are enabled, the number of blocks needed to hold a remote symlink on a 1k block size filesystem may be 2 instead of 1. The transaction reservation for the allocated blocks was not taking this into account and only allocating one block. Hence when trying to read or invalidate such symlinks, we are mapping a hole where there should be a block and things go bad at that point. Fix the reservation to use the correct block count, clean up the block count calculation similar to the remote attribute calculation, and add a debug guard to detect when we don't write the entire symlink to disk. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 321a95839e65db3759a07a3655184b0283af90fe) --- fs/xfs/xfs_symlink.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 5f234389327c..195a403e1522 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -56,16 +56,9 @@ xfs_symlink_blocks( struct xfs_mount *mp, int pathlen) { - int fsblocks = 0; - int len = pathlen; + int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - do { - fsblocks++; - len -= XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - } while (len > 0); - - ASSERT(fsblocks <= XFS_SYMLINK_MAPS); - return fsblocks; + return (pathlen + buflen - 1) / buflen; } static int @@ -405,7 +398,7 @@ xfs_symlink( if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) fs_blocks = 0; else - fs_blocks = XFS_B_TO_FSB(mp, pathlen); + fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); @@ -512,7 +505,7 @@ xfs_symlink( cur_chunk = target_path; offset = 0; for (n = 0; n < nmaps; n++) { - char *buf; + char *buf; d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); @@ -525,9 +518,7 @@ xfs_symlink( bp->b_ops = &xfs_symlink_buf_ops; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); - if (pathlen < byte_cnt) { - byte_cnt = pathlen; - } + byte_cnt = min(byte_cnt, pathlen); buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, @@ -542,6 +533,7 @@ xfs_symlink( xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - (char *)bp->b_addr); } + ASSERT(pathlen == 0); } /* -- cgit v1.2.2 From e7927e879d12d27aa06b9bbed57cc32dcd7d17fd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:26 +1000 Subject: xfs: add fsgeom flag for v5 superblock support. Currently userspace has no way of determining that a filesystem is CRC enabled. Add a flag to the XFS_IOC_FSGEOMETRY ioctl output to indicate that the filesystem has v5 superblock support enabled. This will allow xfs_info to correctly report the state of the filesystem. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 74137fff067961c9aca1e14d073805c3de8549bd) --- fs/xfs/xfs_fs.h | 1 + fs/xfs/xfs_fsops.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 6dda3f949b04..d04695545397 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 87595b211da1..3c3644ea825b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -99,7 +99,9 @@ xfs_fs_geometry( (xfs_sb_version_hasattr2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | + (xfs_sb_version_hascrc(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_V5SB : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; -- cgit v1.2.2 From 7c9950fd2ac97431230544142d5e652e1b948372 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:24 +1000 Subject: xfs: disable swap extents ioctl on CRC enabled filesystems Currently, swapping extents from one inode to another is a simple act of switching data and attribute forks from one inode to another. This, unfortunately in no longer so simple with CRC enabled filesystems as there is owner information embedded into the BMBT blocks that are swapped between inodes. Hence swapping the forks between inodes results in the inodes having mapping blocks that point to the wrong owner and hence are considered corrupt. To fix this we need an extent tree block or record based swap algorithm so that the BMBT block owner information can be updated atomically in the swap transaction. This is a significant piece of new work, so for the moment simply don't allow swap extent operations to succeed on CRC enabled filesystems. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 02f75405a75eadfb072609f6bf839e027de6a29a) --- fs/xfs/xfs_dfrag.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index f852b082a084..c407e1ccff43 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -219,6 +219,14 @@ xfs_swap_extents( int taforkblks = 0; __uint64_t tmp; + /* + * We have no way of updating owner information in the BMBT blocks for + * each inode on CRC enabled filesystems, so to avoid corrupting the + * this metadata we simply don't allow extent swaps to occur. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return XFS_ERROR(EINVAL); + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); -- cgit v1.2.2 From e400d27d1690d609f203f2d7d8efebc98cbc3089 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 28 May 2013 18:37:17 +1000 Subject: xfs: fix dir3 freespace block corruption When the directory freespace index grows to a second block (2017 4k data blocks in the directory), the initialisation of the second new block header goes wrong. The write verifier fires a corruption error indicating that the block number in the header is zero. This was being tripped by xfs/110. The problem is that the initialisation of the new block is done just fine in xfs_dir3_free_get_buf(), but the caller then users a dirv2 structure to zero on-disk header fields that xfs_dir3_free_get_buf() has already zeroed. These lined up with the block number in the dir v3 header format. While looking at this, I noticed that the struct xfs_dir3_free_hdr() had 4 bytes of padding in it that wasn't defined as padding or being zeroed by the initialisation. Add a pad field declaration and fully zero the on disk and in-core headers in xfs_dir3_free_get_buf() so that this is never an issue in the future. Note that this doesn't change the on-disk layout, just makes the 32 bits of padding in the layout explicit. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 5ae6e6a401957698f2bd8c9f4a86d86d02199fea) --- fs/xfs/xfs_dir2_format.h | 1 + fs/xfs/xfs_dir2_node.c | 13 ++++++------- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index a3b1bd841a80..995f1f505a52 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -715,6 +715,7 @@ struct xfs_dir3_free_hdr { __be32 firstdb; /* db of first entry */ __be32 nvalid; /* count of valid entries */ __be32 nused; /* count of used entries */ + __be32 pad; /* 64 bit alignment. */ }; struct xfs_dir3_free { diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 5246de4912d4..2226a00acd15 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -263,18 +263,19 @@ xfs_dir3_free_get_buf( * Initialize the new block to be empty, and remember * its first slot as our empty slot. */ - hdr.magic = XFS_DIR2_FREE_MAGIC; - hdr.firstdb = 0; - hdr.nused = 0; - hdr.nvalid = 0; + memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); + memset(&hdr, 0, sizeof(hdr)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; hdr.magic = XFS_DIR3_FREE_MAGIC; + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); - } + } else + hdr.magic = XFS_DIR2_FREE_MAGIC; xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr); *bpp = bp; return 0; @@ -1921,8 +1922,6 @@ xfs_dir2_node_addname_int( */ freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * xfs_dir3_free_max_bests(mp); - free->hdr.nvalid = 0; - free->hdr.nused = 0; } else { free = fbp->b_addr; bests = xfs_dir3_free_bests_p(mp, free); -- cgit v1.2.2 From 9531e2de6b7f04bd734b4bbc1e16a6955121615a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:01 +1000 Subject: xfs: remote attribute allocation may be contiguous When CRCs are enabled, there may be multiple allocations made if the headers cause a length overflow. This, however, does not mean that the number of headers required increases, as the second and subsequent extents may be contiguous with the previous extent. Hence when we map the extents to write the attribute data, we may end up with less extents than allocations made. Hence the assertion that we consume the number of headers we calculated in the allocation loop is incorrect and needs to be removed. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 90253cf142469a40f89f989904abf0a1e500e1a6) --- fs/xfs/xfs_attr_remote.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index dee84466dcc9..aad95b08e76b 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -359,6 +359,11 @@ xfs_attr_rmtval_set( * into requiring more blocks. e.g. for 512 byte blocks, we'll * spill for another block every 9 headers we require in this * loop. + * + * Note that this can result in contiguous allocation of blocks, + * so we don't use all the space we allocate for headers as we + * have one less header for each contiguous allocation that + * occurs in the map/write loop below. */ if (crcs && blkcnt == 0) { int total_len; @@ -439,7 +444,6 @@ xfs_attr_rmtval_set( lblkno += map.br_blockcount; } ASSERT(valuelen == 0); - ASSERT(hdrcnt == 0); return 0; } -- cgit v1.2.2 From 551b382f5368900d6d82983505cb52553c946a2b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:02 +1000 Subject: xfs: remote attribute read too short Reading a maximally size remote attribute fails when CRCs are enabled with this verification error: XFS (vdb): remote attribute header does not match required off/len/owner) There are two reasons for this, the first being that the length of the buffer being read is determined from the args->rmtblkcnt which doesn't take into account CRC headers. Hence the mapped length ends up being too short and so we need to calculate it directly from the value length. The second is that the byte count of valid data within a buffer is capped by the length of the data and so doesn't take into account that the buffer might be longer due to headers. Hence we need to calculate the data space in the buffer first before calculating the actual byte count of data. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 913e96bc292e1bb248854686c79d6545ef3ee720) --- fs/xfs/xfs_attr_remote.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index aad95b08e76b..bcdc07c4e8f4 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -52,9 +52,11 @@ xfs_attr3_rmt_blocks( struct xfs_mount *mp, int attrlen) { - int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, - mp->m_sb.sb_blocksize); - return (attrlen + buflen - 1) / buflen; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; + } + return XFS_B_TO_FSB(mp, attrlen); } static bool @@ -206,8 +208,9 @@ xfs_attr_rmtval_get( while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; + blkcnt = xfs_attr3_rmt_blocks(mp, valuelen); error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, map, &nmap, + blkcnt, map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; @@ -227,8 +230,8 @@ xfs_attr_rmtval_get( if (error) return error; - byte_cnt = min_t(int, valuelen, BBTOB(bp->b_length)); - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); + byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, BBTOB(bp->b_length)); + byte_cnt = min_t(int, valuelen, byte_cnt); src = bp->b_addr; if (xfs_sb_version_hascrc(&mp->m_sb)) { -- cgit v1.2.2 From 26f714450c3907ce07c41a0bd1bea40368e0b4da Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:03 +1000 Subject: xfs: remote attribute tail zeroing does too much When an attribute data does not fill then entire remote block, we zero the remaining part of the buffer. This, however, needs to take into account that the buffer has a header, and so the offset where zeroing starts and the length of zeroing need to take this into account. Otherwise we end up with zeros over the end of the attribute value when CRCs are enabled. While there, make sure we only ask to map an extent that covers the remaining range of the attribute, rather than asking every time for the full length of remote data. If the remote attribute blocks are contiguous with other parts of the attribute tree, it will map those blocks as well and we can potentially zero them incorrectly. We can also get buffer size mistmatches when trying to read or remove the remote attribute, and this can lead to not finding the correct buffer when looking it up in cache. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 4af3644c9a53eb2f1ecf69cc53576561b64be4c6) --- fs/xfs/xfs_attr_remote.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index bcdc07c4e8f4..e207bf0004b6 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -296,10 +296,7 @@ xfs_attr_rmtval_set( * and we may not need that many, so we have to handle this when * allocating the blocks below. */ - if (!crcs) - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - else - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); @@ -394,8 +391,11 @@ xfs_attr_rmtval_set( */ lblkno = args->rmtblkno; valuelen = args->valuelen; + blkcnt = args->rmtblkcnt; while (valuelen > 0) { int byte_cnt; + int hdr_size; + int dblkcnt; char *buf; /* @@ -404,7 +404,7 @@ xfs_attr_rmtval_set( xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return(error); @@ -413,26 +413,25 @@ xfs_attr_rmtval_set( (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); if (!bp) return ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; - - byte_cnt = BBTOB(bp->b_length); - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); - if (valuelen < byte_cnt) - byte_cnt = valuelen; - buf = bp->b_addr; - buf += xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, + + byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, BBTOB(bp->b_length)); + byte_cnt = min_t(int, valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, byte_cnt, bp); - memcpy(buf, src, byte_cnt); + ASSERT(hdr_size + byte_cnt <= BBTOB(bp->b_length)); - if (byte_cnt < BBTOB(bp->b_length)) - xfs_buf_zero(bp, byte_cnt, - BBTOB(bp->b_length) - byte_cnt); + memcpy(buf + hdr_size, src, byte_cnt); + + if (byte_cnt + hdr_size < BBTOB(bp->b_length)) + xfs_buf_zero(bp, byte_cnt + hdr_size, + BBTOB(bp->b_length) - byte_cnt - hdr_size); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); @@ -442,9 +441,9 @@ xfs_attr_rmtval_set( src += byte_cnt; valuelen -= byte_cnt; offset += byte_cnt; - hdrcnt--; lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } ASSERT(valuelen == 0); return 0; -- cgit v1.2.2 From 58a72281555bf301f6dff24db2db205c87ef8db1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:04 +1000 Subject: xfs: correctly map remote attr buffers during removal If we don't map the buffers correctly (same as for get/set operations) then the incore buffer lookup will fail. If a block number matches but a length is wrong, then debug kernels will ASSERT fail in _xfs_buf_find() due to the length mismatch. Ensure that we map the buffers correctly by basing the length of the buffer on the attribute data length rather than the remote block count. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 6863ef8449f1908c19f43db572e4474f24a1e9da) --- fs/xfs/xfs_attr_remote.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index e207bf0004b6..d8bcb2d742d1 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -468,19 +468,25 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) mp = args->dp->i_mount; /* - * Roll through the "value", invalidating the attribute value's - * blocks. + * Roll through the "value", invalidating the attribute value's blocks. + * Note that args->rmtblkcnt is the minimum number of data blocks we'll + * see for a CRC enabled remote attribute. Each extent will have a + * header, and so we may have more blocks than we realise here. If we + * fail to map the blocks correctly, we'll have problems with the buffer + * lookups. */ lblkno = args->rmtblkno; - valuelen = args->rmtblkcnt; + valuelen = args->valuelen; + blkcnt = xfs_attr3_rmt_blocks(mp, valuelen); while (valuelen > 0) { + int dblkcnt; + /* * Try to remember where we decided to put the value. */ nmap = 1; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return(error); ASSERT(nmap == 1); @@ -488,28 +494,31 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); /* * If the "remote" value is in the cache, remove it. */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); + bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); xfs_buf_relse(bp); bp = NULL; } - valuelen -= map.br_blockcount; + valuelen -= XFS_ATTR3_RMT_BUF_SPACE(mp, + XFS_FSB_TO_B(mp, map.br_blockcount)); lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + blkcnt = max(blkcnt, xfs_attr3_rmt_blocks(mp, valuelen)); } /* * Keep de-allocating extents until the remote-value region is gone. */ + blkcnt = lblkno - args->rmtblkno; lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; done = 0; while (!done) { xfs_bmap_init(args->flist, args->firstblock); -- cgit v1.2.2 From 9e80c76205b46b338cb56c336148f54b2326342f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:05 +1000 Subject: xfs: fully initialise temp leaf in xfs_attr3_leaf_unbalance xfs_attr3_leaf_unbalance() uses a temporary buffer for recombining the entries in two leaves when the destination leaf requires compaction. The temporary buffer ends up being copied back over the original destination buffer, so the header in the temporary buffer needs to contain all the information that is in the destination buffer. To make sure the temporary buffer is fully initialised, once we've set up the temporary incore header appropriately, write is back to the temporary buffer before starting to move entries around. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 8517de2a81da830f5d90da66b4799f4040c76dc9) --- fs/xfs/xfs_attr_leaf.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 0bce1b348580..79ece72976ae 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2181,14 +2181,24 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_alloc(state->blocksize, KM_SLEEP); - memset(tmp_leaf, 0, state->blocksize); - memset(&tmphdr, 0, sizeof(tmphdr)); + tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP); + + /* + * Copy the header into the temp leaf so that all the stuff + * not in the incore header is present and gets copied back in + * once we've moved all the entries. + */ + memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf)); + memset(&tmphdr, 0, sizeof(tmphdr)); tmphdr.magic = savehdr.magic; tmphdr.forw = savehdr.forw; tmphdr.back = savehdr.back; tmphdr.firstused = state->blocksize; + + /* write the header to the temp buffer to initialise it */ + xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, -- cgit v1.2.2 From 634fd5322a3e6ae632dcf5f20eebc0583ba50838 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:06 +1000 Subject: xfs: fully initialise temp leaf in xfs_attr3_leaf_compact xfs_attr3_leaf_compact() uses a temporary buffer for compacting the the entries in a leaf. It copies the the original buffer into the temporary buffer, then zeros the original buffer completely. It then copies the entries back into the original buffer. However, the original buffer has not been correctly initialised, and so the movement of the entries goes horribly wrong. Make sure the zeroed destination buffer is fully initialised, and once we've set up the destination incore header appropriately, write is back to the buffer before starting to move entries around. While debugging this, the _d/_s prefixes weren't sufficient to remind me what buffer was what, so rename then all _src/_dst. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit d4c712bcf26a25c2b67c90e44e0b74c7993b5334) --- fs/xfs/xfs_attr_leaf.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 79ece72976ae..5b03d15b707b 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1445,11 +1445,12 @@ xfs_attr3_leaf_add_work( STATIC void xfs_attr3_leaf_compact( struct xfs_da_args *args, - struct xfs_attr3_icleaf_hdr *ichdr_d, + struct xfs_attr3_icleaf_hdr *ichdr_dst, struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - struct xfs_attr3_icleaf_hdr ichdr_s; + struct xfs_attr_leafblock *leaf_src; + struct xfs_attr_leafblock *leaf_dst; + struct xfs_attr3_icleaf_hdr ichdr_src; struct xfs_trans *trans = args->trans; struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; @@ -1457,29 +1458,38 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); - ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); memset(bp->b_addr, 0, XFS_LBSIZE(mp)); + leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_dst = bp->b_addr; /* - * Copy basic information + * Copy the on-disk header back into the destination buffer to ensure + * all the information in the header that is not part of the incore + * header structure is preserved. */ - leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->b_addr; - ichdr_s = *ichdr_d; /* struct copy */ - ichdr_d->firstused = XFS_LBSIZE(mp); - ichdr_d->usedbytes = 0; - ichdr_d->count = 0; - ichdr_d->holes = 0; - ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_s); - ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); + + /* Initialise the incore headers */ + ichdr_src = *ichdr_dst; /* struct copy */ + ichdr_dst->firstused = XFS_LBSIZE(mp); + ichdr_dst->usedbytes = 0; + ichdr_dst->count = 0; + ichdr_dst->holes = 0; + ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); + ichdr_dst->freemap[0].size = ichdr_dst->firstused - + ichdr_dst->freemap[0].base; + + + /* write the header back to initialise the underlying buffer */ + xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr3_leaf_moveents(leaf_s, &ichdr_s, 0, leaf_d, ichdr_d, 0, - ichdr_s.count, mp); + xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0, + ichdr_src.count, mp); /* * this logs the entire buffer, but the caller must write the header * back to the buffer when it is finished modifying it. -- cgit v1.2.2 From 7bc0dc271e494e12be3afd3c6431e5216347c624 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:08 +1000 Subject: xfs: rework remote attr CRCs Note: this changes the on-disk remote attribute format. I assert that this is OK to do as CRCs are marked experimental and the first kernel it is included in has not yet reached release yet. Further, the userspace utilities are still evolving and so anyone using this stuff right now is a developer or tester using volatile filesystems for testing this feature. Hence changing the format right now to save longer term pain is the right thing to do. The fundamental change is to move from a header per extent in the attribute to a header per filesytem block in the attribute. This means there are more header blocks and the parsing of the attribute data is slightly more complex, but it has the advantage that we always know the size of the attribute on disk based on the length of the data it contains. This is where the header-per-extent method has problems. We don't know the size of the attribute on disk without first knowing how many extents are used to hold it. And we can't tell from a mapping lookup, either, because remote attributes can be allocated contiguously with other attribute blocks and so there is no obvious way of determining the actual size of the atribute on disk short of walking and mapping buffers. The problem with this approach is that if we map a buffer incorrectly (e.g. we make the last buffer for the attribute data too long), we then get buffer cache lookup failure when we map it correctly. i.e. we get a size mismatch on lookup. This is not necessarily fatal, but it's a cache coherency problem that can lead to returning the wrong data to userspace or writing the wrong data to disk. And debug kernels will assert fail if this occurs. I found lots of niggly little problems trying to fix this issue on a 4k block size filesystem, finally getting it to pass with lots of fixes. The thing is, 1024 byte filesystems still failed, and it was getting really complex handling all the corner cases that were showing up. And there were clearly more that I hadn't found yet. It is complex, fragile code, and if we don't fix it now, it will be complex, fragile code forever more. Hence the simple fix is to add a header to each filesystem block. This gives us the same relationship between the attribute data length and the number of blocks on disk as we have without CRCs - it's a linear mapping and doesn't require us to guess anything. It is simple to implement, too - the remote block count calculated at lookup time can be used by the remote attribute set/get/remove code without modification for both CRC and non-CRC filesystems. The world becomes sane again. Because the copy-in and copy-out now need to iterate over each filesystem block, I moved them into helper functions so we separate the block mapping and buffer manupulations from the attribute data and CRC header manipulations. The code becomes much clearer as a result, and it is a lot easier to understand and debug. It also appears to be much more robust - once it worked on 4k block size filesystems, it has worked without failure on 1k block size filesystems, too. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit ad1858d77771172e08016890f0eb2faedec3ecee) --- fs/xfs/xfs_attr_leaf.c | 13 +- fs/xfs/xfs_attr_remote.c | 381 ++++++++++++++++++++++++++++------------------- fs/xfs/xfs_attr_remote.h | 10 ++ fs/xfs/xfs_buf.c | 1 + 4 files changed, 247 insertions(+), 158 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 5b03d15b707b..d788302e506a 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1412,7 +1412,7 @@ xfs_attr3_leaf_add_work( name_rmt->valuelen = 0; name_rmt->valueblk = 0; args->rmtblkno = 1; - args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -2354,8 +2354,9 @@ xfs_attr3_leaf_lookup_int( args->index = probe; args->valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, - args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks( + args->dp->i_mount, + args->valuelen); return XFS_ERROR(EEXIST); } } @@ -2406,7 +2407,8 @@ xfs_attr3_leaf_getvalue( ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; return 0; @@ -2732,7 +2734,8 @@ xfs_attr3_leaf_list_int( args.valuelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); + args.rmtblkcnt = xfs_attr3_rmt_blocks( + args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); if (retval) return retval; diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index d8bcb2d742d1..ef6b0c124528 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -47,7 +47,7 @@ * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ -static int +int xfs_attr3_rmt_blocks( struct xfs_mount *mp, int attrlen) @@ -59,12 +59,43 @@ xfs_attr3_rmt_blocks( return XFS_B_TO_FSB(mp, attrlen); } +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking, the unpacking function checks the + * attribute parameters and owner. + */ +static bool +xfs_attr3_rmt_hdr_ok( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (bno != be64_to_cpu(rmt->rm_blkno)) + return false; + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; +} + static bool xfs_attr3_rmt_verify( - struct xfs_buf *bp) + struct xfs_mount *mp, + void *ptr, + int fsbsize, + xfs_daddr_t bno) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; @@ -72,7 +103,9 @@ xfs_attr3_rmt_verify( return false; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) return false; - if (bp->b_bn != be64_to_cpu(rmt->rm_blkno)) + if (be64_to_cpu(rmt->rm_blkno) != bno) + return false; + if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) @@ -88,17 +121,40 @@ xfs_attr3_rmt_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + char *ptr; + int len; + bool corrupt = false; + xfs_daddr_t bno; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_RMT_CRC_OFF) || - !xfs_attr3_rmt_verify(bp)) { + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), + XFS_ATTR3_RMT_CRC_OFF)) { + corrupt = true; + break; + } + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + corrupt = true; + break; + } + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; + } + + if (corrupt) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); - } + } else + ASSERT(len == 0); } static void @@ -107,23 +163,39 @@ xfs_attr3_rmt_write_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_fspriv; + char *ptr; + int len; + xfs_daddr_t bno; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_attr3_rmt_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - return; - } + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + if (bip) { + struct xfs_attr3_rmt_hdr *rmt; - if (bip) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; - rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + rmt = (struct xfs_attr3_rmt_hdr *)ptr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF); + + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; } - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_RMT_CRC_OFF); + ASSERT(len == 0); } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { @@ -131,15 +203,16 @@ const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .verify_write = xfs_attr3_rmt_write_verify, }; -static int +STATIC int xfs_attr3_rmt_hdr_set( struct xfs_mount *mp, + void *ptr, xfs_ino_t ino, uint32_t offset, uint32_t size, - struct xfs_buf *bp) + xfs_daddr_t bno) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return 0; @@ -149,36 +222,107 @@ xfs_attr3_rmt_hdr_set( rmt->rm_bytes = cpu_to_be32(size); uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); rmt->rm_owner = cpu_to_be64(ino); - rmt->rm_blkno = cpu_to_be64(bp->b_bn); - bp->b_ops = &xfs_attr3_rmt_buf_ops; + rmt->rm_blkno = cpu_to_be64(bno); return sizeof(struct xfs_attr3_rmt_hdr); } /* - * Checking of the remote attribute header is split into two parts. the verifier - * does CRC, location and bounds checking, the unpacking function checks the - * attribute parameters and owner. + * Helper functions to copy attribute data in and out of the one disk extents */ -static bool -xfs_attr3_rmt_hdr_ok( - struct xfs_mount *mp, - xfs_ino_t ino, - uint32_t offset, - uint32_t size, - struct xfs_buf *bp) +STATIC int +xfs_attr_rmtval_copyout( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **dst) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + char *src = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); - if (offset != be32_to_cpu(rmt->rm_offset)) - return false; - if (size != be32_to_cpu(rmt->rm_bytes)) - return false; - if (ino != be64_to_cpu(rmt->rm_owner)) - return false; + ASSERT(len >= XFS_LBSIZE(mp)); - /* ok */ - return true; + while (len > 0 && *valuelen > 0) { + int hdr_size = 0; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min_t(int, *valuelen, byte_cnt); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, + byte_cnt, bno)) { + xfs_alert(mp, +"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", + bno, *offset, byte_cnt, ino); + return EFSCORRUPTED; + } + hdr_size = sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(*dst, src + hdr_size, byte_cnt); + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + src += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *dst += byte_cnt; + *offset += byte_cnt; + } + return 0; +} + +STATIC void +xfs_attr_rmtval_copyin( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **src) +{ + char *dst = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0 && *valuelen > 0) { + int hdr_size; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min(*valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, + byte_cnt, bno); + + memcpy(dst + hdr_size, *src, byte_cnt); + + /* + * If this is the last block, zero the remainder of it. + * Check that we are actually the last block, too. + */ + if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) { + ASSERT(*valuelen - byte_cnt == 0); + ASSERT(len == XFS_LBSIZE(mp)); + memset(dst + hdr_size + byte_cnt, 0, + XFS_LBSIZE(mp) - hdr_size - byte_cnt); + } + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + dst += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *src += byte_cnt; + *offset += byte_cnt; + } } /* @@ -192,13 +336,12 @@ xfs_attr_rmtval_get( struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; struct xfs_mount *mp = args->dp->i_mount; struct xfs_buf *bp; - xfs_daddr_t dblkno; xfs_dablk_t lblkno = args->rmtblkno; - void *dst = args->value; + char *dst = args->value; int valuelen = args->valuelen; int nmap; int error; - int blkcnt; + int blkcnt = args->rmtblkcnt; int i; int offset = 0; @@ -208,7 +351,6 @@ xfs_attr_rmtval_get( while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; - blkcnt = xfs_attr3_rmt_blocks(mp, valuelen); error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, blkcnt, map, &nmap, XFS_BMAPI_ATTRFORK); @@ -217,45 +359,29 @@ xfs_attr_rmtval_get( ASSERT(nmap >= 1); for (i = 0; (i < nmap) && (valuelen > 0); i++) { - int byte_cnt; - char *src; + xfs_daddr_t dblkno; + int dblkcnt; ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); - blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp, + dblkno, dblkcnt, 0, &bp, &xfs_attr3_rmt_buf_ops); if (error) return error; - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, BBTOB(bp->b_length)); - byte_cnt = min_t(int, valuelen, byte_cnt); - - src = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_attr3_rmt_hdr_ok(mp, args->dp->i_ino, - offset, byte_cnt, bp)) { - xfs_alert(mp, -"remote attribute header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", - offset, byte_cnt, args->dp->i_ino); - xfs_buf_relse(bp); - return EFSCORRUPTED; - - } - - src += sizeof(struct xfs_attr3_rmt_hdr); - } - - memcpy(dst, src, byte_cnt); + error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + &offset, &valuelen, + &dst); xfs_buf_relse(bp); + if (error) + return error; - offset += byte_cnt; - dst += byte_cnt; - valuelen -= byte_cnt; - + /* roll attribute extent map forwards */ lblkno += map[i].br_blockcount; + blkcnt -= map[i].br_blockcount; } } ASSERT(valuelen == 0); @@ -273,17 +399,13 @@ xfs_attr_rmtval_set( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; xfs_dablk_t lblkno; xfs_fileoff_t lfileoff = 0; - void *src = args->value; + char *src = args->value; int blkcnt; int valuelen; int nmap; int error; - int hdrcnt = 0; - bool crcs = xfs_sb_version_hascrc(&mp->m_sb); int offset = 0; trace_xfs_attr_rmtval_set(args); @@ -292,21 +414,14 @@ xfs_attr_rmtval_set( * Find a "hole" in the attribute address space large enough for * us to drop the new attribute's value into. Because CRC enable * attributes have headers, we can't just do a straight byte to FSB - * conversion. We calculate the worst case block count in this case - * and we may not need that many, so we have to handle this when - * allocating the blocks below. + * conversion and have to take the header space into account. */ blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); - error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) return error; - /* Start with the attribute data. We'll allocate the rest afterwards. */ - if (crcs) - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; @@ -349,31 +464,6 @@ xfs_attr_rmtval_set( (map.br_startblock != HOLESTARTBLOCK)); lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; - hdrcnt++; - - /* - * If we have enough blocks for the attribute data, calculate - * how many extra blocks we need for headers. We might run - * through this multiple times in the case that the additional - * headers in the blocks needed for the data fragments spills - * into requiring more blocks. e.g. for 512 byte blocks, we'll - * spill for another block every 9 headers we require in this - * loop. - * - * Note that this can result in contiguous allocation of blocks, - * so we don't use all the space we allocate for headers as we - * have one less header for each contiguous allocation that - * occurs in the map/write loop below. - */ - if (crcs && blkcnt == 0) { - int total_len; - - total_len = args->valuelen + - hdrcnt * sizeof(struct xfs_attr3_rmt_hdr); - blkcnt = XFS_B_TO_FSB(mp, total_len); - blkcnt -= args->rmtblkcnt; - args->rmtblkcnt += blkcnt; - } /* * Start the next trans in the chain. @@ -390,17 +480,15 @@ xfs_attr_rmtval_set( * the INCOMPLETE flag. */ lblkno = args->rmtblkno; - valuelen = args->valuelen; blkcnt = args->rmtblkcnt; + valuelen = args->valuelen; while (valuelen > 0) { - int byte_cnt; - int hdr_size; - int dblkcnt; - char *buf; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT(blkcnt > 0); - /* - * Try to remember where we decided to put the value. - */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, @@ -419,29 +507,17 @@ xfs_attr_rmtval_set( if (!bp) return ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; - buf = bp->b_addr; - - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, BBTOB(bp->b_length)); - byte_cnt = min_t(int, valuelen, byte_cnt); - hdr_size = xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, - byte_cnt, bp); - ASSERT(hdr_size + byte_cnt <= BBTOB(bp->b_length)); - memcpy(buf + hdr_size, src, byte_cnt); - - if (byte_cnt + hdr_size < BBTOB(bp->b_length)) - xfs_buf_zero(bp, byte_cnt + hdr_size, - BBTOB(bp->b_length) - byte_cnt - hdr_size); + xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, + &valuelen, &src); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) return error; - src += byte_cnt; - valuelen -= byte_cnt; - offset += byte_cnt; + /* roll attribute extent map forwards */ lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; } @@ -454,19 +530,17 @@ xfs_attr_rmtval_set( * out-of-line buffer that it is stored on. */ int -xfs_attr_rmtval_remove(xfs_da_args_t *args) +xfs_attr_rmtval_remove( + struct xfs_da_args *args) { - xfs_mount_t *mp; - xfs_bmbt_irec_t map; - xfs_buf_t *bp; - xfs_daddr_t dblkno; - xfs_dablk_t lblkno; - int valuelen, blkcnt, nmap, error, done, committed; + struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; + int done; trace_xfs_attr_rmtval_remove(args); - mp = args->dp->i_mount; - /* * Roll through the "value", invalidating the attribute value's blocks. * Note that args->rmtblkcnt is the minimum number of data blocks we'll @@ -476,10 +550,13 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) * lookups. */ lblkno = args->rmtblkno; - valuelen = args->valuelen; - blkcnt = xfs_attr3_rmt_blocks(mp, valuelen); - while (valuelen > 0) { - int dblkcnt; + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + int nmap; /* * Try to remember where we decided to put the value. @@ -506,21 +583,19 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) bp = NULL; } - valuelen -= XFS_ATTR3_RMT_BUF_SPACE(mp, - XFS_FSB_TO_B(mp, map.br_blockcount)); - lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; - blkcnt = max(blkcnt, xfs_attr3_rmt_blocks(mp, valuelen)); } /* * Keep de-allocating extents until the remote-value region is gone. */ - blkcnt = lblkno - args->rmtblkno; lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; done = 0; while (!done) { + int committed; + xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h index c7cca60a062a..92a8fd7977cc 100644 --- a/fs/xfs/xfs_attr_remote.h +++ b/fs/xfs/xfs_attr_remote.h @@ -20,6 +20,14 @@ #define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ +/* + * There is one of these headers per filesystem block in a remote attribute. + * This is done to ensure there is a 1:1 mapping between the attribute value + * length and the number of blocks needed to store the attribute. This makes the + * verification of a buffer a little more complex, but greatly simplifies the + * allocation, reading and writing of these attributes as we don't have to guess + * the number of blocks needed to store the attribute data. + */ struct xfs_attr3_rmt_hdr { __be32 rm_magic; __be32 rm_offset; @@ -39,6 +47,8 @@ struct xfs_attr3_rmt_hdr { extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; +int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); + int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0d2554299688..1b2472a46e46 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -513,6 +513,7 @@ _xfs_buf_find( xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", __func__, blkno, eofs); + WARN_ON(1); return NULL; } -- cgit v1.2.2 From 5d477b6079619910dab882fa229cce1f14f86cf8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 10 May 2013 14:04:11 +0200 Subject: vfs: Fix invalid ida_remove() call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the group id of a shared mount is not allocated, the umount still tries to call mnt_release_group_id(), which eventually hits a kernel warning at ida_remove() spewing a message like: ida_remove called for id=0 which is not allocated. This patch fixes the bug simply checking the group id in the caller. Reported-by: Cristian Rodríguez Signed-off-by: Takashi Iwai Signed-off-by: Al Viro --- fs/pnode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pnode.c b/fs/pnode.c index 3d2a7141b87a..9af0df15256e 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -83,7 +83,8 @@ static int do_make_slave(struct mount *mnt) if (peer_mnt == mnt) peer_mnt = NULL; } - if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) + if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) && + list_empty(&mnt->mnt_share)) mnt_release_group_id(mnt); list_del_init(&mnt->mnt_share); -- cgit v1.2.2 From 1d7095c72d35eee4ebc28e66563e636b9adafeb2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 May 2013 15:21:56 -0400 Subject: qnx6: qnx6_readdir() has a braino in pos calculation We want to mask lower 5 bits out, not leave only those and clear the rest... As it is, we end up always starting to read from the beginning of directory, no matter what the current position had been. Signed-off-by: Al Viro --- fs/qnx6/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 8798d065e400..afa6be6fc397 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -120,7 +120,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = file_inode(filp); struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1); + loff_t pos = filp->f_pos & ~(QNX6_DIR_ENTRY_SIZE - 1); unsigned long npages = dir_pages(inode); unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; -- cgit v1.2.2 From 31abdab9c11bb1694ecd1476a7edbe8e964d94ac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 May 2013 02:38:52 -0400 Subject: hpfs: deadlock and race in directory lseek() For one thing, there's an ABBA deadlock on hpfs fs-wide lock and i_mutex in hpfs_dir_lseek() - there's a lot of methods that grab the former with the caller already holding the latter, so it must take i_mutex first. For another, locking the damn thing, carefully validating the offset, then dropping locks and assigning the offset is obviously racy. Moreover, we _must_ do hpfs_add_pos(), or the machinery in dnode.c won't modify the sucker on B-tree surgeries. Signed-off-by: Al Viro --- fs/hpfs/dir.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 546f6d39713a..834ac13c04b7 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,25 +33,27 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; + mutex_lock(&i->i_mutex); hpfs_lock(s); /*printk("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; - mutex_lock(&i->i_mutex); pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; while (pos != new_off) { if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); else goto fail; if (pos == 12) goto fail; } - mutex_unlock(&i->i_mutex); + hpfs_add_pos(i, &filp->f_pos); ok: + filp->f_pos = new_off; hpfs_unlock(s); - return filp->f_pos = new_off; -fail: mutex_unlock(&i->i_mutex); + return new_off; +fail: /*printk("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); + mutex_unlock(&i->i_mutex); return -ESPIPE; } -- cgit v1.2.2 From 448293aadb54ab38b9c053bf9f1eecafdc0ed214 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 22 May 2013 13:41:26 -0400 Subject: befs_readdir(): do not increment ->f_pos if filldir tells us to stop Signed-off-by: Al Viro --- fs/befs/linuxvfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 8615ee89ab55..f95dddced968 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -265,8 +265,8 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir) result = filldir(dirent, keybuf, keysize, filp->f_pos, (ino_t) value, d_type); } - - filp->f_pos++; + if (!result) + filp->f_pos++; befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos); -- cgit v1.2.2 From 0bdc7acba56a7ca4232f15f37b16f7ec079385ab Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 31 May 2013 15:07:52 -0400 Subject: reiserfs: fix spurious multiple-fill in reiserfs_readdir_dentry After sleeping for filldir(), we check to see if the file system has changed and research. The next_pos pointer is updated but its value isn't pushed into the key used for the search itself. As a result, the search returns the same item that the last cycle of the loop did and filldir() is called multiple times with the same data. The end result is that the buffer can contain the same name multiple times. This can be returned to userspace or used internally in the xattr code where it can manifest with the following warning: jdm-20004 reiserfs_delete_xattrs: Couldn't delete all xattrs (-2) reiserfs_for_each_xattr uses reiserfs_readdir_dentry to iterate over the xattr names and ends up trying to unlink the same name twice. The second attempt fails with -ENOENT and the error is returned. At some point I'll need to add support into reiserfsck to remove the orphaned directories left behind when this occurs. The fix is to push the value into the key before researching. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara --- fs/reiserfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 66c53b642a88..6c2d136561cb 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -204,6 +204,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, next_pos = deh_offset(deh) + 1; if (item_moved(&tmp_ih, &path_to_entry)) { + set_cpu_key_k_offset(&pos_key, + next_pos); goto research; } } /* for */ -- cgit v1.2.2 From 4a8570112b76a63ad21cfcbe2783f98f7fd5ba1b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 31 May 2013 15:54:17 -0400 Subject: reiserfs: fix problems with chowning setuid file w/ xattrs reiserfs_chown_xattrs() takes the iattr struct passed into ->setattr and uses it to iterate over all the attrs associated with a file to change ownership of xattrs (and transfer quota associated with the xattr files). When the setuid bit is cleared during chown, ATTR_MODE and iattr->ia_mode are passed to all the xattrs as well. This means that the xattr directory will have S_IFREG added to its mode bits. This has been prevented in practice by a missing IS_PRIVATE check in reiserfs_acl_chmod, which caused a double-lock to occur while holding the write lock. Since the file system was completely locked up, the writeout of the corrupted mode never happened. This patch temporarily clears everything but ATTR_UID|ATTR_GID for the calls to reiserfs_setattr and adds the missing IS_PRIVATE check. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara --- fs/reiserfs/xattr.c | 14 +++++++++++++- fs/reiserfs/xattr_acl.c | 3 +++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4cce1d9552fb..821bcf70e467 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -318,7 +318,19 @@ static int delete_one_xattr(struct dentry *dentry, void *data) static int chown_one_xattr(struct dentry *dentry, void *data) { struct iattr *attrs = data; - return reiserfs_setattr(dentry, attrs); + int ia_valid = attrs->ia_valid; + int err; + + /* + * We only want the ownership bits. Otherwise, we'll do + * things like change a directory to a regular file if + * ATTR_MODE is set. + */ + attrs->ia_valid &= (ATTR_UID|ATTR_GID); + err = reiserfs_setattr(dentry, attrs); + attrs->ia_valid = ia_valid; + + return err; } /* No i_mutex, but the inode is unconnected. */ diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d7c01ef64eda..6c8767fdfc6a 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -443,6 +443,9 @@ int reiserfs_acl_chmod(struct inode *inode) int depth; int error; + if (IS_PRIVATE(inode)) + return 0; + if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; -- cgit v1.2.2 From a1457c0ce976bad1356b9b0437f2a5c3ab8a9cfc Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 31 May 2013 15:51:17 -0400 Subject: reiserfs: fix deadlock with nfs racing on create/lookup Reiserfs is currently able to be deadlocked by having two NFS clients where one has removed and recreated a file and another is accessing the file with an open file handle. If one client deletes and recreates a file with timing such that the recreated file obtains the same [dirid, objectid] pair as the original file while another client accesses the file via file handle, the create and lookup can race and deadlock if the lookup manages to create the in-memory inode first. The create thread, in insert_inode_locked4, will hold the write lock while waiting on the other inode to be unlocked. The lookup thread, anywhere in the iget path, will release and reacquire the write lock while it schedules. If it needs to reacquire the lock while the create thread has it, it will never be able to make forward progress because it needs to reacquire the lock before ultimately unlocking the inode. This patch drops the write lock across the insert_inode_locked4 call so that the ordering of inode_wait -> write lock is retained. Since this would have been the case before the BKL push-down, this is safe. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara --- fs/reiserfs/inode.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 77d6d47abc83..f844533792ee 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1811,11 +1811,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); - if (insert_inode_locked4(inode, args.objectid, - reiserfs_find_actor, &args) < 0) { + + reiserfs_write_unlock(inode->i_sb); + err = insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args); + reiserfs_write_lock(inode->i_sb); + if (err) { err = -EINVAL; goto out_bad_inode; } + if (old_format_only(sb)) /* not a perfect generation count, as object ids can be reused, but ** this is as good as reiserfs can do right now. -- cgit v1.2.2 From 1fc29bacedeabb278080e31bb9c1ecb49f143c3b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 31 May 2013 10:00:18 -0400 Subject: cifs: fix off-by-one bug in build_unc_path_to_root commit 839db3d10a (cifs: fix up handling of prefixpath= option) changed the code such that the vol->prepath no longer contained a leading delimiter and then fixed up the places that accessed that field to account for that change. One spot in build_unc_path_to_root was missed however. When doing the pointer addition on pos, that patch failed to account for the fact that we had already incremented "pos" by one when adding the length of the prepath. This caused a buffer overrun by one byte. This patch fixes the problem by correcting the handling of "pos". Cc: # v3.8+ Reported-by: Marcus Moeller Reported-by: Ken Fallon Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5b97e56ddbca..e3bc39bb9d12 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3279,8 +3279,8 @@ build_unc_path_to_root(const struct smb_vol *vol, pos = full_path + unc_len; if (pplen) { - *pos++ = CIFS_DIR_SEP(cifs_sb); - strncpy(pos, vol->prepath, pplen); + *pos = CIFS_DIR_SEP(cifs_sb); + strncpy(pos + 1, vol->prepath, pplen); pos += pplen; } -- cgit v1.2.2 From 28420dad233520811c0e0860e7fb4975ed863fc4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 3 Jun 2013 14:40:22 +0200 Subject: fuse: fix readdirplus Oops in fuse_dentry_revalidate Fix bug introduced by commit 4582a4ab2a "FUSE: Adapt readdirplus to application usage patterns". We need to check for a positive dentry; negative dentries are not added by readdirplus. Secondly we need to advise the use of readdirplus on the *parent*, otherwise the whole thing is useless. Thirdly all this is only relevant if "readdirplus_auto" mode is selected by the filesystem. We advise the use of readdirplus only if the dentry was still valid. If we had to redo the lookup then there was no use in doing the -plus version. Reported-by: Bernd Schubert Signed-off-by: Miklos Szeredi CC: Feng Shuo CC: stable@vger.kernel.org --- fs/fuse/dir.c | 12 +++++++++--- fs/fuse/inode.c | 7 ++++--- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 254df56b847b..f3f783dc4f75 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -180,6 +180,8 @@ u64 fuse_get_attr_version(struct fuse_conn *fc) static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; + struct dentry *parent; + struct fuse_conn *fc; inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) @@ -187,10 +189,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) else if (fuse_dentry_time(entry) < get_jiffies_64()) { int err; struct fuse_entry_out outarg; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_forget_link *forget; - struct dentry *parent; u64 attr_version; /* For negative dentries, always do a fresh lookup */ @@ -241,8 +241,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) entry_attr_timeout(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); + } else if (inode) { + fc = get_fuse_conn(inode); + if (fc->readdirplus_auto) { + parent = dget_parent(entry); + fuse_advise_use_readdirplus(parent->d_inode); + dput(parent); + } } - fuse_advise_use_readdirplus(inode); return 1; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6201f81e4d3a..9a0cdde14a08 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -867,10 +867,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->dont_mask = 1; if (arg->flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; - if (arg->flags & FUSE_DO_READDIRPLUS) + if (arg->flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; - if (arg->flags & FUSE_READDIRPLUS_AUTO) - fc->readdirplus_auto = 1; + if (arg->flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; + } if (arg->flags & FUSE_ASYNC_DIO) fc->async_dio = 1; } else { -- cgit v1.2.2 From c9ecf989cc7626e9edf8abef79f64b909542129b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 30 May 2013 15:35:50 -0400 Subject: fuse: return -EIOCBQUEUED from fuse_direct_IO() for all async requests If request submission fails for an async request (i.e., get_user_pages() returns -ERESTARTSYS), we currently skip the -EIOCBQUEUED return and drop into wait_for_sync_kiocb() forever. Avoid this by always returning -EIOCBQUEUED for async requests. If an error occurs, the error is passed into fuse_aio_complete(), returned via aio_complete() and thus propagated to userspace via io_getevents(). Signed-off-by: Brian Foster Reviewed-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d9f467907791..b3ad8d61a162 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2432,7 +2432,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, fuse_aio_complete(io, ret < 0 ? ret : 0, -1); /* we have a non-extending, async request, so return */ - if (ret > 0 && !is_sync_kiocb(iocb)) + if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; ret = wait_on_sync_kiocb(iocb); -- cgit v1.2.2 From e5c5f05dca0cf90f0f3bb1aea85dcf658baff185 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 30 May 2013 16:41:34 +0400 Subject: fuse: fix alignment in short read optimization for async_dio The bug was introduced with async_dio feature: trying to optimize short reads, we cut number-of-bytes-to-read to i_size boundary. Hence the following example: truncate --size=300 /mnt/file dd if=/mnt/file of=/dev/null iflag=direct led to FUSE_READ request of 300 bytes size. This turned out to be problem for userspace fuse implementations who rely on assumption that kernel fuse does not change alignment of request from client FS. The patch turns off the optimization if async_dio is disabled. And, if it's enabled, the patch fixes adjustment of number-of-bytes-to-read to preserve alignment. Note, that we cannot throw out short read optimization entirely because otherwise a direct read of a huge size issued on a tiny file would generate a huge amount of fuse requests and most of them would be ACKed by userspace with zero bytes read. Signed-off-by: Maxim Patlasov Reviewed-by: Brian Foster Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b3ad8d61a162..e570081f9f76 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2373,6 +2373,11 @@ static void fuse_do_truncate(struct file *file) fuse_do_setattr(inode, &attr, file); } +static inline loff_t fuse_round_up(loff_t off) +{ + return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); +} + static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -2380,6 +2385,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + bool async_dio = ff->fc->async_dio; loff_t pos = 0; struct inode *inode; loff_t i_size; @@ -2391,10 +2397,10 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, i_size = i_size_read(inode); /* optimization for short read */ - if (rw != WRITE && offset + count > i_size) { + if (async_dio && rw != WRITE && offset + count > i_size) { if (offset >= i_size) return 0; - count = i_size - offset; + count = min_t(loff_t, count, fuse_round_up(i_size - offset)); } io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); @@ -2412,7 +2418,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ - io->async = ff->fc->async_dio; + io->async = async_dio; io->iocb = iocb; /* @@ -2420,7 +2426,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * to wait on real async I/O requests, so we must submit this request * synchronously. */ - if (!is_sync_kiocb(iocb) && (offset + count > i_size)) + if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) io->async = false; if (rw == WRITE) -- cgit v1.2.2 From 4a586812055dbd2588b0836ab758f6b9670c3949 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 24 May 2013 15:02:49 -0400 Subject: GFS2: Set log descriptor type for jdata blocks This patch sets the log descriptor type according to whether the journal commit is for (journaled) data or metadata. This was recently broken when the functions to process data and metadata log ops were combined. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/lops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 68b4c8f1fce8..6c33d7b6e0c4 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -419,7 +419,9 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, if (total > limit) num = limit; gfs2_log_unlock(sdp); - page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num); + page = gfs2_get_log_desc(sdp, + is_databuf ? GFS2_LOG_DESC_JDATA : + GFS2_LOG_DESC_METADATA, num + 1, num); ld = page_address(page); gfs2_log_lock(sdp); ptr = (__be64 *)(ld + 1); -- cgit v1.2.2 From 2b3dcf35810ff02ad0e785527a25c1b13bf82b19 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 28 May 2013 10:04:44 -0400 Subject: GFS2: Increase i_writecount during gfs2_setattr_size This patch calls get_write_access in a few functions. This merely increases inode->i_writecount for the duration of the function. That will ensure that any file closes won't delete the inode's multi-block reservation while the function is running. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 17 +++++++++++++---- fs/gfs2/file.c | 19 +++++++++++++------ fs/gfs2/rgrp.c | 4 +++- 3 files changed, 29 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1dc9a13ce6bb..93b5809c20bb 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1286,17 +1286,26 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) if (ret) return ret; + ret = get_write_access(inode); + if (ret) + return ret; + inode_dio_wait(inode); ret = gfs2_rs_alloc(GFS2_I(inode)); if (ret) - return ret; + goto out; oldsize = inode->i_size; - if (newsize >= oldsize) - return do_grow(inode, newsize); + if (newsize >= oldsize) { + ret = do_grow(inode, newsize); + goto out; + } - return do_shrink(inode, oldsize, newsize); + ret = do_shrink(inode, oldsize, newsize); +out: + put_write_access(inode); + return ret; } int gfs2_truncatei_resume(struct gfs2_inode *ip) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index acd16764b133..ad0dc38d87ab 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -402,16 +402,20 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* Update file times before taking page lock */ file_update_time(vma->vm_file); + ret = get_write_access(inode); + if (ret) + goto out; + ret = gfs2_rs_alloc(ip); if (ret) - return ret; + goto out_write_access; gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) - goto out; + goto out_uninit; set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); @@ -480,12 +484,15 @@ out_quota_unlock: gfs2_quota_unlock(ip); out_unlock: gfs2_glock_dq(&gh); -out: +out_uninit: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); wait_for_stable_page(page); } +out_write_access: + put_write_access(inode); +out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); } @@ -594,10 +601,10 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if ((file->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1)) - gfs2_rs_delete(ip); + if (!(file->f_mode & FMODE_WRITE)) + return 0; + gfs2_rs_delete(ip); return 0; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5232525934ae..9809156e3d04 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -638,8 +638,10 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) */ void gfs2_rs_delete(struct gfs2_inode *ip) { + struct inode *inode = &ip->i_inode; + down_write(&ip->i_rw_mutex); - if (ip->i_res) { + if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) { gfs2_rs_deltree(ip->i_res); BUG_ON(ip->i_res->rs_free); kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); -- cgit v1.2.2 From e8830d8856e3ad61067dd46c05438b0d75a0441a Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 30 May 2013 09:48:56 -0400 Subject: GFS2: Fall back to vmalloc if kmalloc fails for dir hash tables This version has one more correction: the vmalloc calls are replaced by __vmalloc calls to preserve the GFP_NOFS flag. When GFS2's directory management code allocates buffers for a directory hash table, if it can't get the memory it needs, it currently gives a bad return code. Rather than giving an error, this patch allows it to use virtual memory rather than kernel memory for the hash table. This should make it possible for directories to function properly, even when kernel memory becomes very fragmented. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3e82bd23179..b631c9043460 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,22 +354,31 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) return ERR_PTR(-EIO); } - hc = kmalloc(hsize, GFP_NOFS); - ret = -ENOMEM; + hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); + if (hc == NULL) + hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + if (hc == NULL) return ERR_PTR(-ENOMEM); ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { - kfree(hc); + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); return ERR_PTR(ret); } spin_lock(&inode->i_lock); - if (ip->i_hash_cache) - kfree(hc); - else + if (ip->i_hash_cache) { + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); + } else { ip->i_hash_cache = hc; + } spin_unlock(&inode->i_lock); return ip->i_hash_cache; @@ -385,7 +394,10 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip) { __be64 *hc = ip->i_hash_cache; ip->i_hash_cache = NULL; - kfree(hc); + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); } static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) @@ -1113,7 +1125,10 @@ static int dir_double_exhash(struct gfs2_inode *dip) if (IS_ERR(hc)) return PTR_ERR(hc); - h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS); + h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); + if (hc2 == NULL) + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + if (!hc2) return -ENOMEM; @@ -1145,7 +1160,10 @@ fail: gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); out_kfree: - kfree(hc2); + if (is_vmalloc_addr(hc2)) + vfree(hc2); + else + kfree(hc2); return error; } @@ -1846,6 +1864,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); ht = kzalloc(size, GFP_NOFS); + if (ht == NULL) + ht = vzalloc(size); if (!ht) return -ENOMEM; @@ -1933,7 +1953,10 @@ out_rlist: gfs2_rlist_free(&rlist); gfs2_quota_unhold(dip); out: - kfree(ht); + if (is_vmalloc_addr(ht)) + vfree(ht); + else + kfree(ht); return error; } -- cgit v1.2.2 From a6a4d98b0124b5d3befe8b3a99f51f1b4fcc6dcf Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 29 May 2013 11:51:52 -0400 Subject: GFS2: Don't cache iopen glocks This patch makes GFS2 immediately reclaim/delete all iopen glocks as soon as they're dequeued. This allows deleters to get an EXclusive lock on iopen so files are deleted properly instead of being set as unlinked. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 1 + fs/gfs2/super.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8833a4f264e3..62b484e4a9e4 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -189,6 +189,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, return inode; fail_refresh: + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; ip->i_iopen_gh.gh_gl->gl_object = NULL; gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_iopen: diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 917c8e1eb4ae..e5639dec66c4 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1444,6 +1444,7 @@ static void gfs2_evict_inode(struct inode *inode) /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; } @@ -1514,8 +1515,10 @@ out_unlock: if (gfs2_rs_active(ip->i_res)) gfs2_rs_deltree(ip->i_res); - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&ip->i_iopen_gh); + } gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED && error != -EROFS) @@ -1534,6 +1537,7 @@ out: ip->i_gl = NULL; if (ip->i_iopen_gh.gh_gl) { ip->i_iopen_gh.gh_gl->gl_object = NULL; + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); } } -- cgit v1.2.2 From bc5abcf7e411b889f73ea2a90439071a0f451011 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 4 Jun 2013 10:24:56 -0700 Subject: eCryptfs: Check return of filemap_write_and_wait during fsync Error out of ecryptfs_fsync() if filemap_write_and_wait() fails. Signed-off-by: Tyler Hicks Cc: Paul Taysom Cc: Olof Johansson Cc: stable@vger.kernel.org # v3.6+ --- fs/ecryptfs/file.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 16f509d6fa49..a7abbea2c096 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -295,7 +295,12 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - filemap_write_and_wait(file->f_mapping); + int rc; + + rc = filemap_write_and_wait(file->f_mapping); + if (rc) + return rc; + return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } -- cgit v1.2.2 From bb9b8e86ad083ecb2567ae909c1d6cb0bbaa60fe Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 3 Jun 2013 15:28:46 +1000 Subject: xfs: rework dquot CRCs Calculating dquot CRCs when the backing buffer is written back just doesn't work reliably. There are several places which manipulate dquots directly in the buffers, and they don't calculate CRCs appropriately, nor do they always set the buffer up to calculate CRCs appropriately. Firstly, if we log a dquot buffer (e.g. during allocation) it gets logged without valid CRC, and so on recovery we end up with a dquot that is not valid. Secondly, if we recover/repair a dquot, we don't have a verifier attached to the buffer and hence CRCs are not calculated on the way down to disk. Thirdly, calculating the CRC after we've changed the contents means that if we re-read the dquot from the buffer, we cannot verify the contents of the dquot are valid, as the CRC is invalid. So, to avoid all the dquot CRC errors that are being detected by the read verifier, change to using the same model as for inodes. That is, dquot CRCs are calculated and written to the backing buffer at the time the dquot is flushed to the backing buffer. If we modify the dquot directly in the backing buffer, calculate the CRC immediately after the modification is complete. Hence the dquot in the on-disk buffer should always have a valid CRC. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 6fcdc59de28817d1fbf1bd58cc01f4f3fac858fb) --- fs/xfs/xfs_dquot.c | 37 ++++++++++++++++--------------------- fs/xfs/xfs_log_recover.c | 10 ++++++++++ fs/xfs/xfs_qm.c | 40 ++++++++++++++++++++++++++++++---------- fs/xfs/xfs_quota.h | 2 ++ 4 files changed, 58 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a41f8bf1da37..044e97a33c8d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -249,8 +249,11 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } xfs_trans_dquot_buf(tp, bp, @@ -286,23 +289,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; } -STATIC void -xfs_dquot_buf_calc_crc( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - int i; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) { - xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), - offsetof(struct xfs_dqblk, dd_crc)); - } -} - STATIC bool xfs_dquot_buf_verify_crc( struct xfs_mount *mp, @@ -328,12 +314,11 @@ xfs_dquot_buf_verify_crc( for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), - offsetof(struct xfs_dqblk, dd_crc))) + XFS_DQUOT_CRC_OFF)) return false; if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) return false; } - return true; } @@ -393,6 +378,11 @@ xfs_dquot_buf_read_verify( } } +/* + * we don't calculate the CRC here as that is done when the dquot is flushed to + * the buffer after the update is done. This ensures that the dquot in the + * buffer always has an up-to-date CRC value. + */ void xfs_dquot_buf_write_verify( struct xfs_buf *bp) @@ -404,7 +394,6 @@ xfs_dquot_buf_write_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); return; } - xfs_dquot_buf_calc_crc(mp, bp); } const struct xfs_buf_ops xfs_dquot_buf_ops = { @@ -1151,11 +1140,17 @@ xfs_qm_dqflush( * copy the lsn into the on-disk dquot now while we have the in memory * dquot here. This can't be done later in the write verifier as we * can't get access to the log item at that point in time. + * + * We also calculate the CRC here so that the on-disk dquot in the + * buffer always has a valid CRC. This ensures there is no possibility + * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); } /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index d9e4d3c3991a..d6204d1ac47f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2266,6 +2266,12 @@ xfs_qm_dqcheck( d->dd_diskdq.d_flags = type; d->dd_diskdq.d_id = cpu_to_be32(id); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + return errs; } @@ -2793,6 +2799,10 @@ xlog_recover_dquot_pass2( } memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f41702b43003..b75c9bb6e71e 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -41,6 +41,7 @@ #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_cksum.h" /* * The global quota manager. There is only one of these for the entire @@ -839,7 +840,7 @@ xfs_qm_reset_dqcounts( xfs_dqid_t id, uint type) { - xfs_disk_dquot_t *ddq; + struct xfs_dqblk *dqb; int j; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -853,8 +854,12 @@ xfs_qm_reset_dqcounts( do_div(j, sizeof(xfs_dqblk_t)); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif - ddq = bp->b_addr; + dqb = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + struct xfs_disk_dquot *ddq; + + ddq = (struct xfs_disk_dquot *)&dqb[j]; + /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to @@ -871,7 +876,12 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; - ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)&dqb[j], + sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } } @@ -907,19 +917,29 @@ xfs_qm_dqiter_bufs( XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); - if (error) - break; /* - * XXX(hch): need to figure out if it makes sense to validate - * the CRC here. + * CRC and validation errors will return a EFSCORRUPTED here. If + * this occurs, re-read without CRC validation so that we can + * repair the damage via xfs_qm_reset_dqcounts(). This process + * will leave a trace in the log indicating corruption has + * been detected. */ + if (error == EFSCORRUPTED) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + NULL); + } + + if (error) + break; + xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); - /* - * goto the next block. - */ + + /* goto the next block. */ bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index c61e31c7d997..c38068f26c55 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -87,6 +87,8 @@ typedef struct xfs_dqblk { uuid_t dd_uuid; /* location information */ } xfs_dqblk_t; +#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) + /* * flags for q_flags field in the dquot. */ -- cgit v1.2.2 From ea929536a43226a01d1a73ac8b14d52e81163bd4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 3 Jun 2013 15:28:49 +1000 Subject: xfs: fix remote attribute invalidation for a leaf When invalidating an attribute leaf block block, there might be remote attributes that it points to. With the recent rework of the remote attribute format, we have to make sure we calculate the length of the attribute correctly. We aren't doing that in xfs_attr3_leaf_inactive(), so fix it. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit 59913f14dfe8eb772ff93eb442947451b4416329) --- fs/xfs/xfs_attr_leaf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d788302e506a..31d3cd129269 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -3258,7 +3258,7 @@ xfs_attr3_leaf_inactive( name_rmt = xfs_attr3_leaf_name_remote(leaf, i); if (name_rmt->valueblk) { lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = XFS_B_TO_FSB(dp->i_mount, + lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, be32_to_cpu(name_rmt->valuelen)); lp++; } -- cgit v1.2.2 From 75406170751b4de88a01f73dda56efa617ddd5d7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 5 Jun 2013 12:09:07 +1000 Subject: xfs: fix log recovery transaction item reordering There are several constraints that inode allocation and unlink logging impose on log recovery. These all stem from the fact that inode alloc/unlink are logged in buffers, but all other inode changes are logged in inode items. Hence there are ordering constraints that recovery must follow to ensure the correct result occurs. As it turns out, this ordering has been working mostly by chance than good management. The existing code moves all buffers except cancelled buffers to the head of the list, and everything else to the tail of the list. The problem with this is that is interleaves inode items with the buffer cancellation items, and hence whether the inode item in an cancelled buffer gets replayed is essentially left to chance. Further, this ordering causes problems for log recovery when inode CRCs are enabled. It typically replays the inode unlink buffer long before it replays the inode core changes, and so the CRC recorded in an unlink buffer is going to be invalid and hence any attempt to validate the inode in the buffer is going to fail. Hence we really need to enforce the ordering that the inode alloc/unlink code has expected log recovery to have since inode chunk de-allocation was introduced back in 2003... Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit a775ad778073d55744ed6709ccede36310638911) --- fs/xfs/xfs_log_recover.c | 65 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index d6204d1ac47f..83088d96e6c4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1599,10 +1599,43 @@ xlog_recover_add_to_trans( } /* - * Sort the log items in the transaction. Cancelled buffers need - * to be put first so they are processed before any items that might - * modify the buffers. If they are cancelled, then the modifications - * don't need to be replayed. + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * form the cancelled buffer table. Hence they have tobe done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers */ STATIC int xlog_recover_reorder_trans( @@ -1612,6 +1645,10 @@ xlog_recover_reorder_trans( { xlog_recover_item_t *item, *n; LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { @@ -1619,12 +1656,18 @@ xlog_recover_reorder_trans( switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + if (buf_f->blf_flags & XFS_BLF_CANCEL) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); - list_move(&item->ri_list, &trans->r_itemq); + list_move(&item->ri_list, &cancel_list); break; } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); + break; + } + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: @@ -1632,7 +1675,7 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); - list_move_tail(&item->ri_list, &trans->r_itemq); + list_move_tail(&item->ri_list, &inode_list); break; default: xfs_warn(log->l_mp, @@ -1643,6 +1686,14 @@ xlog_recover_reorder_trans( } } ASSERT(list_empty(&sort_list)); + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); return 0; } -- cgit v1.2.2 From ad868afddb908a5d4015c6b7637721b48fb9c8f9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 5 Jun 2013 12:09:08 +1000 Subject: xfs: inode unlinked list needs to recalculate the inode CRC The inode unlinked list manipulations operate directly on the inode buffer, and so bypass the inode CRC calculation mechanisms. Hence an inode on the unlinked list has an invalid CRC. Fix this by recalculating the CRC whenever we modify an unlinked list pointer in an inode, ncluding during log recovery. This is trivial to do and results in unlinked list operations always leaving a consistent inode in the buffer. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit 0a32c26e720a8b38971d0685976f4a7d63f9e2ef) --- fs/xfs/xfs_inode.c | 16 ++++++++++++++++ fs/xfs/xfs_log_recover.c | 9 +++++++++ 2 files changed, 25 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index efbe1accb6ca..7f7be5f98f52 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1638,6 +1638,10 @@ xfs_iunlink( dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1723,6 +1727,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1796,6 +1804,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1809,6 +1821,10 @@ xfs_iunlink_remove( last_dip->di_next_unlinked = cpu_to_be32(next_agino); ASSERT(next_agino != 0); offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, last_dip); + xfs_trans_inode_buf(tp, last_ibp); xfs_trans_log_buf(tp, last_ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 83088d96e6c4..45a85ff84da1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1912,6 +1912,15 @@ xlog_recover_do_inode_buffer( buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + } return 0; -- cgit v1.2.2 From f763fd440e094be37b38596ee14f1d64caa9bf9c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 5 Jun 2013 12:09:09 +1000 Subject: xfs: disable noattr2/attr2 mount options for CRC enabled filesystems attr2 format is always enabled for v5 superblock filesystems, so the mount options to enable or disable it need to be cause mount errors. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit d3eaace84e40bf946129e516dcbd617173c1cf14) --- fs/xfs/xfs_super.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ea341cea68cb..3033ba5e9762 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1372,6 +1372,17 @@ xfs_finish_flags( } } + /* + * V5 filesystems always use attr2 format for attributes. + */ + if (xfs_sb_version_hascrc(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_warn(mp, +"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", + MNTOPT_NOATTR2, MNTOPT_ATTR2); + return XFS_ERROR(EINVAL); + } + /* * mkfs'ed attr2 will turn on attr2 mount unless explicitly * told by noattr2 to turn it off -- cgit v1.2.2 From 0a8aa1939777dd114479677f0044652c1fd72398 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 5 Jun 2013 12:09:10 +1000 Subject: xfs: increase number of ACL entries for V5 superblocks The limit of 25 ACL entries is arbitrary, but baked into the on-disk format. For version 5 superblocks, increase it to the maximum nuber of ACLs that can fit into a single xattr. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit 5c87d4bc1a86bd6e6754ac3d6e111d776ddcfe57) --- fs/xfs/xfs_acl.c | 31 ++++++++++++++++++------------- fs/xfs/xfs_acl.h | 31 ++++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 1d32f1d52763..306d883d89bc 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -21,6 +21,8 @@ #include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_vnodeops.h" +#include "xfs_sb.h" +#include "xfs_mount.h" #include "xfs_trace.h" #include #include @@ -34,7 +36,9 @@ */ STATIC struct posix_acl * -xfs_acl_from_disk(struct xfs_acl *aclp) +xfs_acl_from_disk( + struct xfs_acl *aclp, + int max_entries) { struct posix_acl_entry *acl_e; struct posix_acl *acl; @@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp) unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); - if (count > XFS_ACL_MAX_ENTRIES) + if (count > max_entries) return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); @@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type) struct xfs_inode *ip = XFS_I(inode); struct posix_acl *acl; struct xfs_acl *xfs_acl; - int len = sizeof(struct xfs_acl); unsigned char *ea_name; int error; + int len; acl = get_cached_acl(inode, type); if (acl != ACL_NOT_CACHED) @@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type) * If we have a cached ACLs value just return it, not need to * go out to the disk. */ - - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + len = XFS_ACL_MAX_SIZE(ip->i_mount); + xfs_acl = kzalloc(len, GFP_KERNEL); if (!xfs_acl) return ERR_PTR(-ENOMEM); @@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type) goto out; } - acl = xfs_acl_from_disk(xfs_acl); + acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); if (IS_ERR(acl)) goto out; @@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) if (acl) { struct xfs_acl *xfs_acl; - int len; + int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + xfs_acl = kzalloc(len, GFP_KERNEL); if (!xfs_acl) return -ENOMEM; xfs_acl_to_disk(xfs_acl, acl); - len = sizeof(struct xfs_acl) - - (sizeof(struct xfs_acl_entry) * - (XFS_ACL_MAX_ENTRIES - acl->a_count)); + + /* subtract away the unused acl entries */ + len -= sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); @@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode) static int xfs_acl_exists(struct inode *inode, unsigned char *name) { - int len = sizeof(struct xfs_acl); + int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); return (xfs_attr_get(XFS_I(inode), name, NULL, &len, ATTR_ROOT|ATTR_KERNOVAL) == 0); @@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, goto out_release; error = -EINVAL; - if (acl->a_count > XFS_ACL_MAX_ENTRIES) + if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) goto out_release; if (type == ACL_TYPE_ACCESS) { diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 39632d941354..4016a567b83c 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -22,19 +22,36 @@ struct inode; struct posix_acl; struct xfs_inode; -#define XFS_ACL_MAX_ENTRIES 25 #define XFS_ACL_NOT_PRESENT (-1) /* On-disk XFS access control list structure */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; + struct xfs_acl { - __be32 acl_cnt; - struct xfs_acl_entry { - __be32 ae_tag; - __be32 ae_id; - __be16 ae_perm; - } acl_entry[XFS_ACL_MAX_ENTRIES]; + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; }; +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + /* On-disk XFS extended attribute names */ #define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" #define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" -- cgit v1.2.2 From 698b8223631472bf982ed570b0812faa61955683 Mon Sep 17 00:00:00 2001 From: Dave Chiluk Date: Tue, 28 May 2013 16:06:08 -0500 Subject: ncpfs: fix rmdir returns Device or resource busy 1d2ef5901483004d74947bbf78d5146c24038fe7 caused a regression in ncpfs such that directories could no longer be removed. This was because ncp_rmdir checked to see if a dentry could be unhashed before allowing it to be removed. Since 1d2ef5901483004d74947bbf78d5146c24038fe7 introduced a change that incremented dentry->d_count causing it to always be greater than 1 unhash would always fail. Thus causing the error path in ncp_rmdir to always be taken. Removing this error path is safe as unhashing is still accomplished by calls to dput from vfs_rmdir. Signed-off-by: Dave Chiluk Signed-off-by: Petr Vandrovec Signed-off-by: Al Viro --- fs/ncpfs/dir.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 816326093656..6792ce11f2bf 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -1029,15 +1029,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) DPRINTK("ncp_rmdir: removing %s/%s\n", dentry->d_parent->d_name.name, dentry->d_name.name); - /* - * fail with EBUSY if there are still references to this - * directory. - */ - dentry_unhash(dentry); - error = -EBUSY; - if (!d_unhashed(dentry)) - goto out; - len = sizeof(__name); error = ncp_io2vol(server, __name, &len, dentry->d_name.name, dentry->d_name.len, !ncp_preserve_case(dir)); -- cgit v1.2.2 From 7b5ff90ed081787ec0765ceb4fe5ccf5677493a6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Jun 2013 10:29:40 -0400 Subject: Btrfs: don't delete fs_roots until after we cleanup the transaction We get a use after free if we had a transaction to cleanup since there could be delayed inodes which refer to their respective fs_root. Thanks Reported-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e7b3cb5286a5..bdaa092d6296 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2859,8 +2859,8 @@ fail_qgroup: btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); - del_fs_roots(fs_info); btrfs_cleanup_transaction(fs_info->tree_root); + del_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); -- cgit v1.2.2 From 6379ef9fb2482a92b5fe09f927d6ce1f989c0c6d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 6 Jun 2013 09:56:34 +0000 Subject: btrfs: Drop inode if inode root is NULL There is a path where btrfs_drop_inode() is called with its inode's root is NULL: In btrfs_new_inode(), when btrfs_set_inode_index() fails, iput() is called. We should handle this case before taking look at the root->root_item. Signed-off-by: Naohiro Aota Reviewed-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 23c596cd1b98..c931a4dbd031 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8011,6 +8011,9 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + if (root == NULL) + return 1; + /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0 && root != root->fs_info->tree_root) -- cgit v1.2.2 From a9995eece39a0630ebbfc1ab38570bce6c8a8f5b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 31 May 2013 13:04:36 -0400 Subject: Btrfs: init relocate extent_io_tree with a mapping Dave reported a NULL pointer deref. This is caused because he thought he'd be smart and add sanity checks to the extent_io bit operations, but he didn't expect a tree to have a NULL mapping. To fix this we just need to init the relocation's processed_blocks with the btree_inode->i_mapping. Thanks, Reported-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 395b82031a42..4febca4fc2de 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4082,7 +4082,7 @@ out: return inode; } -static struct reloc_control *alloc_reloc_control(void) +static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; @@ -4093,7 +4093,8 @@ static struct reloc_control *alloc_reloc_control(void) INIT_LIST_HEAD(&rc->reloc_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL); + extent_io_tree_init(&rc->processed_blocks, + fs_info->btree_inode->i_mapping); return rc; } @@ -4110,7 +4111,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) int rw = 0; int err = 0; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(fs_info); if (!rc) return -ENOMEM; @@ -4311,7 +4312,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(root->fs_info); if (!rc) { err = -ENOMEM; goto out; -- cgit v1.2.2 From 2932505abe7c56477315a3d93ffb3c27c5182e9d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sun, 26 May 2013 13:50:27 +0000 Subject: Btrfs: fix use-after-free bug during umount Commit be283b2e674a09457d4563729015adb637ce7cc1 ( Btrfs: use helper to cleanup tree roots) introduced the following bug, BUG: unable to handle kernel NULL pointer dereference at 0000000000000034 IP: [] extent_buffer_get+0x4/0xa [btrfs] [...] Pid: 2463, comm: btrfs-cache-1 Tainted: G O 3.9.0+ #4 innotek GmbH VirtualBox/VirtualBox RIP: 0010:[] [] extent_buffer_get+0x4/0xa [btrfs] Process btrfs-cache-1 (pid: 2463, threadinfo ffff880112d60000, task ffff880117679730) [...] Call Trace: [] btrfs_search_slot+0x104/0x64d [btrfs] [] btrfs_next_old_leaf+0xa7/0x334 [btrfs] [] btrfs_next_leaf+0x10/0x12 [btrfs] [] caching_thread+0x1a3/0x2e0 [btrfs] [] worker_loop+0x14b/0x48e [btrfs] [] ? btrfs_queue_worker+0x25c/0x25c [btrfs] [] kthread+0x8d/0x95 [] ? kthread_freezable_should_stop+0x43/0x43 [] ret_from_fork+0x7c/0xb0 [] ? kthread_freezable_should_stop+0x43/0x43 RIP [] extent_buffer_get+0x4/0xa [btrfs] We've free'ed commit_root before actually getting to free block groups where caching thread needs valid extent_root->commit_root. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bdaa092d6296..7c66c2314c14 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3512,10 +3512,10 @@ int close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - free_root_pointers(fs_info, 1); - btrfs_free_block_groups(fs_info); + free_root_pointers(fs_info, 1); + del_fs_roots(fs_info); iput(fs_info->btree_inode); -- cgit v1.2.2 From 13e6c37b989859e70b0d73d3f2cb0aa022159b17 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 30 May 2013 16:55:44 -0400 Subject: Btrfs: stop all workers before cleaning up roots Dave reported a panic because the extent_root->commit_root was NULL in the caching kthread. That is because we just unset it in free_root_pointers, which is not the correct thing to do, we have to either wait for the caching kthread to complete or hold the extent_commit_sem lock so we know the thread has exited. This patch makes the kthreads all stop first and then we do our cleanup. This should fix the race. Thanks, Reported-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7c66c2314c14..b8b60b660c8f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3514,13 +3514,13 @@ int close_ctree(struct btrfs_root *root) btrfs_free_block_groups(fs_info); - free_root_pointers(fs_info, 1); + btrfs_stop_all_workers(fs_info); del_fs_roots(fs_info); - iput(fs_info->btree_inode); + free_root_pointers(fs_info, 1); - btrfs_stop_all_workers(fs_info); + iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) -- cgit v1.2.2 From bbd465df73f0d8ba41b8a0732766a243d0f5b356 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 9 Jun 2013 01:25:57 +0200 Subject: hpfs: fix warnings when the filesystem fills up This patch fixes warnings due to missing lock on write error path. WARNING: at fs/hpfs/hpfs_fn.h:353 hpfs_truncate+0x75/0x80 [hpfs]() Hardware name: empty Pid: 26563, comm: dd Tainted: P O 3.9.4 #12 Call Trace: hpfs_truncate+0x75/0x80 [hpfs] hpfs_write_begin+0x84/0x90 [hpfs] _hpfs_bmap+0x10/0x10 [hpfs] generic_file_buffered_write+0x121/0x2c0 __generic_file_aio_write+0x1c7/0x3f0 generic_file_aio_write+0x7c/0x100 do_sync_write+0x98/0xd0 hpfs_file_write+0xd/0x50 [hpfs] vfs_write+0xa2/0x160 sys_write+0x51/0xa0 page_fault+0x22/0x30 system_call_fastpath+0x1a/0x1f Signed-off-by: Mikulas Patocka Cc: stable@kernel.org # 2.6.39+ Signed-off-by: Linus Torvalds --- fs/hpfs/file.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 3027f4dbbab5..e4ba5fe4c3b5 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -109,10 +109,14 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + hpfs_lock(inode->i_sb); + if (to > inode->i_size) { truncate_pagecache(inode, to, inode->i_size); hpfs_truncate(inode); } + + hpfs_unlock(inode->i_sb); } static int hpfs_write_begin(struct file *file, struct address_space *mapping, -- cgit v1.2.2 From 637241a900cbd982f744d44646b48a273d609b34 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jun 2013 14:04:39 -0700 Subject: kmsg: honor dmesg_restrict sysctl on /dev/kmsg The dmesg_restrict sysctl currently covers the syslog method for access dmesg, however /dev/kmsg isn't covered by the same protections. Most people haven't noticed because util-linux dmesg(1) defaults to using the syslog method for access in older versions. With util-linux dmesg(1) defaults to reading directly from /dev/kmsg. To fix /dev/kmsg, let's compare the existing interfaces and what they allow: - /proc/kmsg allows: - open (SYSLOG_ACTION_OPEN) if CAP_SYSLOG since it uses a destructive single-reader interface (SYSLOG_ACTION_READ). - everything, after an open. - syslog syscall allows: - anything, if CAP_SYSLOG. - SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER, if dmesg_restrict==0. - nothing else (EPERM). The use-cases were: - dmesg(1) needs to do non-destructive SYSLOG_ACTION_READ_ALLs. - sysklog(1) needs to open /proc/kmsg, drop privs, and still issue the destructive SYSLOG_ACTION_READs. AIUI, dmesg(1) is moving to /dev/kmsg, and systemd-journald doesn't clear the ring buffer. Based on the comments in devkmsg_llseek, it sounds like actions besides reading aren't going to be supported by /dev/kmsg (i.e. SYSLOG_ACTION_CLEAR), so we have a strict subset of the non-destructive syslog syscall actions. To this end, move the check as Josh had done, but also rename the constants to reflect their new uses (SYSLOG_FROM_CALL becomes SYSLOG_FROM_READER, and SYSLOG_FROM_FILE becomes SYSLOG_FROM_PROC). SYSLOG_FROM_READER allows non-destructive actions, and SYSLOG_FROM_PROC allows destructive actions after a capabilities-constrained SYSLOG_ACTION_OPEN check. - /dev/kmsg allows: - open if CAP_SYSLOG or dmesg_restrict==0 - reading/polling, after open Addresses https://bugzilla.redhat.com/show_bug.cgi?id=903192 [akpm@linux-foundation.org: use pr_warn_once()] Signed-off-by: Kees Cook Reported-by: Christian Kujau Tested-by: Josh Boyer Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kmsg.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bd4b5a740ff1..bdfabdaefdce 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return -EAGAIN; - return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return POLLIN | POLLRDNORM; return 0; } -- cgit v1.2.2 From 7869e590679ed71cd1a1e676e8c1c179762c3efe Mon Sep 17 00:00:00 2001 From: "Xiaowei.Hu" Date: Wed, 12 Jun 2013 14:04:41 -0700 Subject: ocfs2: ocfs2_prep_new_orphaned_file() should return ret If an error occurs, for example an EIO in __ocfs2_prepare_orphan_dir, ocfs2_prep_new_orphaned_file will release the inode_ac, then when the caller of ocfs2_prep_new_orphaned_file gets a 0 return, it will refer to a NULL ocfs2_alloc_context struct in the following functions. A kernel panic happens. Signed-off-by: "Xiaowei.Hu" Reviewed-by: shencanquan Acked-by: Sunil Mushran Cc: Joe Jin Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 04ee1b57c243..b563351753f1 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2216,7 +2216,7 @@ out: brelse(orphan_dir_bh); - return 0; + return ret; } int ocfs2_create_inode_in_orphan(struct inode *dir, -- cgit v1.2.2 From e099127169429c19544a8f55dd26937fddd5b1f4 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 12 Jun 2013 14:04:51 -0700 Subject: fs/ocfs2/namei.c: remove unecessary ERROR when removing non-empty directory While removing a non-empty directory, the kernel dumps a message: (rmdir,21743,1):ocfs2_unlink:953 ERROR: status = -39 Suppress the error message from being printed in the dmesg so users don't panic. Signed-off-by: Goldwyn Rodrigues Cc: Mark Fasheh Cc: Joel Becker Acked-by: Sunil Mushran Reviewed-by: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b563351753f1..b4a5cdf9dbc5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -947,7 +947,7 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status) + if (status && (status != -ENOTEMPTY)) mlog_errno(status); return status; -- cgit v1.2.2 From 4fcc712f5c48b1e32cdbf9b9cfba42a27b2e3160 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jun 2013 14:04:59 -0700 Subject: aio: fix io_destroy() regression by using call_rcu() There was a regression introduced by 36f5588905c1 ("aio: refcounting cleanup"), reported by Jens Axboe - the refcounting cleanup switched to using RCU in the shutdown path, but the synchronize_rcu() was done in the context of the io_destroy() syscall greatly increasing the time it could block. This patch switches it to call_rcu() and makes shutdown asynchronous (more asynchronous than it was originally; before the refcount changes io_destroy() would still wait on pending kiocbs). Note that there's a global quota on the max outstanding kiocbs, and that quota must be manipulated synchronously; otherwise io_setup() could return -EAGAIN when there isn't quota available, and userspace won't have any way of waiting until shutdown of the old kioctxs has finished (besides busy looping). So we release our quota before kioctx shutdown has finished, which should be fine since the quota never corresponded to anything real anyways. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Reported-by: Jens Axboe Tested-by: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Signed-off-by: Benjamin LaHaise Tested-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 7fe5bdee1630..2bbcacf74d0c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -141,9 +141,6 @@ static void aio_free_ring(struct kioctx *ctx) for (i = 0; i < ctx->nr_pages; i++) put_page(ctx->ring_pages[i]); - if (ctx->mmap_size) - vm_munmap(ctx->mmap_base, ctx->mmap_size); - if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) kfree(ctx->ring_pages); } @@ -322,11 +319,6 @@ static void free_ioctx(struct kioctx *ctx) aio_free_ring(ctx); - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - ctx->max_reqs > aio_nr); - aio_nr -= ctx->max_reqs; - spin_unlock(&aio_nr_lock); - pr_debug("freeing %p\n", ctx); /* @@ -435,17 +427,24 @@ static void kill_ioctx(struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { hlist_del_rcu(&ctx->list); - /* Between hlist_del_rcu() and dropping the initial ref */ - synchronize_rcu(); /* - * We can't punt to workqueue here because put_ioctx() -> - * free_ioctx() will unmap the ringbuffer, and that has to be - * done in the original process's context. kill_ioctx_rcu/work() - * exist for exit_aio(), as in that path free_ioctx() won't do - * the unmap. + * It'd be more correct to do this in free_ioctx(), after all + * the outstanding kiocbs have finished - but by then io_destroy + * has already returned, so io_setup() could potentially return + * -EAGAIN with no ioctxs actually in use (as far as userspace + * could tell). */ - kill_ioctx_work(&ctx->rcu_work); + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - ctx->max_reqs > aio_nr); + aio_nr -= ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); + + /* Between hlist_del_rcu() and dropping the initial ref */ + call_rcu(&ctx->rcu_head, kill_ioctx_rcu); } } @@ -495,10 +494,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); - call_rcu(&ctx->rcu_head, kill_ioctx_rcu); - } + kill_ioctx(ctx); } } -- cgit v1.2.2 From 27749f2ff0717e115680922000839ad6a576eddf Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 12 Jun 2013 14:05:03 -0700 Subject: ocfs2: add missing lockres put in dlm_mig_lockres_handler dlm_mig_lockres_handler() is missing a dlm_lockres_put() on an error path. Signed-off-by: joyce Reviewed-by: shencanquan Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index b3fdd1a323d6..e68588e6b1e8 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1408,6 +1408,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; -- cgit v1.2.2 From 47ad2fcba9ddd0630acccb13c71f19a818947751 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:19 +1000 Subject: xfs: don't emit v5 superblock warnings on write We write the superblock every 30s or so which results in the verifier being called. Right now that results in this output every 30s: XFS (vda): Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled! Use of these features in this kernel is at your own risk! And spamming the logs. We don't need to check for whether we support v5 superblocks or whether there are feature bits we don't support set as these are only relevant when we first mount the filesytem. i.e. on superblock read. Hence for the write verification we can just skip all the checks (and hence verbose output) altogether. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 34510185abeaa5be9b178a41c0a03d30aec3db7e) --- fs/xfs/xfs_mount.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f6bfbd734669..e8e310c05097 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -314,7 +314,8 @@ STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, xfs_sb_t *sbp, - bool check_inprogress) + bool check_inprogress, + bool check_version) { /* @@ -337,9 +338,10 @@ xfs_mount_validate_sb( /* * Version 5 superblock feature mask validation. Reject combinations the - * kernel cannot support up front before checking anything else. + * kernel cannot support up front before checking anything else. For + * write validation, we don't need to check feature masks. */ - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { xfs_alert(mp, "Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" "Use of these features in this kernel is at your own risk!"); @@ -675,7 +677,8 @@ xfs_sb_to_disk( static int xfs_sb_verify( - struct xfs_buf *bp) + struct xfs_buf *bp, + bool check_version) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_sb sb; @@ -686,7 +689,8 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + check_version); } /* @@ -719,7 +723,7 @@ xfs_sb_read_verify( goto out_error; } } - error = xfs_sb_verify(bp); + error = xfs_sb_verify(bp, true); out_error: if (error) { @@ -758,7 +762,7 @@ xfs_sb_write_verify( struct xfs_buf_log_item *bip = bp->b_fspriv; int error; - error = xfs_sb_verify(bp); + error = xfs_sb_verify(bp, false); if (error) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, error); -- cgit v1.2.2 From 5170711df79b284cf95b3924322e8ac4c0fd6c76 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 12 Jun 2013 12:19:07 +1000 Subject: xfs: fix implicit padding in directory and attr CRC formats Michael L. Semon has been testing CRC patches on a 32 bit system and been seeing assert failures in the directory code from xfs/080. Thanks to Michael's heroic efforts with printk debugging, we found that the problem was that the last free space being left in the directory structure was too small to fit a unused tag structure and it was being corrupted and attempting to log a region out of bounds. Hence the assert failure looked something like: ..... #5 calling xfs_dir2_data_log_unused() 36 32 #1 4092 4095 4096 #2 8182 8183 4096 XFS: Assertion failed: first <= last && last < BBTOB(bp->b_length), file: fs/xfs/xfs_trans_buf.c, line: 568 Where #1 showed the first region of the dup being logged (i.e. the last 4 bytes of a directory buffer) and #2 shows the corrupt values being calculated from the length of the dup entry which overflowed the size of the buffer. It turns out that the problem was not in the logging code, nor in the freespace handling code. It is an initial condition bug that only shows up on 32 bit systems. When a new buffer is initialised, where's the freespace that is set up: [ 172.316249] calling xfs_dir2_leaf_addname() from xfs_dir_createname() [ 172.316346] #9 calling xfs_dir2_data_log_unused() [ 172.316351] #1 calling xfs_trans_log_buf() 60 63 4096 [ 172.316353] #2 calling xfs_trans_log_buf() 4094 4095 4096 Note the offset of the first region being logged? It's 60 bytes into the buffer. Once I saw that, I pretty much knew that the bug was going to be caused by this. Essentially, all direct entries are rounded to 8 bytes in length, and all entries start with an 8 byte alignment. This means that we can decode inplace as variables are naturally aligned. With the directory data supposedly starting on a 8 byte boundary, and all entries padded to 8 bytes, the minimum freespace in a directory block is supposed to be 8 bytes, which is large enough to fit a unused data entry structure (6 bytes in size). The fact we only have 4 bytes of free space indicates a directory data block alignment problem. And what do you know - there's an implicit hole in the directory data block header for the CRC format, which means the header is 60 byte on 32 bit intel systems and 64 bytes on 64 bit systems. Needs padding. And while looking at the structures, I found the same problem in the attr leaf header. Fix them both. Note that this only affects 32 bit systems with CRCs enabled. Everything else is just fine. Note that CRC enabled filesystems created before this fix on such systems will not be readable with this fix applied. Reported-by: Michael L. Semon Debugged-by: Michael L. Semon Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 8a1fd2950e1fe267e11fc8c85dcaa6b023b51b60) --- fs/xfs/xfs_attr_leaf.h | 1 + fs/xfs/xfs_dir2_format.h | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index f9d7846097e2..444a7704596c 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -128,6 +128,7 @@ struct xfs_attr3_leaf_hdr { __u8 holes; __u8 pad1; struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; + __be32 pad2; /* 64 bit alignment */ }; #define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index 995f1f505a52..7826782b8d78 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -266,6 +266,7 @@ struct xfs_dir3_blk_hdr { struct xfs_dir3_data_hdr { struct xfs_dir3_blk_hdr hdr; xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; + __be32 pad; /* 64 bit alignment */ }; #define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) @@ -477,7 +478,7 @@ struct xfs_dir3_leaf_hdr { struct xfs_da3_blkinfo info; /* header for da routines */ __be16 count; /* count of entries */ __be16 stale; /* count of stale entries */ - __be32 pad; + __be32 pad; /* 64 bit alignment */ }; struct xfs_dir3_icleaf_hdr { @@ -715,7 +716,7 @@ struct xfs_dir3_free_hdr { __be32 firstdb; /* db of first entry */ __be32 nvalid; /* count of valid entries */ __be32 nused; /* count of used entries */ - __be32 pad; /* 64 bit alignment. */ + __be32 pad; /* 64 bit alignment */ }; struct xfs_dir3_free { -- cgit v1.2.2 From 088c9f67c3f53339d2bc20b42a9cb904901fdc5d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 12 Jun 2013 12:19:08 +1000 Subject: xfs: ensure btree root split sets blkno correctly For CRC enabled filesystems, the BMBT is rooted in an inode, so it passes through a different code path on root splits than the freespace and inode btrees. This is much less traversed by xfstests than the other trees. When testing on a 1k block size filesystem, I've been seeing ASSERT failures in generic/234 like: XFS: Assertion failed: cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_private.b.allocated == 0, file: fs/xfs/xfs_btree.c, line: 317 which are generally preceded by a lblock check failure. I noticed this in the bmbt stats: $ pminfo -f xfs.btree.block_map xfs.btree.block_map.lookup value 39135 xfs.btree.block_map.compare value 268432 xfs.btree.block_map.insrec value 15786 xfs.btree.block_map.delrec value 13884 xfs.btree.block_map.newroot value 2 xfs.btree.block_map.killroot value 0 ..... Very little coverage of root splits and merges. Indeed, on a 4k filesystem, block_map.newroot and block_map.killroot are both zero. i.e. the code is not exercised at all, and it's the only generic btree infrastructure operation that is not exercised by a default run of xfstests. Turns out that on a 1k filesystem, generic/234 accounts for one of those two root splits, and that is somewhat of a smoking gun. In fact, it's the same problem we saw in the directory/attr code where headers are memcpy()d from one block to another without updating the self describing metadata. Simple fix - when copying the header out of the root block, make sure the block number is updated correctly. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit ade1335afef556df6538eb02e8c0dc91fbd9cc37) --- fs/xfs/xfs_btree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 8804b8a3c310..0903960410a2 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -2544,7 +2544,17 @@ xfs_btree_new_iroot( if (error) goto error0; + /* + * we can't just memcpy() the root in for CRC enabled btree blocks. + * In that case have to also ensure the blkno remains correct + */ memcpy(cblock, block, xfs_btree_block_len(cur)); + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + else + cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + } be16_add_cpu(&block->bb_level, 1); xfs_btree_set_numrecs(block, 1); -- cgit v1.2.2 From d302cf1d316dca5f567e89872cf5d475c9a55f74 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 12 Jun 2013 12:19:06 +1000 Subject: xfs: don't shutdown log recovery on validation errors Unfortunately, we cannot guarantee that items logged multiple times and replayed by log recovery do not take objects back in time. When they are taken back in time, the go into an intermediate state which is corrupt, and hence verification that occurs on this intermediate state causes log recovery to abort with a corruption shutdown. Instead of causing a shutdown and unmountable filesystem, don't verify post-recovery items before they are written to disk. This is less than optimal, but there is no way to detect this issue for non-CRC filesystems If log recovery successfully completes, this will be undone and the object will be consistent by subsequent transactions that are replayed, so in most cases we don't need to take drastic action. For CRC enabled filesystems, leave the verifiers in place - we need to call them to recalculate the CRCs on the objects anyway. This recovery problem can be solved for such filesystems - we have a LSN stamped in all metadata at writeback time that we can to determine whether the item should be replayed or not. This is a separate piece of work, so is not addressed by this patch. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 9222a9cf86c0d64ffbedf567412b55da18763aa3) --- fs/xfs/xfs_log_recover.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 45a85ff84da1..7cf5e4eafe28 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1845,7 +1845,13 @@ xlog_recover_do_inode_buffer( xfs_agino_t *buffer_nextp; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - bp->b_ops = &xfs_inode_buf_ops; + + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { @@ -2205,7 +2211,16 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); - xlog_recovery_validate_buf_type(mp, bp, buf_f); + /* + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + xlog_recovery_validate_buf_type(mp, bp, buf_f); } /* -- cgit v1.2.2 From e7b2c4069252732d52f1de6d1f7c82d99a156659 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 14 Jun 2013 21:09:47 +0200 Subject: fput: task_work_add() can fail if the caller has passed exit_task_work() fput() assumes that it can't be called after exit_task_work() but this is not true, for example free_ipc_ns()->shm_destroy() can do this. In this case fput() silently leaks the file. Change it to fallback to delayed_fput_work if task_work_add() fails. The patch looks complicated but it is not, it changes the code from if (PF_KTHREAD) { schedule_work(...); return; } task_work_add(...) to if (!PF_KTHREAD) { if (!task_work_add(...)) return; /* fallback */ } schedule_work(...); As for shm_destroy() in particular, we could make another fix but I think this change makes sense anyway. There could be another similar user, it is not safe to assume that task_work_add() can't fail. Reported-by: Andrey Vagin Signed-off-by: Oleg Nesterov Signed-off-by: Al Viro --- fs/file_table.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index cd4d87a82951..485dc0eddd67 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -306,17 +306,18 @@ void fput(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; + unsigned long flags; + file_sb_list_del(file); - if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) { - unsigned long flags; - spin_lock_irqsave(&delayed_fput_lock, flags); - list_add(&file->f_u.fu_list, &delayed_fput_list); - schedule_work(&delayed_fput_work); - spin_unlock_irqrestore(&delayed_fput_lock, flags); - return; + if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + init_task_work(&file->f_u.fu_rcuhead, ____fput); + if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) + return; } - init_task_work(&file->f_u.fu_rcuhead, ____fput); - task_work_add(task, &file->f_u.fu_rcuhead, true); + spin_lock_irqsave(&delayed_fput_lock, flags); + list_add(&file->f_u.fu_list, &delayed_fput_list); + schedule_work(&delayed_fput_work); + spin_unlock_irqrestore(&delayed_fput_lock, flags); } } -- cgit v1.2.2 From 05252901199d886a68830befb135d1723730ca86 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 6 Jun 2013 19:33:47 -0400 Subject: use can_lookup() instead of direct checks of ->i_op->lookup a couple of places got missed back when Linus has introduced that one... Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 85e40d1c0a8f..9ed9361223c0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1976,7 +1976,7 @@ static int path_lookupat(int dfd, const char *name, err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!nd->inode->i_op->lookup) { + if (!can_lookup(nd->inode)) { path_put(&nd->path); err = -ENOTDIR; } @@ -2850,7 +2850,7 @@ finish_lookup: if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) goto out; error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) + if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) goto out; audit_inode(name, nd->path.dentry, 0); finish_open: -- cgit v1.2.2 From 14c14414d157ea851119c96c61a17306a2b4a035 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 13 Jun 2013 12:16:39 +0400 Subject: fuse: hold i_mutex in fuse_file_fallocate() Changing size of a file on server and local update (fuse_write_update_size) should be always protected by inode->i_mutex. Otherwise a race like this is possible: 1. Process 'A' calls fallocate(2) to extend file (~FALLOC_FL_KEEP_SIZE). fuse_file_fallocate() sends FUSE_FALLOCATE request to the server. 2. Process 'B' calls ftruncate(2) shrinking the file. fuse_do_setattr() sends shrinking FUSE_SETATTR request to the server and updates local i_size by i_size_write(inode, outarg.attr.size). 3. Process 'A' resumes execution of fuse_file_fallocate() and calls fuse_write_update_size(inode, offset + length). But 'offset + length' was obsoleted by ftruncate from previous step. Changed in v2 (thanks Brian and Anand for suggestions): - made relation between mutex_lock() and fuse_set_nowrite(inode) more explicit and clear. - updated patch description to use ftruncate(2) in example Signed-off-by: Maxim V. Patlasov Reviewed-by: Brian Foster Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e570081f9f76..35f281033142 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2470,13 +2470,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, .mode = mode }; int err; + bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || + (mode & FALLOC_FL_PUNCH_HOLE); if (fc->no_fallocate) return -EOPNOTSUPP; - if (mode & FALLOC_FL_PUNCH_HOLE) { + if (lock_inode) { mutex_lock(&inode->i_mutex); - fuse_set_nowrite(inode); + if (mode & FALLOC_FL_PUNCH_HOLE) + fuse_set_nowrite(inode); } req = fuse_get_req_nopages(fc); @@ -2511,8 +2514,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, fuse_invalidate_attr(inode); out: - if (mode & FALLOC_FL_PUNCH_HOLE) { - fuse_release_nowrite(inode); + if (lock_inode) { + if (mode & FALLOC_FL_PUNCH_HOLE) + fuse_release_nowrite(inode); mutex_unlock(&inode->i_mutex); } -- cgit v1.2.2 From 7995bd287134f6c8f80d94bebe7396f05a9bc42b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Jun 2013 18:58:36 +0400 Subject: splice: don't pass the address of ->f_pos to methods Signed-off-by: Al Viro --- fs/internal.h | 6 ++++++ fs/read_write.c | 24 ++++++++++++++++-------- fs/splice.c | 31 ++++++++++++++++++------------- 3 files changed, 40 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index eaa75f75b625..68121584ae37 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -131,6 +131,12 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); */ extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); +/* + * splice.c + */ +extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); + /* * pipe.c */ diff --git a/fs/read_write.c b/fs/read_write.c index 03430008704e..2cefa417be34 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1064,6 +1064,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, struct fd in, out; struct inode *in_inode, *out_inode; loff_t pos; + loff_t out_pos; ssize_t retval; int fl; @@ -1077,12 +1078,14 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; - if (!ppos) - ppos = &in.file->f_pos; - else + if (!ppos) { + pos = in.file->f_pos; + } else { + pos = *ppos; if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; - retval = rw_verify_area(READ, in.file, ppos, count); + } + retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; count = retval; @@ -1099,7 +1102,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = -EINVAL; in_inode = file_inode(in.file); out_inode = file_inode(out.file); - retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); + out_pos = out.file->f_pos; + retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; count = retval; @@ -1107,7 +1111,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); - pos = *ppos; if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) @@ -1126,18 +1129,23 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif - retval = do_splice_direct(in.file, ppos, out.file, count, fl); + retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(in.file); fsnotify_modify(out.file); + out.file->f_pos = out_pos; + if (ppos) + *ppos = pos; + else + in.file->f_pos = pos; } inc_syscr(current); inc_syscw(current); - if (*ppos > max) + if (pos > max) retval = -EOVERFLOW; fput_out: diff --git a/fs/splice.c b/fs/splice.c index e6b25598c8c4..9eca476227d5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1274,7 +1274,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, { struct file *file = sd->u.file; - return do_splice_from(pipe, file, &file->f_pos, sd->total_len, + return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } @@ -1294,7 +1294,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, * */ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - size_t len, unsigned int flags) + loff_t *opos, size_t len, unsigned int flags) { struct splice_desc sd = { .len = len, @@ -1302,6 +1302,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .flags = flags, .pos = *ppos, .u.file = out, + .opos = opos, }; long ret; @@ -1325,7 +1326,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; - loff_t offset, *off; + loff_t offset; long ret; ipipe = get_pipe_info(in); @@ -1356,13 +1357,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &out->f_pos; + } else { + offset = out->f_pos; + } - ret = do_splice_from(ipipe, out, off, len, flags); + ret = do_splice_from(ipipe, out, &offset, len, flags); - if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + if (!off_out) + out->f_pos = offset; + else if (copy_to_user(off_out, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; @@ -1376,13 +1379,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &in->f_pos; + } else { + offset = in->f_pos; + } - ret = do_splice_to(in, off, opipe, len, flags); + ret = do_splice_to(in, &offset, opipe, len, flags); - if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + if (!off_in) + in->f_pos = offset; + else if (copy_to_user(off_in, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; -- cgit v1.2.2 From acdb37c361dc87e165889a504e291c1e82ae133c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 22 Jun 2013 19:44:08 -0700 Subject: fs: fix new splice.c kernel-doc warning Fix new kernel-doc warning in fs/splice.c: Warning(fs/splice.c:1298): No description found for parameter 'opos' Signed-off-by: Randy Dunlap Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/splice.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 9eca476227d5..d37431dd60a1 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1283,6 +1283,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, * @in: file to splice from * @ppos: input file offset * @out: file to splice to + * @opos: output file offset * @len: number of bytes to splice * @flags: splice modifier flags * -- cgit v1.2.2 From 2976b10f05bd7f6dab9f9e7524451ddfed656a89 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 20 Jun 2013 11:36:28 +0200 Subject: perf: Disable monitoring on setuid processes for regular users There was a a bug in setup_new_exec(), whereby the test to disabled perf monitoring was not correct because the new credentials for the process were not yet committed and therefore the get_dumpable() test was never firing. The patch fixes the problem by moving the perf_event test until after the credentials are committed. Signed-off-by: Stephane Eranian Tested-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Signed-off-by: Ingo Molnar --- fs/exec.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 643019585574..ffd7a813ad3d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1135,13 +1135,6 @@ void setup_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } - /* - * Flush performance counters when crossing a - * security domain: - */ - if (!get_dumpable(current->mm)) - perf_event_exit_task(current); - /* An exec changes our domain. We are no longer part of the thread group */ @@ -1205,6 +1198,15 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; + + /* + * Disable monitoring for regular users + * when executing setuid binaries. Must + * wait until new credentials are committed + * by commit_creds() above + */ + if (get_dumpable(current->mm) != SUID_DUMP_USER) + perf_event_exit_task(current); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's -- cgit v1.2.2 From 33f1a63ae84dfd9ad298cf275b8f1887043ced36 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 28 Jun 2013 14:15:14 +0300 Subject: UBIFS: prepare to fix a horrid bug Al Viro pointed me to the fact that '->readdir()' and '->llseek()' have no mutual exclusion, which means the 'ubifs_dir_llseek()' can be run while we are in the middle of 'ubifs_readdir()'. First of all, this means that 'file->private_data' can be freed while 'ubifs_readdir()' uses it. But this particular patch does not fix the problem. This patch is only a preparation, and the fix will follow next. In this patch we make 'ubifs_readdir()' stop using 'file->f_pos' directly, because 'file->f_pos' can be changed by '->llseek()' at any point. This may lead 'ubifs_readdir()' to returning inconsistent data: directory entry names may correspond to incorrect file positions. So here we introduce a local variable 'pos', read 'file->f_pose' once at very the beginning, and then stick to 'pos'. The result of this is that when 'ubifs_dir_llseek()' changes 'file->f_pos' while we are in the middle of 'ubifs_readdir()', the latter "wins". Cc: stable@vger.kernel.org Reported-by: Al Viro Tested-by: Artem Bityutskiy Signed-off-by: Artem Bityutskiy Signed-off-by: Al Viro --- fs/ubifs/dir.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index de08c92f2e23..8e587afd7732 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -349,15 +349,16 @@ static unsigned int vfs_dent_type(uint8_t type) static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) { int err, over = 0; + loff_t pos = file->f_pos; struct qstr nm; union ubifs_key key; struct ubifs_dent_node *dent; struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; - dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); + dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, pos); - if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) + if (pos > UBIFS_S_KEY_HASH_MASK || pos == 2) /* * The directory was seek'ed to a senseless position or there * are no more entries. @@ -365,15 +366,15 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) return 0; /* File positions 0 and 1 correspond to "." and ".." */ - if (file->f_pos == 0) { + if (pos == 0) { ubifs_assert(!file->private_data); over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); if (over) return 0; - file->f_pos = 1; + file->f_pos = pos = 1; } - if (file->f_pos == 1) { + if (pos == 1) { ubifs_assert(!file->private_data); over = filldir(dirent, "..", 2, 1, parent_ino(file->f_path.dentry), DT_DIR); @@ -389,7 +390,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) goto out; } - file->f_pos = key_hash_flash(c, &dent->key); + file->f_pos = pos = key_hash_flash(c, &dent->key); file->private_data = dent; } @@ -397,17 +398,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) if (!dent) { /* * The directory was seek'ed to and is now readdir'ed. - * Find the entry corresponding to @file->f_pos or the - * closest one. + * Find the entry corresponding to @pos or the closest one. */ - dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); + dent_key_init_hash(c, &key, dir->i_ino, pos); nm.name = NULL; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); goto out; } - file->f_pos = key_hash_flash(c, &dent->key); + file->f_pos = pos = key_hash_flash(c, &dent->key); file->private_data = dent; } @@ -419,7 +419,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) ubifs_inode(dir)->creat_sqnum); nm.len = le16_to_cpu(dent->nlen); - over = filldir(dirent, dent->name, nm.len, file->f_pos, + over = filldir(dirent, dent->name, nm.len, pos, le64_to_cpu(dent->inum), vfs_dent_type(dent->type)); if (over) @@ -435,7 +435,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) } kfree(file->private_data); - file->f_pos = key_hash_flash(c, &dent->key); + file->f_pos = pos = key_hash_flash(c, &dent->key); file->private_data = dent; cond_resched(); } -- cgit v1.2.2 From 605c912bb843c024b1ed173dc427cd5c08e5d54d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 28 Jun 2013 14:15:15 +0300 Subject: UBIFS: fix a horrid bug Al Viro pointed me to the fact that '->readdir()' and '->llseek()' have no mutual exclusion, which means the 'ubifs_dir_llseek()' can be run while we are in the middle of 'ubifs_readdir()'. This means that 'file->private_data' can be freed while 'ubifs_readdir()' uses it, and this is a very bad bug: not only 'ubifs_readdir()' can return garbage, but this may corrupt memory and lead to all kinds of problems like crashes an security holes. This patch fixes the problem by using the 'file->f_version' field, which '->llseek()' always unconditionally sets to zero. We set it to 1 in 'ubifs_readdir()' and whenever we detect that it became 0, we know there was a seek and it is time to clear the state saved in 'file->private_data'. I tested this patch by writing a user-space program which runds readdir and seek in parallell. I could easily crash the kernel without these patches, but could not crash it with these patches. Cc: stable@vger.kernel.org Reported-by: Al Viro Tested-by: Artem Bityutskiy Signed-off-by: Artem Bityutskiy Signed-off-by: Al Viro --- fs/ubifs/dir.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 8e587afd7732..605af512aec2 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -365,6 +365,24 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) */ return 0; + if (file->f_version == 0) { + /* + * The file was seek'ed, which means that @file->private_data + * is now invalid. This may also be just the first + * 'ubifs_readdir()' invocation, in which case + * @file->private_data is NULL, and the below code is + * basically a no-op. + */ + kfree(file->private_data); + file->private_data = NULL; + } + + /* + * 'generic_file_llseek()' unconditionally sets @file->f_version to + * zero, and we use this for detecting whether the file was seek'ed. + */ + file->f_version = 1; + /* File positions 0 and 1 correspond to "." and ".." */ if (pos == 0) { ubifs_assert(!file->private_data); @@ -438,6 +456,14 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) file->f_pos = pos = key_hash_flash(c, &dent->key); file->private_data = dent; cond_resched(); + + if (file->f_version == 0) + /* + * The file was seek'ed meanwhile, lets return and start + * reading direntries from the new position on the next + * invocation. + */ + return 0; } out: @@ -448,15 +474,13 @@ out: kfree(file->private_data); file->private_data = NULL; + /* 2 is a special value indicating that there are no more direntries */ file->f_pos = 2; return 0; } -/* If a directory is seeked, we have to free saved readdir() state */ static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) { - kfree(file->private_data); - file->private_data = NULL; return generic_file_llseek(file, offset, whence); } -- cgit v1.2.2