author		Linus Torvalds <torvalds@linux-foundation.org>	2013-07-09 15:39:10 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-09 15:39:10 -0400
commit		9a5889ae1ce41f376e6a5b56e17e0c5a755fda80 (patch)
tree		0eaadb5530d5b82460e0bfb0b4403e080d7b1b8f /fs
parent		e3a0dd98e1ddfd135b7ef889fcc0269e8c2ca445 (diff)
parent		8b8cf8917f9b5d74e04f281272d8719ce335a497 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil:
 "There is some follow-on RBD cleanup after the last window's code drop,
  a series from Yan fixing multi-mds behavior in cephfs, and then a
  sprinkling of bug fixes all around.  Some warnings, sleeping while
  atomic, a null dereference, and cleanups"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (36 commits)
  libceph: fix invalid unsigned->signed conversion for timespec encoding
  libceph: call r_unsafe_callback when unsafe reply is received
  ceph: fix race between cap issue and revoke
  ceph: fix cap revoke race
  ceph: fix pending vmtruncate race
  ceph: avoid accessing invalid memory
  libceph: Fix NULL pointer dereference in auth client code
  ceph: Reconstruct the func ceph_reserve_caps.
  ceph: Free mdsc if alloc mdsc->mdsmap failed.
  ceph: remove sb_start/end_write in ceph_aio_write.
  ceph: avoid meaningless calling ceph_caps_revoking if sync_mode == WB_SYNC_ALL.
  ceph: fix sleeping function called from invalid context.
  ceph: move inode to proper flushing list when auth MDS changes
  rbd: fix a couple warnings
  ceph: clear migrate seq when MDS restarts
  ceph: check migrate seq before changing auth cap
  ceph: fix race between page writeback and truncate
  ceph: reset iov_len when discarding cap release messages
  ceph: fix cap release race
  libceph: fix truncate size calculation
  ...
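Most of the fs/ceph/addr.c changes below share one pattern: sample ci->i_truncate_seq and ci->i_truncate_size (and the effective size) once under i_ceph_lock, drop the lock, then pass the sampled values to the OSD write, so a truncate racing with writeback cannot change them mid-request. A minimal userspace sketch of that sampling pattern, using pthread locking and demo_* names as illustrative stand-ins rather than the kernel API:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the inode fields the patch samples;
 * these are not the kernel structures. */
struct demo_inode {
	pthread_mutex_t lock;      /* plays the role of i_ceph_lock */
	uint32_t truncate_seq;     /* plays the role of i_truncate_seq */
	uint64_t truncate_size;    /* plays the role of i_truncate_size */
	uint64_t size;             /* plays the role of the inode size */
};

/* Sample the truncate state once, under the lock, and use that snapshot
 * for the whole write so a racing truncate cannot be observed half-way
 * through the request. */
static void demo_writepage(struct demo_inode *ino, uint64_t off, uint64_t len)
{
	uint32_t truncate_seq;
	uint64_t truncate_size, snap_size;

	pthread_mutex_lock(&ino->lock);
	truncate_seq = ino->truncate_seq;
	truncate_size = ino->truncate_size;
	snap_size = ino->size;
	pthread_mutex_unlock(&ino->lock);

	if (off >= snap_size)           /* page entirely past EOF: skip */
		return;
	if (snap_size < off + len)      /* partial page at EOF: shorten */
		len = snap_size - off;

	/* the real code hands truncate_seq/truncate_size to
	 * ceph_osdc_writepages() / ceph_osdc_new_request() here */
	printf("write %llu~%llu seq=%u tsize=%llu\n",
	       (unsigned long long)off, (unsigned long long)len,
	       (unsigned)truncate_seq, (unsigned long long)truncate_size);
}

int main(void)
{
	struct demo_inode ino = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.truncate_seq = 1,
		.truncate_size = 4096,
		.size = 6000,
	};

	demo_writepage(&ino, 4096, 4096);	/* shortened to 1904 bytes */
	return 0;
}

The same snapshot-then-use shape appears in both writepage_nounlock() and ceph_writepages_start() in the diff below.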
Diffstat (limited to 'fs')
-rw-r--r--	fs/ceph/addr.c		88
-rw-r--r--	fs/ceph/caps.c		102
-rw-r--r--	fs/ceph/file.c		4
-rw-r--r--	fs/ceph/inode.c		14
-rw-r--r--	fs/ceph/locks.c		2
-rw-r--r--	fs/ceph/mds_client.c	6
-rw-r--r--	fs/ceph/mdsmap.c	42
-rw-r--r--	fs/ceph/super.c		2
-rw-r--r--	fs/ceph/super.h		4
-rw-r--r--	fs/ceph/xattr.c		9
10 files changed, 149 insertions, 124 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 38b5c1bc6776..5318a3b704f6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -439,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	struct ceph_inode_info *ci;
 	struct ceph_fs_client *fsc;
 	struct ceph_osd_client *osdc;
-	loff_t page_off = page_offset(page);
-	int len = PAGE_CACHE_SIZE;
-	loff_t i_size;
-	int err = 0;
 	struct ceph_snap_context *snapc, *oldest;
-	u64 snap_size = 0;
+	loff_t page_off = page_offset(page);
 	long writeback_stat;
+	u64 truncate_size, snap_size = 0;
+	u32 truncate_seq;
+	int err = 0, len = PAGE_CACHE_SIZE;
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
@@ -475,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	}
 	ceph_put_snap_context(oldest);
 
+	spin_lock(&ci->i_ceph_lock);
+	truncate_seq = ci->i_truncate_seq;
+	truncate_size = ci->i_truncate_size;
+	if (!snap_size)
+		snap_size = i_size_read(inode);
+	spin_unlock(&ci->i_ceph_lock);
+
 	/* is this a partial page at end of file? */
-	if (snap_size)
-		i_size = snap_size;
-	else
-		i_size = i_size_read(inode);
-	if (i_size < page_off + len)
-		len = i_size - page_off;
+	if (page_off >= snap_size) {
+		dout("%p page eof %llu\n", page, snap_size);
+		goto out;
+	}
+	if (snap_size < page_off + len)
+		len = snap_size - page_off;
 
 	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
 	     inode, page, page->index, page_off, len, snapc);
@@ -495,7 +501,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
 				   &ci->i_layout, snapc,
 				   page_off, len,
-				   ci->i_truncate_seq, ci->i_truncate_size,
+				   truncate_seq, truncate_size,
 				   &inode->i_mtime, &page, 1);
 	if (err < 0) {
 		dout("writepage setting page/mapping error %d %p\n", err, page);
@@ -632,25 +638,6 @@ static void writepages_finish(struct ceph_osd_request *req,
 	ceph_osdc_put_request(req);
 }
 
-static struct ceph_osd_request *
-ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len,
-			    struct ceph_snap_context *snapc, int num_ops)
-{
-	struct ceph_fs_client *fsc;
-	struct ceph_inode_info *ci;
-	struct ceph_vino vino;
-
-	fsc = ceph_inode_to_client(inode);
-	ci = ceph_inode(inode);
-	vino = ceph_vino(inode);
-	/* BUG_ON(vino.snap != CEPH_NOSNAP); */
-
-	return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
-			vino, offset, len, num_ops, CEPH_OSD_OP_WRITE,
-			CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK,
-			snapc, ci->i_truncate_seq, ci->i_truncate_size, true);
-}
-
 /*
  * initiate async writeback
  */
@@ -659,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_fs_client *fsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_vino vino = ceph_vino(inode);
 	pgoff_t index, start, end;
 	int range_whole = 0;
 	int should_loop = 1;
@@ -671,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping,
 	unsigned wsize = 1 << inode->i_blkbits;
 	struct ceph_osd_request *req = NULL;
 	int do_sync;
-	u64 snap_size;
+	u64 truncate_size, snap_size;
+	u32 truncate_seq;
 
 	/*
 	 * Include a 'sync' in the OSD request if this is a data
 	 * integrity write (e.g., O_SYNC write or fsync()), or if our
 	 * cap is being revoked.
 	 */
-	do_sync = wbc->sync_mode == WB_SYNC_ALL;
-	if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+	if ((wbc->sync_mode == WB_SYNC_ALL) ||
+	    ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
 		do_sync = 1;
 	dout("writepages_start %p dosync=%d (mode=%s)\n",
 	     inode, do_sync,
 	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-	fsc = ceph_inode_to_client(inode);
 	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
 		pr_warning("writepage_start %p on forced umount\n", inode);
 		return -EIO; /* we're in a forced umount, don't write! */
@@ -729,6 +717,14 @@ retry:
 		snap_size = i_size_read(inode);
 	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
 	     snapc, snapc->seq, snapc->num_snaps);
+
+	spin_lock(&ci->i_ceph_lock);
+	truncate_seq = ci->i_truncate_seq;
+	truncate_size = ci->i_truncate_size;
+	if (!snap_size)
+		snap_size = i_size_read(inode);
+	spin_unlock(&ci->i_ceph_lock);
+
 	if (last_snapc && snapc != last_snapc) {
 		/* if we switched to a newer snapc, restart our scan at the
 		 * start of the original file range. */
@@ -740,7 +736,6 @@ retry:
 
 	while (!done && index <= end) {
 		int num_ops = do_sync ? 2 : 1;
-		struct ceph_vino vino;
 		unsigned i;
 		int first;
 		pgoff_t next;
@@ -834,17 +829,18 @@ get_more_pages:
 		 * that it will use.
 		 */
 		if (locked_pages == 0) {
-			size_t size;
-
 			BUG_ON(pages);
-
 			/* prepare async write request */
 			offset = (u64)page_offset(page);
 			len = wsize;
-			req = ceph_writepages_osd_request(inode,
-						offset, &len, snapc,
-						num_ops);
-
+			req = ceph_osdc_new_request(&fsc->client->osdc,
+						&ci->i_layout, vino,
+						offset, &len, num_ops,
+						CEPH_OSD_OP_WRITE,
+						CEPH_OSD_FLAG_WRITE |
+						CEPH_OSD_FLAG_ONDISK,
+						snapc, truncate_seq,
+						truncate_size, true);
 			if (IS_ERR(req)) {
 				rc = PTR_ERR(req);
 				unlock_page(page);
@@ -855,8 +851,8 @@ get_more_pages:
 			req->r_inode = inode;
 
 			max_pages = calc_pages_for(0, (u64)len);
-			size = max_pages * sizeof (*pages);
-			pages = kmalloc(size, GFP_NOFS);
+			pages = kmalloc(max_pages * sizeof (*pages),
+					GFP_NOFS);
 			if (!pages) {
 				pool = fsc->wb_pagevec_pool;
 				pages = mempool_alloc(pool, GFP_NOFS);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index da0f9b8a3bcb..25442b40c25a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
 	spin_unlock(&mdsc->caps_list_lock);
 }
 
-int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+void ceph_reserve_caps(struct ceph_mds_client *mdsc,
 		      struct ceph_cap_reservation *ctx, int need)
 {
 	int i;
@@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 	int have;
 	int alloc = 0;
 	LIST_HEAD(newcaps);
-	int ret = 0;
 
 	dout("reserve caps ctx=%p need=%d\n", ctx, need);
 
@@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 
 	for (i = have; i < need; i++) {
 		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
-		if (!cap) {
-			ret = -ENOMEM;
-			goto out_alloc_count;
-		}
+		if (!cap)
+			break;
 		list_add(&cap->caps_item, &newcaps);
 		alloc++;
 	}
-	BUG_ON(have + alloc != need);
+	/* we didn't manage to reserve as much as we needed */
+	if (have + alloc != need)
+		pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+			ctx, need, have + alloc);
 
 	spin_lock(&mdsc->caps_list_lock);
 	mdsc->caps_total_count += alloc;
@@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
198 ctx, mdsc->caps_total_count, mdsc->caps_use_count, 198 ctx, mdsc->caps_total_count, mdsc->caps_use_count,
199 mdsc->caps_reserve_count, mdsc->caps_avail_count); 199 mdsc->caps_reserve_count, mdsc->caps_avail_count);
200 return 0;
201
202out_alloc_count:
203 /* we didn't manage to reserve as much as we needed */
204 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
205 ctx, need, have);
206 return ret;
207} 200}
208 201
209int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 202int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -612,9 +605,11 @@ retry:
 		__cap_delay_requeue(mdsc, ci);
 	}
 
-	if (flags & CEPH_CAP_FLAG_AUTH)
-		ci->i_auth_cap = cap;
-	else if (ci->i_auth_cap == cap) {
+	if (flags & CEPH_CAP_FLAG_AUTH) {
+		if (ci->i_auth_cap == NULL ||
+		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+			ci->i_auth_cap = cap;
+	} else if (ci->i_auth_cap == cap) {
 		ci->i_auth_cap = NULL;
 		spin_lock(&mdsc->cap_dirty_lock);
 		if (!list_empty(&ci->i_dirty_item)) {
@@ -695,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
 		if (implemented)
 			*implemented |= cap->implemented;
 	}
+	/*
+	 * exclude caps issued by non-auth MDS, but are been revoking
+	 * by the auth MDS. The non-auth MDS should be revoking/exporting
+	 * these caps, but the message is delayed.
+	 */
+	if (ci->i_auth_cap) {
+		cap = ci->i_auth_cap;
+		have &= ~cap->implemented | cap->issued;
+	}
 	return have;
 }
 
@@ -802,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 /*
  * Return true if mask caps are currently being revoked by an MDS.
  */
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+			       struct ceph_cap *ocap, int mask)
 {
-	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
 	struct rb_node *p;
-	int ret = 0;
 
-	spin_lock(&ci->i_ceph_lock);
 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 		cap = rb_entry(p, struct ceph_cap, ci_node);
-		if (__cap_is_valid(cap) &&
-		    (cap->implemented & ~cap->issued & mask)) {
-			ret = 1;
-			break;
-		}
+		if (cap != ocap && __cap_is_valid(cap) &&
+		    (cap->implemented & ~cap->issued & mask))
+			return 1;
 	}
+	return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int ret;
+
+	spin_lock(&ci->i_ceph_lock);
+	ret = __ceph_caps_revoking_other(ci, NULL, mask);
 	spin_unlock(&ci->i_ceph_lock);
 	dout("ceph_caps_revoking %p %s = %d\n", inode,
 	     ceph_cap_string(mask), ret);
@@ -1980,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
 	cap = ci->i_auth_cap;
 	dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
 	     ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+
 	__ceph_flush_snaps(ci, &session, 1);
+
 	if (ci->i_flushing_caps) {
+		spin_lock(&mdsc->cap_dirty_lock);
+		list_move_tail(&ci->i_flushing_item,
+			       &cap->session->s_cap_flushing);
+		spin_unlock(&mdsc->cap_dirty_lock);
+
 		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
 				     __ceph_caps_used(ci),
 				     __ceph_caps_wanted(ci),
@@ -2055,7 +2072,11 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 	/* finish pending truncate */
 	while (ci->i_truncate_pending) {
 		spin_unlock(&ci->i_ceph_lock);
-		__ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR));
+		if (!(need & CEPH_CAP_FILE_WR))
+			mutex_lock(&inode->i_mutex);
+		__ceph_do_pending_vmtruncate(inode);
+		if (!(need & CEPH_CAP_FILE_WR))
+			mutex_unlock(&inode->i_mutex);
 		spin_lock(&ci->i_ceph_lock);
 	}
 
@@ -2473,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	} else {
 		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
 		     ceph_cap_string(newcaps));
+		/* non-auth MDS is revoking the newly grant caps ? */
+		if (cap == ci->i_auth_cap &&
+		    __ceph_caps_revoking_other(ci, cap, newcaps))
+			check_caps = 2;
+
 		cap->issued = newcaps;
 		cap->implemented |= newcaps; /* add bits only, to
 					      * avoid stepping on a
@@ -3042,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 		     (cap->issued & unless) == 0)) {
 			if ((cap->issued & drop) &&
 			    (cap->issued & unless) == 0) {
-				dout("encode_inode_release %p cap %p %s -> "
-				     "%s\n", inode, cap,
+				int wanted = __ceph_caps_wanted(ci);
+				if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+					wanted |= cap->mds_wanted;
+				dout("encode_inode_release %p cap %p "
+				     "%s -> %s, wanted %s -> %s\n", inode, cap,
 				     ceph_cap_string(cap->issued),
-				     ceph_cap_string(cap->issued & ~drop));
+				     ceph_cap_string(cap->issued & ~drop),
+				     ceph_cap_string(cap->mds_wanted),
+				     ceph_cap_string(wanted));
+
 				cap->issued &= ~drop;
 				cap->implemented &= ~drop;
-				if (ci->i_ceph_flags & CEPH_I_NODELAY) {
-					int wanted = __ceph_caps_wanted(ci);
-					dout(" wanted %s -> %s (act %s)\n",
-					     ceph_cap_string(cap->mds_wanted),
-					     ceph_cap_string(cap->mds_wanted &
-							     ~wanted),
-					     ceph_cap_string(wanted));
-					cap->mds_wanted &= wanted;
-				}
+				cap->mds_wanted = wanted;
 			} else {
 				dout("encode_inode_release %p cap %p %s"
 				     " (force)\n", inode, cap,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 16c989d3e23c..2ddf061c1c4a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -716,7 +716,6 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
-	sb_start_write(inode->i_sb);
 	mutex_lock(&inode->i_mutex);
 	hold_mutex = true;
 
@@ -809,7 +808,6 @@ retry_snap:
 out:
 	if (hold_mutex)
 		mutex_unlock(&inode->i_mutex);
-	sb_end_write(inode->i_sb);
 	current->backing_dev_info = NULL;
 
 	return written ? written : err;
@@ -824,7 +822,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 	int ret;
 
 	mutex_lock(&inode->i_mutex);
-	__ceph_do_pending_vmtruncate(inode, false);
+	__ceph_do_pending_vmtruncate(inode);
 
 	if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
 		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index bd2289a4f40d..f3a2abf28a77 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1465,7 +1465,9 @@ static void ceph_vmtruncate_work(struct work_struct *work)
 	struct inode *inode = &ci->vfs_inode;
 
 	dout("vmtruncate_work %p\n", inode);
-	__ceph_do_pending_vmtruncate(inode, true);
+	mutex_lock(&inode->i_mutex);
+	__ceph_do_pending_vmtruncate(inode);
+	mutex_unlock(&inode->i_mutex);
 	iput(inode);
 }
 
@@ -1492,7 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
  * Make sure any pending truncation is applied before doing anything
  * that may depend on it.
  */
-void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock)
+void __ceph_do_pending_vmtruncate(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 to;
@@ -1525,11 +1527,7 @@ retry:
 	     ci->i_truncate_pending, to);
 	spin_unlock(&ci->i_ceph_lock);
 
-	if (needlock)
-		mutex_lock(&inode->i_mutex);
 	truncate_inode_pages(inode->i_mapping, to);
-	if (needlock)
-		mutex_unlock(&inode->i_mutex);
 
 	spin_lock(&ci->i_ceph_lock);
 	if (to == ci->i_truncate_size) {
@@ -1588,7 +1586,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
-	__ceph_do_pending_vmtruncate(inode, false);
+	__ceph_do_pending_vmtruncate(inode);
 
 	err = inode_change_ok(inode, attr);
 	if (err != 0)
@@ -1770,7 +1768,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 		     ceph_cap_string(dirtied), mask);
 
 	ceph_mdsc_put_request(req);
-	__ceph_do_pending_vmtruncate(inode, false);
+	__ceph_do_pending_vmtruncate(inode);
 	return err;
 out:
 	spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 690f73f42425..ae6d14e82b0f 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 }
 
 /**
- * Must be called with BKL already held. Fills in the passed
+ * Must be called with lock_flocks() already held. Fills in the passed
  * counter variables, so you can prepare pagelist metadata before calling
  * ceph_encode_locks.
  */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 99890b02a10b..187bf214444d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1391,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
 	num = le32_to_cpu(head->num);
 	dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
 	head->num = cpu_to_le32(0);
+	msg->front.iov_len = sizeof(*head);
 	session->s_num_cap_releases += num;
 
 	/* requeue completed messages */
@@ -2454,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	spin_lock(&ci->i_ceph_lock);
 	cap->seq = 0;        /* reset cap seq */
 	cap->issue_seq = 0;  /* and issue_seq */
+	cap->mseq = 0;       /* and migrate_seq */
 
 	if (recon_state->flock) {
 		rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -3040,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	fsc->mdsc = mdsc;
 	mutex_init(&mdsc->mutex);
 	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
-	if (mdsc->mdsmap == NULL)
+	if (mdsc->mdsmap == NULL) {
+		kfree(mdsc);
 		return -ENOMEM;
+	}
 
 	init_completion(&mdsc->safe_umount_waiters);
 	init_waitqueue_head(&mdsc->session_close_wq);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 9278dec9e940..132b64eeecd4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -92,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		u32 num_export_targets;
 		void *pexport_targets = NULL;
 		struct ceph_timespec laggy_since;
+		struct ceph_mds_info *info;
 
 		ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
 		global_id = ceph_decode_64(p);
@@ -126,24 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		     i+1, n, global_id, mds, inc,
 		     ceph_pr_addr(&addr.in_addr),
 		     ceph_mds_state_name(state));
-		if (mds >= 0 && mds < m->m_max_mds && state > 0) {
-			m->m_info[mds].global_id = global_id;
-			m->m_info[mds].state = state;
-			m->m_info[mds].addr = addr;
-			m->m_info[mds].laggy =
-				(laggy_since.tv_sec != 0 ||
-				 laggy_since.tv_nsec != 0);
-			m->m_info[mds].num_export_targets = num_export_targets;
-			if (num_export_targets) {
-				m->m_info[mds].export_targets =
-					kcalloc(num_export_targets, sizeof(u32),
-						GFP_NOFS);
-				for (j = 0; j < num_export_targets; j++)
-					m->m_info[mds].export_targets[j] =
-					       ceph_decode_32(&pexport_targets);
-			} else {
-				m->m_info[mds].export_targets = NULL;
-			}
+
+		if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+			continue;
+
+		info = &m->m_info[mds];
+		info->global_id = global_id;
+		info->state = state;
+		info->addr = addr;
+		info->laggy = (laggy_since.tv_sec != 0 ||
+			       laggy_since.tv_nsec != 0);
+		info->num_export_targets = num_export_targets;
+		if (num_export_targets) {
+			info->export_targets = kcalloc(num_export_targets,
+						       sizeof(u32), GFP_NOFS);
+			if (info->export_targets == NULL)
+				goto badmem;
+			for (j = 0; j < num_export_targets; j++)
+				info->export_targets[j] =
+				       ceph_decode_32(&pexport_targets);
+		} else {
+			info->export_targets = NULL;
 		}
 	}
 
@@ -170,7 +174,7 @@ bad:
 		       DUMP_PREFIX_OFFSET, 16, 1,
 		       start, end - start, true);
 	ceph_mdsmap_destroy(m);
-	return ERR_PTR(-EINVAL);
+	return ERR_PTR(err);
 }
 
 void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7d377c9a5e35..6627b26a800c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 	}
 	err = -EINVAL;
 	dev_name_end--;		/* back up to ':' separator */
-	if (*dev_name_end != ':') {
+	if (dev_name_end < dev_name || *dev_name_end != ':') {
 		pr_err("device name is missing path (no : separator in %s)\n",
 		       dev_name);
 		goto out;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7ccfdb4aea2e..cbded572345e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -534,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
 extern void ceph_caps_init(struct ceph_mds_client *mdsc);
 extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
 extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
 			     struct ceph_cap_reservation *ctx, int need);
 extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
 			       struct ceph_cap_reservation *ctx);
@@ -692,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 extern int ceph_inode_holds_cap(struct inode *inode, int mask);
 
 extern int ceph_inode_set_size(struct inode *inode, loff_t size);
-extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
 extern void ceph_queue_vmtruncate(struct inode *inode);
 
 extern void ceph_queue_invalidate(struct inode *inode);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9b6b2b6dd164..be661d8f532a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 	if (!ceph_is_valid_xattr(name))
 		return -ENODATA;
 
-	spin_lock(&ci->i_ceph_lock);
-	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
-	     ci->i_xattrs.version, ci->i_xattrs.index_version);
 
 	/* let's see if a virtual xattr was requested */
 	vxattr = ceph_match_vxattr(inode, name);
 	if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
 		err = vxattr->getxattr_cb(ci, value, size);
-		goto out;
+		return err;
 	}
 
+	spin_lock(&ci->i_ceph_lock);
+	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+	     ci->i_xattrs.version, ci->i_xattrs.index_version);
+
 	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
 	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
 		goto get_xattr;