aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/ceph/caps.c220
-rw-r--r--fs/ceph/inode.c4
-rw-r--r--fs/ceph/super.h4
3 files changed, 146 insertions, 82 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 98f3ca4a5ddf..17543383545c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -555,21 +555,34 @@ retry:
555 cap->ci = ci; 555 cap->ci = ci;
556 __insert_cap_node(ci, cap); 556 __insert_cap_node(ci, cap);
557 557
558 /* clear out old exporting info? (i.e. on cap import) */
559 if (ci->i_cap_exporting_mds == mds) {
560 ci->i_cap_exporting_issued = 0;
561 ci->i_cap_exporting_mseq = 0;
562 ci->i_cap_exporting_mds = -1;
563 }
564
565 /* add to session cap list */ 558 /* add to session cap list */
566 cap->session = session; 559 cap->session = session;
567 spin_lock(&session->s_cap_lock); 560 spin_lock(&session->s_cap_lock);
568 list_add_tail(&cap->session_caps, &session->s_caps); 561 list_add_tail(&cap->session_caps, &session->s_caps);
569 session->s_nr_caps++; 562 session->s_nr_caps++;
570 spin_unlock(&session->s_cap_lock); 563 spin_unlock(&session->s_cap_lock);
571 } else if (new_cap) 564 } else {
572 ceph_put_cap(mdsc, new_cap); 565 if (new_cap)
566 ceph_put_cap(mdsc, new_cap);
567
568 /*
569 * auth mds of the inode changed. we received the cap export
570 * message, but still haven't received the cap import message.
571 * handle_cap_export() updated the new auth MDS' cap.
572 *
573 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
574 * a message that was sent before the cap import message. So
575 * don't remove caps.
576 */
577 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
578 WARN_ON(cap != ci->i_auth_cap);
579 WARN_ON(cap->cap_id != cap_id);
580 seq = cap->seq;
581 mseq = cap->mseq;
582 issued |= cap->issued;
583 flags |= CEPH_CAP_FLAG_AUTH;
584 }
585 }
573 586
574 if (!ci->i_snap_realm) { 587 if (!ci->i_snap_realm) {
575 /* 588 /*
@@ -612,15 +625,8 @@ retry:
612 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) 625 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
613 ci->i_auth_cap = cap; 626 ci->i_auth_cap = cap;
614 ci->i_cap_exporting_issued = 0; 627 ci->i_cap_exporting_issued = 0;
615 } else if (ci->i_auth_cap == cap) { 628 } else {
616 ci->i_auth_cap = NULL; 629 WARN_ON(ci->i_auth_cap == cap);
617 spin_lock(&mdsc->cap_dirty_lock);
618 if (!list_empty(&ci->i_dirty_item)) {
619 dout(" moving %p to cap_dirty_migrating\n", inode);
620 list_move(&ci->i_dirty_item,
621 &mdsc->cap_dirty_migrating);
622 }
623 spin_unlock(&mdsc->cap_dirty_lock);
624 } 630 }
625 631
626 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", 632 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
@@ -889,7 +895,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
889 */ 895 */
890static int __ceph_is_any_caps(struct ceph_inode_info *ci) 896static int __ceph_is_any_caps(struct ceph_inode_info *ci)
891{ 897{
892 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; 898 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued;
893} 899}
894 900
895int ceph_is_any_caps(struct inode *inode) 901int ceph_is_any_caps(struct inode *inode)
@@ -1396,13 +1402,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1396 ci->i_snap_realm->cached_context); 1402 ci->i_snap_realm->cached_context);
1397 dout(" inode %p now dirty snapc %p auth cap %p\n", 1403 dout(" inode %p now dirty snapc %p auth cap %p\n",
1398 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); 1404 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
1405 WARN_ON(!ci->i_auth_cap);
1399 BUG_ON(!list_empty(&ci->i_dirty_item)); 1406 BUG_ON(!list_empty(&ci->i_dirty_item));
1400 spin_lock(&mdsc->cap_dirty_lock); 1407 spin_lock(&mdsc->cap_dirty_lock);
1401 if (ci->i_auth_cap) 1408 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1402 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1403 else
1404 list_add(&ci->i_dirty_item,
1405 &mdsc->cap_dirty_migrating);
1406 spin_unlock(&mdsc->cap_dirty_lock); 1409 spin_unlock(&mdsc->cap_dirty_lock);
1407 if (ci->i_flushing_caps == 0) { 1410 if (ci->i_flushing_caps == 0) {
1408 ihold(inode); 1411 ihold(inode);
@@ -2421,6 +2424,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2421 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2424 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2422 inode->i_size); 2425 inode->i_size);
2423 2426
2427
2428 /*
2429 * auth mds of the inode changed. we received the cap export message,
2430 * but still haven't received the cap import message. handle_cap_export
2431 * updated the new auth MDS' cap.
2432 *
2433 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
2434 * that was sent before the cap import message. So don't remove caps.
2435 */
2436 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
2437 WARN_ON(cap != ci->i_auth_cap);
2438 WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
2439 seq = cap->seq;
2440 newcaps |= cap->issued;
2441 }
2442
2424 /* 2443 /*
2425 * If CACHE is being revoked, and we have no dirty buffers, 2444 * If CACHE is being revoked, and we have no dirty buffers,
2426 * try to invalidate (once). (If there are dirty buffers, we 2445 * try to invalidate (once). (If there are dirty buffers, we
@@ -2447,6 +2466,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2447 issued |= implemented | __ceph_caps_dirty(ci); 2466 issued |= implemented | __ceph_caps_dirty(ci);
2448 2467
2449 cap->cap_gen = session->s_cap_gen; 2468 cap->cap_gen = session->s_cap_gen;
2469 cap->seq = seq;
2450 2470
2451 __check_cap_issue(ci, cap, newcaps); 2471 __check_cap_issue(ci, cap, newcaps);
2452 2472
@@ -2497,6 +2517,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2497 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, 2517 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2498 &atime); 2518 &atime);
2499 2519
2520
2521 /* file layout may have changed */
2522 ci->i_layout = grant->layout;
2523
2500 /* max size increase? */ 2524 /* max size increase? */
2501 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { 2525 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
2502 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); 2526 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
@@ -2525,11 +2549,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2525 check_caps = 1; 2549 check_caps = 1;
2526 } 2550 }
2527 2551
2528 cap->seq = seq;
2529
2530 /* file layout may have changed */
2531 ci->i_layout = grant->layout;
2532
2533 /* revocation, grant, or no-op? */ 2552 /* revocation, grant, or no-op? */
2534 if (cap->issued & ~newcaps) { 2553 if (cap->issued & ~newcaps) {
2535 int revoking = cap->issued & ~newcaps; 2554 int revoking = cap->issued & ~newcaps;
@@ -2755,65 +2774,114 @@ static void handle_cap_trunc(struct inode *inode,
2755 * caller holds s_mutex 2774 * caller holds s_mutex
2756 */ 2775 */
2757static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2776static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2758 struct ceph_mds_session *session, 2777 struct ceph_mds_cap_peer *ph,
2759 int *open_target_sessions) 2778 struct ceph_mds_session *session)
2760{ 2779{
2761 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2780 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2781 struct ceph_mds_session *tsession = NULL;
2782 struct ceph_cap *cap, *tcap;
2762 struct ceph_inode_info *ci = ceph_inode(inode); 2783 struct ceph_inode_info *ci = ceph_inode(inode);
2763 int mds = session->s_mds; 2784 u64 t_cap_id;
2764 unsigned mseq = le32_to_cpu(ex->migrate_seq); 2785 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2765 struct ceph_cap *cap = NULL, *t; 2786 unsigned t_seq, t_mseq;
2766 struct rb_node *p; 2787 int target, issued;
2767 int remember = 1; 2788 int mds = session->s_mds;
2768 2789
2769 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", 2790 if (ph) {
2770 inode, ci, mds, mseq); 2791 t_cap_id = le64_to_cpu(ph->cap_id);
2792 t_seq = le32_to_cpu(ph->seq);
2793 t_mseq = le32_to_cpu(ph->mseq);
2794 target = le32_to_cpu(ph->mds);
2795 } else {
2796 t_cap_id = t_seq = t_mseq = 0;
2797 target = -1;
2798 }
2771 2799
2800 dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
2801 inode, ci, mds, mseq, target);
2802retry:
2772 spin_lock(&ci->i_ceph_lock); 2803 spin_lock(&ci->i_ceph_lock);
2804 cap = __get_cap_for_mds(ci, mds);
2805 if (!cap)
2806 goto out_unlock;
2773 2807
2774 /* make sure we haven't seen a higher mseq */ 2808 if (target < 0) {
2775 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 2809 __ceph_remove_cap(cap, false);
2776 t = rb_entry(p, struct ceph_cap, ci_node); 2810 goto out_unlock;
2777 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2778 dout(" higher mseq on cap from mds%d\n",
2779 t->session->s_mds);
2780 remember = 0;
2781 }
2782 if (t->session->s_mds == mds)
2783 cap = t;
2784 } 2811 }
2785 2812
2786 if (cap) { 2813 /*
2787 if (remember) { 2814 * now we know we haven't received the cap import message yet
2788 /* make note */ 2815 * because the exported cap still exists.
2789 ci->i_cap_exporting_mds = mds; 2816 */
2790 ci->i_cap_exporting_mseq = mseq;
2791 ci->i_cap_exporting_issued = cap->issued;
2792
2793 /*
2794 * make sure we have open sessions with all possible
2795 * export targets, so that we get the matching IMPORT
2796 */
2797 *open_target_sessions = 1;
2798 2817
2799 /* 2818 issued = cap->issued;
2800 * we can't flush dirty caps that we've seen the 2819 WARN_ON(issued != cap->implemented);
2801 * EXPORT but no IMPORT for 2820
2802 */ 2821 tcap = __get_cap_for_mds(ci, target);
2803 spin_lock(&mdsc->cap_dirty_lock); 2822 if (tcap) {
2804 if (!list_empty(&ci->i_dirty_item)) { 2823 /* already have caps from the target */
2805 dout(" moving %p to cap_dirty_migrating\n", 2824 if (tcap->cap_id != t_cap_id ||
2806 inode); 2825 ceph_seq_cmp(tcap->seq, t_seq) < 0) {
2807 list_move(&ci->i_dirty_item, 2826 dout(" updating import cap %p mds%d\n", tcap, target);
2808 &mdsc->cap_dirty_migrating); 2827 tcap->cap_id = t_cap_id;
2828 tcap->seq = t_seq - 1;
2829 tcap->issue_seq = t_seq - 1;
2830 tcap->mseq = t_mseq;
2831 tcap->issued |= issued;
2832 tcap->implemented |= issued;
2833 if (cap == ci->i_auth_cap)
2834 ci->i_auth_cap = tcap;
2835 if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
2836 spin_lock(&mdsc->cap_dirty_lock);
2837 list_move_tail(&ci->i_flushing_item,
2838 &tcap->session->s_cap_flushing);
2839 spin_unlock(&mdsc->cap_dirty_lock);
2809 } 2840 }
2810 spin_unlock(&mdsc->cap_dirty_lock);
2811 } 2841 }
2812 __ceph_remove_cap(cap, false); 2842 __ceph_remove_cap(cap, false);
2843 goto out_unlock;
2844 }
2845
2846 if (tsession) {
2847 int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
2848 spin_unlock(&ci->i_ceph_lock);
2849 /* add placeholder for the export target */
2850 ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
2851 t_seq - 1, t_mseq, (u64)-1, flag, NULL);
2852 goto retry;
2813 } 2853 }
2814 /* else, we already released it */
2815 2854
2816 spin_unlock(&ci->i_ceph_lock); 2855 spin_unlock(&ci->i_ceph_lock);
2856 mutex_unlock(&session->s_mutex);
2857
2858 /* open target session */
2859 tsession = ceph_mdsc_open_export_target_session(mdsc, target);
2860 if (!IS_ERR(tsession)) {
2861 if (mds > target) {
2862 mutex_lock(&session->s_mutex);
2863 mutex_lock_nested(&tsession->s_mutex,
2864 SINGLE_DEPTH_NESTING);
2865 } else {
2866 mutex_lock(&tsession->s_mutex);
2867 mutex_lock_nested(&session->s_mutex,
2868 SINGLE_DEPTH_NESTING);
2869 }
2870 ceph_add_cap_releases(mdsc, tsession);
2871 } else {
2872 WARN_ON(1);
2873 tsession = NULL;
2874 target = -1;
2875 }
2876 goto retry;
2877
2878out_unlock:
2879 spin_unlock(&ci->i_ceph_lock);
2880 mutex_unlock(&session->s_mutex);
2881 if (tsession) {
2882 mutex_unlock(&tsession->s_mutex);
2883 ceph_put_mds_session(tsession);
2884 }
2817} 2885}
2818 2886
2819/* 2887/*
@@ -2915,7 +2983,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2915 void *flock; 2983 void *flock;
2916 void *end; 2984 void *end;
2917 u32 flock_len; 2985 u32 flock_len;
2918 int open_target_sessions = 0;
2919 2986
2920 dout("handle_caps from mds%d\n", mds); 2987 dout("handle_caps from mds%d\n", mds);
2921 2988
@@ -2954,6 +3021,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2954 if (p + sizeof(*peer) > end) 3021 if (p + sizeof(*peer) > end)
2955 goto bad; 3022 goto bad;
2956 peer = p; 3023 peer = p;
3024 } else if (op == CEPH_CAP_OP_EXPORT) {
3025 /* recorded in unused fields */
3026 peer = (void *)&h->size;
2957 } 3027 }
2958 } 3028 }
2959 3029
@@ -2989,8 +3059,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2989 goto done; 3059 goto done;
2990 3060
2991 case CEPH_CAP_OP_EXPORT: 3061 case CEPH_CAP_OP_EXPORT:
2992 handle_cap_export(inode, h, session, &open_target_sessions); 3062 handle_cap_export(inode, h, peer, session);
2993 goto done; 3063 goto done_unlocked;
2994 3064
2995 case CEPH_CAP_OP_IMPORT: 3065 case CEPH_CAP_OP_IMPORT:
2996 handle_cap_import(mdsc, inode, h, peer, session, 3066 handle_cap_import(mdsc, inode, h, peer, session,
@@ -3045,8 +3115,6 @@ done:
3045done_unlocked: 3115done_unlocked:
3046 if (inode) 3116 if (inode)
3047 iput(inode); 3117 iput(inode);
3048 if (open_target_sessions)
3049 ceph_mdsc_open_export_target_sessions(mdsc, session);
3050 return; 3118 return;
3051 3119
3052bad: 3120bad:
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 3db97ba15a06..6fc10a7d7c59 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -336,12 +336,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
336 ci->i_hold_caps_min = 0; 336 ci->i_hold_caps_min = 0;
337 ci->i_hold_caps_max = 0; 337 ci->i_hold_caps_max = 0;
338 INIT_LIST_HEAD(&ci->i_cap_delay_list); 338 INIT_LIST_HEAD(&ci->i_cap_delay_list);
339 ci->i_cap_exporting_mds = 0;
340 ci->i_cap_exporting_mseq = 0;
341 ci->i_cap_exporting_issued = 0;
342 INIT_LIST_HEAD(&ci->i_cap_snaps); 339 INIT_LIST_HEAD(&ci->i_cap_snaps);
343 ci->i_head_snapc = NULL; 340 ci->i_head_snapc = NULL;
344 ci->i_snap_caps = 0; 341 ci->i_snap_caps = 0;
342 ci->i_cap_exporting_issued = 0;
345 343
346 for (i = 0; i < CEPH_FILE_MODE_NUM; i++) 344 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
347 ci->i_nr_by_mode[i] = 0; 345 ci->i_nr_by_mode[i] = 0;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a6ba32fb0d49..c299f7d19bf3 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -287,14 +287,12 @@ struct ceph_inode_info {
287 unsigned long i_hold_caps_min; /* jiffies */ 287 unsigned long i_hold_caps_min; /* jiffies */
288 unsigned long i_hold_caps_max; /* jiffies */ 288 unsigned long i_hold_caps_max; /* jiffies */
289 struct list_head i_cap_delay_list; /* for delayed cap release to mds */ 289 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
290 int i_cap_exporting_mds; /* to handle cap migration between */
291 unsigned i_cap_exporting_mseq; /* mds's. */
292 unsigned i_cap_exporting_issued;
293 struct ceph_cap_reservation i_cap_migration_resv; 290 struct ceph_cap_reservation i_cap_migration_resv;
294 struct list_head i_cap_snaps; /* snapped state pending flush to mds */ 291 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
295 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or 292 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
296 dirty|flushing caps */ 293 dirty|flushing caps */
297 unsigned i_snap_caps; /* cap bits for snapped files */ 294 unsigned i_snap_caps; /* cap bits for snapped files */
295 unsigned i_cap_exporting_issued;
298 296
299 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 297 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
300 298