aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Yan, Zheng <zyan@redhat.com> 2016-07-07 03:22:38 -0400
committer Ilya Dryomov <idryomov@gmail.com> 2016-07-27 21:00:45 -0400
commit c8799fc4674fe5bb9b9391f9eac202250b8370e1 (patch)
tree 2033af32e00355b5f63343e9e0bd374f40b5c76c
parent ed9b430c9ba99e70e8ddd7e08429c4c2a620ba74 (diff)
ceph: optimize cap flush waiting
Add a 'wake' flag to the ceph_cap_flush struct, which indicates whether someone is waiting for it to finish. When we get a flush ack message, we check the 'wake' flag in the corresponding ceph_cap_flush struct to decide whether we should wake up waiters. One corner case is that the acked cap flush has its 'wake' flag set, but it is not the first one on the flushing list. We do not wake up waiters in this case; instead, we set the 'wake' flag of the preceding ceph_cap_flush struct. Signed-off-by: Yan, Zheng <zyan@redhat.com>
-rw-r--r-- fs/ceph/caps.c 91
-rw-r--r-- fs/ceph/mds_client.c 8
-rw-r--r-- fs/ceph/super.h 1
3 files changed, 73 insertions, 27 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 736e1c86bcf3..99115cae1652 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1474,13 +1474,44 @@ static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
1474} 1474}
1475 1475
1476/* 1476/*
1477 * Remove cap_flush from the mdsc's or inode's flushing cap list.
1478 * Return true if caller needs to wake up flush waiters.
1479 */
1480static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
1481 struct ceph_inode_info *ci,
1482 struct ceph_cap_flush *cf)
1483{
1484 struct ceph_cap_flush *prev;
1485 bool wake = cf->wake;
1486 if (mdsc) {
1487 /* are there older pending cap flushes? */
1488 if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
1489 prev = list_prev_entry(cf, g_list);
1490 prev->wake = true;
1491 wake = false;
1492 }
1493 list_del(&cf->g_list);
1494 } else if (ci) {
1495 if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
1496 prev = list_prev_entry(cf, i_list);
1497 prev->wake = true;
1498 wake = false;
1499 }
1500 list_del(&cf->i_list);
1501 } else {
1502 BUG_ON(1);
1503 }
1504 return wake;
1505}
1506
1507/*
1477 * Add dirty inode to the flushing list. Assigned a seq number so we 1508 * Add dirty inode to the flushing list. Assigned a seq number so we
1478 * can wait for caps to flush without starving. 1509 * can wait for caps to flush without starving.
1479 * 1510 *
1480 * Called under i_ceph_lock. 1511 * Called under i_ceph_lock.
1481 */ 1512 */
1482static int __mark_caps_flushing(struct inode *inode, 1513static int __mark_caps_flushing(struct inode *inode,
1483 struct ceph_mds_session *session, 1514 struct ceph_mds_session *session, bool wake,
1484 u64 *flush_tid, u64 *oldest_flush_tid) 1515 u64 *flush_tid, u64 *oldest_flush_tid)
1485{ 1516{
1486 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1517 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1503,6 +1534,7 @@ static int __mark_caps_flushing(struct inode *inode,
1503 1534
1504 swap(cf, ci->i_prealloc_cap_flush); 1535 swap(cf, ci->i_prealloc_cap_flush);
1505 cf->caps = flushing; 1536 cf->caps = flushing;
1537 cf->wake = wake;
1506 1538
1507 spin_lock(&mdsc->cap_dirty_lock); 1539 spin_lock(&mdsc->cap_dirty_lock);
1508 list_del_init(&ci->i_dirty_item); 1540 list_del_init(&ci->i_dirty_item);
@@ -1808,7 +1840,7 @@ ack:
1808 } 1840 }
1809 1841
1810 if (cap == ci->i_auth_cap && ci->i_dirty_caps) { 1842 if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
1811 flushing = __mark_caps_flushing(inode, session, 1843 flushing = __mark_caps_flushing(inode, session, false,
1812 &flush_tid, 1844 &flush_tid,
1813 &oldest_flush_tid); 1845 &oldest_flush_tid);
1814 } else { 1846 } else {
@@ -1885,8 +1917,8 @@ retry:
1885 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) 1917 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1886 goto out; 1918 goto out;
1887 1919
1888 flushing = __mark_caps_flushing(inode, session, &flush_tid, 1920 flushing = __mark_caps_flushing(inode, session, true,
1889 &oldest_flush_tid); 1921 &flush_tid, &oldest_flush_tid);
1890 1922
1891 /* __send_cap drops i_ceph_lock */ 1923 /* __send_cap drops i_ceph_lock */
1892 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, 1924 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
@@ -1902,7 +1934,8 @@ retry:
1902 if (!list_empty(&ci->i_cap_flush_list)) { 1934 if (!list_empty(&ci->i_cap_flush_list)) {
1903 struct ceph_cap_flush *cf = 1935 struct ceph_cap_flush *cf =
1904 list_last_entry(&ci->i_cap_flush_list, 1936 list_last_entry(&ci->i_cap_flush_list,
1905 struct ceph_cap_flush, i_list); 1937 struct ceph_cap_flush, i_list);
1938 cf->wake = true;
1906 flush_tid = cf->tid; 1939 flush_tid = cf->tid;
1907 } 1940 }
1908 flushing = ci->i_flushing_caps; 1941 flushing = ci->i_flushing_caps;
@@ -3022,7 +3055,9 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
3022 unsigned seq = le32_to_cpu(m->seq); 3055 unsigned seq = le32_to_cpu(m->seq);
3023 int dirty = le32_to_cpu(m->dirty); 3056 int dirty = le32_to_cpu(m->dirty);
3024 int cleaned = 0; 3057 int cleaned = 0;
3025 int drop = 0; 3058 bool drop = false;
3059 bool wake_ci = 0;
3060 bool wake_mdsc = 0;
3026 3061
3027 list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { 3062 list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
3028 if (cf->tid == flush_tid) 3063 if (cf->tid == flush_tid)
@@ -3030,7 +3065,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
3030 if (cf->caps == 0) /* capsnap */ 3065 if (cf->caps == 0) /* capsnap */
3031 continue; 3066 continue;
3032 if (cf->tid <= flush_tid) { 3067 if (cf->tid <= flush_tid) {
3033 list_del(&cf->i_list); 3068 if (__finish_cap_flush(NULL, ci, cf))
3069 wake_ci = true;
3034 list_add_tail(&cf->i_list, &to_remove); 3070 list_add_tail(&cf->i_list, &to_remove);
3035 } else { 3071 } else {
3036 cleaned &= ~cf->caps; 3072 cleaned &= ~cf->caps;
@@ -3052,14 +3088,9 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
3052 3088
3053 spin_lock(&mdsc->cap_dirty_lock); 3089 spin_lock(&mdsc->cap_dirty_lock);
3054 3090
3055 if (!list_empty(&to_remove)) { 3091 list_for_each_entry(cf, &to_remove, i_list) {
3056 u64 oldest_flush_tid; 3092 if (__finish_cap_flush(mdsc, NULL, cf))
3057 list_for_each_entry(cf, &to_remove, i_list) 3093 wake_mdsc = true;
3058 list_del(&cf->g_list);
3059
3060 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
3061 if (oldest_flush_tid == 0 || oldest_flush_tid > flush_tid)
3062 wake_up_all(&mdsc->cap_flushing_wq);
3063 } 3094 }
3064 3095
3065 if (ci->i_flushing_caps == 0) { 3096 if (ci->i_flushing_caps == 0) {
@@ -3079,7 +3110,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
3079 if (ci->i_dirty_caps == 0) { 3110 if (ci->i_dirty_caps == 0) {
3080 dout(" inode %p now clean\n", inode); 3111 dout(" inode %p now clean\n", inode);
3081 BUG_ON(!list_empty(&ci->i_dirty_item)); 3112 BUG_ON(!list_empty(&ci->i_dirty_item));
3082 drop = 1; 3113 drop = true;
3083 if (ci->i_wr_ref == 0 && 3114 if (ci->i_wr_ref == 0 &&
3084 ci->i_wrbuffer_ref_head == 0) { 3115 ci->i_wrbuffer_ref_head == 0) {
3085 BUG_ON(!ci->i_head_snapc); 3116 BUG_ON(!ci->i_head_snapc);
@@ -3091,7 +3122,6 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
3091 } 3122 }
3092 } 3123 }
3093 spin_unlock(&mdsc->cap_dirty_lock); 3124 spin_unlock(&mdsc->cap_dirty_lock);
3094 wake_up_all(&ci->i_cap_wq);
3095 3125
3096out: 3126out:
3097 spin_unlock(&ci->i_ceph_lock); 3127 spin_unlock(&ci->i_ceph_lock);
@@ -3102,6 +3132,11 @@ out:
3102 list_del(&cf->i_list); 3132 list_del(&cf->i_list);
3103 ceph_free_cap_flush(cf); 3133 ceph_free_cap_flush(cf);
3104 } 3134 }
3135
3136 if (wake_ci)
3137 wake_up_all(&ci->i_cap_wq);
3138 if (wake_mdsc)
3139 wake_up_all(&mdsc->cap_flushing_wq);
3105 if (drop) 3140 if (drop)
3106 iput(inode); 3141 iput(inode);
3107} 3142}
@@ -3120,7 +3155,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
3120 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 3155 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
3121 u64 follows = le64_to_cpu(m->snap_follows); 3156 u64 follows = le64_to_cpu(m->snap_follows);
3122 struct ceph_cap_snap *capsnap; 3157 struct ceph_cap_snap *capsnap;
3123 int flushed = 0; 3158 bool flushed = false;
3159 bool wake_ci = false;
3160 bool wake_mdsc = false;
3124 3161
3125 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", 3162 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
3126 inode, ci, session->s_mds, follows); 3163 inode, ci, session->s_mds, follows);
@@ -3134,7 +3171,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
3134 flush_tid, capsnap->cap_flush.tid); 3171 flush_tid, capsnap->cap_flush.tid);
3135 break; 3172 break;
3136 } 3173 }
3137 flushed = 1; 3174 flushed = true;
3138 break; 3175 break;
3139 } else { 3176 } else {
3140 dout(" skipping cap_snap %p follows %lld\n", 3177 dout(" skipping cap_snap %p follows %lld\n",
@@ -3142,31 +3179,31 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
3142 } 3179 }
3143 } 3180 }
3144 if (flushed) { 3181 if (flushed) {
3145 u64 oldest_flush_tid;
3146 WARN_ON(capsnap->dirty_pages || capsnap->writing); 3182 WARN_ON(capsnap->dirty_pages || capsnap->writing);
3147 dout(" removing %p cap_snap %p follows %lld\n", 3183 dout(" removing %p cap_snap %p follows %lld\n",
3148 inode, capsnap, follows); 3184 inode, capsnap, follows);
3149 list_del(&capsnap->ci_item); 3185 list_del(&capsnap->ci_item);
3150 list_del(&capsnap->cap_flush.i_list); 3186 if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
3187 wake_ci = true;
3151 3188
3152 spin_lock(&mdsc->cap_dirty_lock); 3189 spin_lock(&mdsc->cap_dirty_lock);
3153 3190
3154 if (list_empty(&ci->i_cap_flush_list)) 3191 if (list_empty(&ci->i_cap_flush_list))
3155 list_del_init(&ci->i_flushing_item); 3192 list_del_init(&ci->i_flushing_item);
3156 3193
3157 list_del(&capsnap->cap_flush.g_list); 3194 if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
3158 3195 wake_mdsc = true;
3159 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
3160 if (oldest_flush_tid == 0 || oldest_flush_tid > flush_tid)
3161 wake_up_all(&mdsc->cap_flushing_wq);
3162 3196
3163 spin_unlock(&mdsc->cap_dirty_lock); 3197 spin_unlock(&mdsc->cap_dirty_lock);
3164 wake_up_all(&ci->i_cap_wq);
3165 } 3198 }
3166 spin_unlock(&ci->i_ceph_lock); 3199 spin_unlock(&ci->i_ceph_lock);
3167 if (flushed) { 3200 if (flushed) {
3168 ceph_put_snap_context(capsnap->context); 3201 ceph_put_snap_context(capsnap->context);
3169 ceph_put_cap_snap(capsnap); 3202 ceph_put_cap_snap(capsnap);
3203 if (wake_ci)
3204 wake_up_all(&ci->i_cap_wq);
3205 if (wake_mdsc)
3206 wake_up_all(&mdsc->cap_flushing_wq);
3170 iput(inode); 3207 iput(inode);
3171 } 3208 }
3172} 3209}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fa9036af5445..cdc6a17f5867 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1212,6 +1212,8 @@ static void remove_session_caps(struct ceph_mds_session *session)
1212 dout("remove_session_caps on %p\n", session); 1212 dout("remove_session_caps on %p\n", session);
1213 iterate_session_caps(session, remove_session_caps_cb, fsc); 1213 iterate_session_caps(session, remove_session_caps_cb, fsc);
1214 1214
1215 wake_up_all(&fsc->mdsc->cap_flushing_wq);
1216
1215 spin_lock(&session->s_cap_lock); 1217 spin_lock(&session->s_cap_lock);
1216 if (session->s_nr_caps > 0) { 1218 if (session->s_nr_caps > 0) {
1217 struct inode *inode; 1219 struct inode *inode;
@@ -3536,6 +3538,12 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3536 ceph_flush_dirty_caps(mdsc); 3538 ceph_flush_dirty_caps(mdsc);
3537 spin_lock(&mdsc->cap_dirty_lock); 3539 spin_lock(&mdsc->cap_dirty_lock);
3538 want_flush = mdsc->last_cap_flush_tid; 3540 want_flush = mdsc->last_cap_flush_tid;
3541 if (!list_empty(&mdsc->cap_flush_list)) {
3542 struct ceph_cap_flush *cf =
3543 list_last_entry(&mdsc->cap_flush_list,
3544 struct ceph_cap_flush, g_list);
3545 cf->wake = true;
3546 }
3539 spin_unlock(&mdsc->cap_dirty_lock); 3547 spin_unlock(&mdsc->cap_dirty_lock);
3540 3548
3541 dout("sync want tid %lld flush_seq %lld\n", 3549 dout("sync want tid %lld flush_seq %lld\n",
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b097d474f888..3e3fa9163059 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -150,6 +150,7 @@ struct ceph_cap {
150struct ceph_cap_flush { 150struct ceph_cap_flush {
151 u64 tid; 151 u64 tid;
152 int caps; /* 0 means capsnap */ 152 int caps; /* 0 means capsnap */
153 bool wake; /* wake up flush waiters when finish ? */
153 struct list_head g_list; // global 154 struct list_head g_list; // global
154 struct list_head i_list; // per inode 155 struct list_head i_list; // per inode
155}; 156};