about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Yan, Zheng <zyan@redhat.com> 2016-07-04 06:06:41 -0400
committer: Ilya Dryomov <idryomov@gmail.com> 2016-07-27 21:00:42 -0400
commit: 0e2943878942aee7100c94d0d40c49087dac12cb (patch)
tree: 8611ba2813ab13fd5a36a70dbd5005b29818d40e
parent: e4500b5e35c213e0f97be7cb69328c0877203a79 (diff)
ceph: unify cap flush and snapcap flush
This patch includes the following changes:
- Assign a flush tid to each snapcap flush.
- Remove the session's s_cap_snaps_flushing list and add the inode to the session's s_cap_flushing list instead. The inode is removed from the list when there is no pending snapcap flush or cap flush.
- Make __kick_flushing_caps() re-send both snapcap flushes and cap flushes.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
-rw-r--r--  fs/ceph/caps.c        291
-rw-r--r--  fs/ceph/mds_client.c   77
-rw-r--r--  fs/ceph/mds_client.h    1
-rw-r--r--  fs/ceph/snap.c          4
-rw-r--r--  fs/ceph/super.h        24
5 files changed, 175 insertions, 222 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index e0efa75a1b98..0ac604719663 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -40,6 +40,7 @@
40 * cluster to release server state. 40 * cluster to release server state.
41 */ 41 */
42 42
43static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
43 44
44/* 45/*
45 * Generate readable cap strings for debugging output. 46 * Generate readable cap strings for debugging output.
@@ -1217,6 +1218,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1217 return delayed; 1218 return delayed;
1218} 1219}
1219 1220
1221static inline int __send_flush_snap(struct inode *inode,
1222 struct ceph_mds_session *session,
1223 struct ceph_cap_snap *capsnap,
1224 u32 mseq, u64 oldest_flush_tid)
1225{
1226 return send_cap_msg(session, ceph_vino(inode).ino, 0,
1227 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1228 capsnap->dirty, 0, capsnap->cap_flush.tid,
1229 oldest_flush_tid, 0, mseq, capsnap->size, 0,
1230 &capsnap->mtime, &capsnap->atime,
1231 &capsnap->ctime, capsnap->time_warp_seq,
1232 capsnap->uid, capsnap->gid, capsnap->mode,
1233 capsnap->xattr_version, capsnap->xattr_blob,
1234 capsnap->follows, capsnap->inline_data);
1235}
1236
1220/* 1237/*
1221 * When a snapshot is taken, clients accumulate dirty metadata on 1238 * When a snapshot is taken, clients accumulate dirty metadata on
1222 * inodes with capabilities in ceph_cap_snaps to describe the file 1239 * inodes with capabilities in ceph_cap_snaps to describe the file
@@ -1224,14 +1241,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1224 * asynchronously back to the MDS once sync writes complete and dirty 1241 * asynchronously back to the MDS once sync writes complete and dirty
1225 * data is written out. 1242 * data is written out.
1226 * 1243 *
1227 * Unless @kick is true, skip cap_snaps that were already sent to
1228 * the MDS (i.e., during this session).
1229 *
1230 * Called under i_ceph_lock. Takes s_mutex as needed. 1244 * Called under i_ceph_lock. Takes s_mutex as needed.
1231 */ 1245 */
1232void __ceph_flush_snaps(struct ceph_inode_info *ci, 1246void __ceph_flush_snaps(struct ceph_inode_info *ci,
1233 struct ceph_mds_session **psession, 1247 struct ceph_mds_session **psession)
1234 int kick)
1235 __releases(ci->i_ceph_lock) 1248 __releases(ci->i_ceph_lock)
1236 __acquires(ci->i_ceph_lock) 1249 __acquires(ci->i_ceph_lock)
1237{ 1250{
@@ -1242,6 +1255,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
1242 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1255 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1243 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold 1256 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1244 session->s_mutex */ 1257 session->s_mutex */
1258 u64 oldest_flush_tid;
1245 u64 next_follows = 0; /* keep track of how far we've gotten through the 1259 u64 next_follows = 0; /* keep track of how far we've gotten through the
1246 i_cap_snaps list, and skip these entries next time 1260 i_cap_snaps list, and skip these entries next time
1247 around to avoid an infinite loop */ 1261 around to avoid an infinite loop */
@@ -1272,7 +1286,7 @@ retry:
1272 } 1286 }
1273 1287
1274 /* only flush each capsnap once */ 1288 /* only flush each capsnap once */
1275 if (!kick && !list_empty(&capsnap->flushing_item)) { 1289 if (capsnap->cap_flush.tid > 0) {
1276 dout("already flushed %p, skipping\n", capsnap); 1290 dout("already flushed %p, skipping\n", capsnap);
1277 continue; 1291 continue;
1278 } 1292 }
@@ -1282,8 +1296,6 @@ retry:
1282 1296
1283 if (session && session->s_mds != mds) { 1297 if (session && session->s_mds != mds) {
1284 dout("oops, wrong session %p mutex\n", session); 1298 dout("oops, wrong session %p mutex\n", session);
1285 if (kick)
1286 goto out;
1287 1299
1288 mutex_unlock(&session->s_mutex); 1300 mutex_unlock(&session->s_mutex);
1289 ceph_put_mds_session(session); 1301 ceph_put_mds_session(session);
@@ -1309,26 +1321,27 @@ retry:
1309 } 1321 }
1310 1322
1311 spin_lock(&mdsc->cap_dirty_lock); 1323 spin_lock(&mdsc->cap_dirty_lock);
1312 capsnap->flush_tid = ++mdsc->last_cap_flush_tid; 1324 capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
1325 list_add_tail(&capsnap->cap_flush.g_list,
1326 &mdsc->cap_flush_list);
1327 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1328
1329 if (list_empty(&ci->i_flushing_item)) {
1330 list_add_tail(&ci->i_flushing_item,
1331 &session->s_cap_flushing);
1332 }
1313 spin_unlock(&mdsc->cap_dirty_lock); 1333 spin_unlock(&mdsc->cap_dirty_lock);
1314 1334
1335 list_add_tail(&capsnap->cap_flush.i_list,
1336 &ci->i_cap_flush_list);
1337
1315 atomic_inc(&capsnap->nref); 1338 atomic_inc(&capsnap->nref);
1316 if (list_empty(&capsnap->flushing_item))
1317 list_add_tail(&capsnap->flushing_item,
1318 &session->s_cap_snaps_flushing);
1319 spin_unlock(&ci->i_ceph_lock); 1339 spin_unlock(&ci->i_ceph_lock);
1320 1340
1321 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", 1341 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
1322 inode, capsnap, capsnap->follows, capsnap->flush_tid); 1342 inode, capsnap, capsnap->follows, capsnap->cap_flush.tid);
1323 send_cap_msg(session, ceph_vino(inode).ino, 0, 1343 __send_flush_snap(inode, session, capsnap, mseq,
1324 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, 1344 oldest_flush_tid);
1325 capsnap->dirty, 0, capsnap->flush_tid, 0,
1326 0, mseq, capsnap->size, 0,
1327 &capsnap->mtime, &capsnap->atime,
1328 &capsnap->ctime, capsnap->time_warp_seq,
1329 capsnap->uid, capsnap->gid, capsnap->mode,
1330 capsnap->xattr_version, capsnap->xattr_blob,
1331 capsnap->follows, capsnap->inline_data);
1332 1345
1333 next_follows = capsnap->follows + 1; 1346 next_follows = capsnap->follows + 1;
1334 ceph_put_cap_snap(capsnap); 1347 ceph_put_cap_snap(capsnap);
@@ -1354,7 +1367,7 @@ out:
1354static void ceph_flush_snaps(struct ceph_inode_info *ci) 1367static void ceph_flush_snaps(struct ceph_inode_info *ci)
1355{ 1368{
1356 spin_lock(&ci->i_ceph_lock); 1369 spin_lock(&ci->i_ceph_lock);
1357 __ceph_flush_snaps(ci, NULL, 0); 1370 __ceph_flush_snaps(ci, NULL);
1358 spin_unlock(&ci->i_ceph_lock); 1371 spin_unlock(&ci->i_ceph_lock);
1359} 1372}
1360 1373
@@ -1476,11 +1489,6 @@ static int __mark_caps_flushing(struct inode *inode,
1476 if (list_empty(&ci->i_flushing_item)) { 1489 if (list_empty(&ci->i_flushing_item)) {
1477 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1490 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1478 mdsc->num_cap_flushing++; 1491 mdsc->num_cap_flushing++;
1479 dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
1480 } else {
1481 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1482 dout(" inode %p now flushing (more) tid %llu\n",
1483 inode, cf->tid);
1484 } 1492 }
1485 spin_unlock(&mdsc->cap_dirty_lock); 1493 spin_unlock(&mdsc->cap_dirty_lock);
1486 1494
@@ -1556,7 +1564,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1556 1564
1557 /* flush snaps first time around only */ 1565 /* flush snaps first time around only */
1558 if (!list_empty(&ci->i_cap_snaps)) 1566 if (!list_empty(&ci->i_cap_snaps))
1559 __ceph_flush_snaps(ci, &session, 0); 1567 __ceph_flush_snaps(ci, &session);
1560 goto retry_locked; 1568 goto retry_locked;
1561retry: 1569retry:
1562 spin_lock(&ci->i_ceph_lock); 1570 spin_lock(&ci->i_ceph_lock);
@@ -1997,80 +2005,74 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1997 return err; 2005 return err;
1998} 2006}
1999 2007
2000/* 2008static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
2001 * After a recovering MDS goes active, we need to resend any caps 2009 struct ceph_mds_session *session,
2002 * we were flushing. 2010 struct ceph_inode_info *ci,
2003 * 2011 u64 oldest_flush_tid)
2004 * Caller holds session->s_mutex. 2012 __releases(ci->i_ceph_lock)
2005 */ 2013 __acquires(ci->i_ceph_lock)
2006static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
2007 struct ceph_mds_session *session)
2008{
2009 struct ceph_cap_snap *capsnap;
2010
2011 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
2012 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
2013 flushing_item) {
2014 struct ceph_inode_info *ci = capsnap->ci;
2015 struct inode *inode = &ci->vfs_inode;
2016 struct ceph_cap *cap;
2017
2018 spin_lock(&ci->i_ceph_lock);
2019 cap = ci->i_auth_cap;
2020 if (cap && cap->session == session) {
2021 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
2022 cap, capsnap);
2023 __ceph_flush_snaps(ci, &session, 1);
2024 } else {
2025 pr_err("%p auth cap %p not mds%d ???\n", inode,
2026 cap, session->s_mds);
2027 }
2028 spin_unlock(&ci->i_ceph_lock);
2029 }
2030}
2031
2032static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
2033 struct ceph_mds_session *session,
2034 struct ceph_inode_info *ci)
2035{ 2014{
2036 struct inode *inode = &ci->vfs_inode; 2015 struct inode *inode = &ci->vfs_inode;
2037 struct ceph_cap *cap; 2016 struct ceph_cap *cap;
2038 struct ceph_cap_flush *cf; 2017 struct ceph_cap_flush *cf;
2039 int delayed = 0; 2018 int ret;
2040 u64 first_tid = 0; 2019 u64 first_tid = 0;
2041 u64 oldest_flush_tid;
2042
2043 spin_lock(&mdsc->cap_dirty_lock);
2044 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2045 spin_unlock(&mdsc->cap_dirty_lock);
2046 2020
2047 spin_lock(&ci->i_ceph_lock);
2048 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { 2021 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
2049 if (cf->tid < first_tid) 2022 if (cf->tid < first_tid)
2050 continue; 2023 continue;
2051 2024
2052 cap = ci->i_auth_cap; 2025 cap = ci->i_auth_cap;
2053 if (!(cap && cap->session == session)) { 2026 if (!(cap && cap->session == session)) {
2054 pr_err("%p auth cap %p not mds%d ???\n", inode, 2027 pr_err("%p auth cap %p not mds%d ???\n",
2055 cap, session->s_mds); 2028 inode, cap, session->s_mds);
2056 spin_unlock(&ci->i_ceph_lock);
2057 break; 2029 break;
2058 } 2030 }
2059 2031
2060 first_tid = cf->tid + 1; 2032 first_tid = cf->tid + 1;
2061 2033
2062 dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, 2034 if (cf->caps) {
2063 cap, cf->tid, ceph_cap_string(cf->caps)); 2035 dout("kick_flushing_caps %p cap %p tid %llu %s\n",
2064 delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, 2036 inode, cap, cf->tid, ceph_cap_string(cf->caps));
2065 __ceph_caps_used(ci), 2037 ci->i_ceph_flags |= CEPH_I_NODELAY;
2066 __ceph_caps_wanted(ci), 2038 ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
2067 cap->issued | cap->implemented, 2039 __ceph_caps_used(ci),
2068 cf->caps, cf->tid, oldest_flush_tid); 2040 __ceph_caps_wanted(ci),
2041 cap->issued | cap->implemented,
2042 cf->caps, cf->tid, oldest_flush_tid);
2043 if (ret) {
2044 pr_err("kick_flushing_caps: error sending "
2045 "cap flush, ino (%llx.%llx) "
2046 "tid %llu flushing %s\n",
2047 ceph_vinop(inode), cf->tid,
2048 ceph_cap_string(cf->caps));
2049 }
2050 } else {
2051 struct ceph_cap_snap *capsnap =
2052 container_of(cf, struct ceph_cap_snap,
2053 cap_flush);
2054 dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
2055 inode, capsnap, cf->tid,
2056 ceph_cap_string(capsnap->dirty));
2057
2058 atomic_inc(&capsnap->nref);
2059 spin_unlock(&ci->i_ceph_lock);
2060
2061 ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
2062 oldest_flush_tid);
2063 if (ret < 0) {
2064 pr_err("kick_flushing_caps: error sending "
2065 "cap flushsnap, ino (%llx.%llx) "
2066 "tid %llu follows %llu\n",
2067 ceph_vinop(inode), cf->tid,
2068 capsnap->follows);
2069 }
2070
2071 ceph_put_cap_snap(capsnap);
2072 }
2069 2073
2070 spin_lock(&ci->i_ceph_lock); 2074 spin_lock(&ci->i_ceph_lock);
2071 } 2075 }
2072 spin_unlock(&ci->i_ceph_lock);
2073 return delayed;
2074} 2076}
2075 2077
2076void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, 2078void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
@@ -2078,8 +2080,14 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
2078{ 2080{
2079 struct ceph_inode_info *ci; 2081 struct ceph_inode_info *ci;
2080 struct ceph_cap *cap; 2082 struct ceph_cap *cap;
2083 u64 oldest_flush_tid;
2081 2084
2082 dout("early_kick_flushing_caps mds%d\n", session->s_mds); 2085 dout("early_kick_flushing_caps mds%d\n", session->s_mds);
2086
2087 spin_lock(&mdsc->cap_dirty_lock);
2088 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2089 spin_unlock(&mdsc->cap_dirty_lock);
2090
2083 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { 2091 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2084 spin_lock(&ci->i_ceph_lock); 2092 spin_lock(&ci->i_ceph_lock);
2085 cap = ci->i_auth_cap; 2093 cap = ci->i_auth_cap;
@@ -2099,10 +2107,8 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
2099 */ 2107 */
2100 if ((cap->issued & ci->i_flushing_caps) != 2108 if ((cap->issued & ci->i_flushing_caps) !=
2101 ci->i_flushing_caps) { 2109 ci->i_flushing_caps) {
2102 spin_unlock(&ci->i_ceph_lock); 2110 __kick_flushing_caps(mdsc, session, ci,
2103 if (!__kick_flushing_caps(mdsc, session, ci)) 2111 oldest_flush_tid);
2104 continue;
2105 spin_lock(&ci->i_ceph_lock);
2106 } 2112 }
2107 2113
2108 spin_unlock(&ci->i_ceph_lock); 2114 spin_unlock(&ci->i_ceph_lock);
@@ -2113,50 +2119,43 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
2113 struct ceph_mds_session *session) 2119 struct ceph_mds_session *session)
2114{ 2120{
2115 struct ceph_inode_info *ci; 2121 struct ceph_inode_info *ci;
2116 2122 u64 oldest_flush_tid;
2117 kick_flushing_capsnaps(mdsc, session);
2118 2123
2119 dout("kick_flushing_caps mds%d\n", session->s_mds); 2124 dout("kick_flushing_caps mds%d\n", session->s_mds);
2125
2126 spin_lock(&mdsc->cap_dirty_lock);
2127 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2128 spin_unlock(&mdsc->cap_dirty_lock);
2129
2120 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { 2130 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2121 int delayed = __kick_flushing_caps(mdsc, session, ci); 2131 spin_lock(&ci->i_ceph_lock);
2122 if (delayed) { 2132 __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
2123 spin_lock(&ci->i_ceph_lock); 2133 spin_unlock(&ci->i_ceph_lock);
2124 __cap_delay_requeue(mdsc, ci);
2125 spin_unlock(&ci->i_ceph_lock);
2126 }
2127 } 2134 }
2128} 2135}
2129 2136
2130static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, 2137static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
2131 struct ceph_mds_session *session, 2138 struct ceph_mds_session *session,
2132 struct inode *inode) 2139 struct inode *inode)
2140 __releases(ci->i_ceph_lock)
2133{ 2141{
2134 struct ceph_inode_info *ci = ceph_inode(inode); 2142 struct ceph_inode_info *ci = ceph_inode(inode);
2135 struct ceph_cap *cap; 2143 struct ceph_cap *cap;
2136 2144
2137 spin_lock(&ci->i_ceph_lock);
2138 cap = ci->i_auth_cap; 2145 cap = ci->i_auth_cap;
2139 dout("kick_flushing_inode_caps %p flushing %s\n", inode, 2146 dout("kick_flushing_inode_caps %p flushing %s\n", inode,
2140 ceph_cap_string(ci->i_flushing_caps)); 2147 ceph_cap_string(ci->i_flushing_caps));
2141 2148
2142 __ceph_flush_snaps(ci, &session, 1); 2149 if (!list_empty(&ci->i_cap_flush_list)) {
2143 2150 u64 oldest_flush_tid;
2144 if (ci->i_flushing_caps) {
2145 int delayed;
2146
2147 spin_lock(&mdsc->cap_dirty_lock); 2151 spin_lock(&mdsc->cap_dirty_lock);
2148 list_move_tail(&ci->i_flushing_item, 2152 list_move_tail(&ci->i_flushing_item,
2149 &cap->session->s_cap_flushing); 2153 &cap->session->s_cap_flushing);
2154 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2150 spin_unlock(&mdsc->cap_dirty_lock); 2155 spin_unlock(&mdsc->cap_dirty_lock);
2151 2156
2157 __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
2152 spin_unlock(&ci->i_ceph_lock); 2158 spin_unlock(&ci->i_ceph_lock);
2153
2154 delayed = __kick_flushing_caps(mdsc, session, ci);
2155 if (delayed) {
2156 spin_lock(&ci->i_ceph_lock);
2157 __cap_delay_requeue(mdsc, ci);
2158 spin_unlock(&ci->i_ceph_lock);
2159 }
2160 } else { 2159 } else {
2161 spin_unlock(&ci->i_ceph_lock); 2160 spin_unlock(&ci->i_ceph_lock);
2162 } 2161 }
@@ -2487,12 +2486,11 @@ static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
2487{ 2486{
2488 if (!capsnap->need_flush && 2487 if (!capsnap->need_flush &&
2489 !capsnap->writing && !capsnap->dirty_pages) { 2488 !capsnap->writing && !capsnap->dirty_pages) {
2490
2491 dout("dropping cap_snap %p follows %llu\n", 2489 dout("dropping cap_snap %p follows %llu\n",
2492 capsnap, capsnap->follows); 2490 capsnap, capsnap->follows);
2491 BUG_ON(capsnap->cap_flush.tid > 0);
2493 ceph_put_snap_context(capsnap->context); 2492 ceph_put_snap_context(capsnap->context);
2494 list_del(&capsnap->ci_item); 2493 list_del(&capsnap->ci_item);
2495 list_del(&capsnap->flushing_item);
2496 ceph_put_cap_snap(capsnap); 2494 ceph_put_cap_snap(capsnap);
2497 return 1; 2495 return 1;
2498 } 2496 }
@@ -2891,13 +2889,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2891 fill_inline = true; 2889 fill_inline = true;
2892 } 2890 }
2893 2891
2894 spin_unlock(&ci->i_ceph_lock);
2895
2896 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 2892 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
2897 kick_flushing_inode_caps(mdsc, session, inode);
2898 up_read(&mdsc->snap_rwsem);
2899 if (newcaps & ~issued) 2893 if (newcaps & ~issued)
2900 wake = true; 2894 wake = true;
2895 kick_flushing_inode_caps(mdsc, session, inode);
2896 up_read(&mdsc->snap_rwsem);
2897 } else {
2898 spin_unlock(&ci->i_ceph_lock);
2901 } 2899 }
2902 2900
2903 if (fill_inline) 2901 if (fill_inline)
@@ -2951,6 +2949,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2951 list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { 2949 list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
2952 if (cf->tid == flush_tid) 2950 if (cf->tid == flush_tid)
2953 cleaned = cf->caps; 2951 cleaned = cf->caps;
2952 if (cf->caps == 0) /* capsnap */
2953 continue;
2954 if (cf->tid <= flush_tid) { 2954 if (cf->tid <= flush_tid) {
2955 list_del(&cf->i_list); 2955 list_del(&cf->i_list);
2956 list_add_tail(&cf->i_list, &to_remove); 2956 list_add_tail(&cf->i_list, &to_remove);
@@ -2985,13 +2985,16 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2985 } 2985 }
2986 2986
2987 if (ci->i_flushing_caps == 0) { 2987 if (ci->i_flushing_caps == 0) {
2988 list_del_init(&ci->i_flushing_item); 2988 if (list_empty(&ci->i_cap_flush_list)) {
2989 if (!list_empty(&session->s_cap_flushing)) 2989 list_del_init(&ci->i_flushing_item);
2990 dout(" mds%d still flushing cap on %p\n", 2990 if (!list_empty(&session->s_cap_flushing)) {
2991 session->s_mds, 2991 dout(" mds%d still flushing cap on %p\n",
2992 &list_entry(session->s_cap_flushing.next, 2992 session->s_mds,
2993 struct ceph_inode_info, 2993 &list_first_entry(&session->s_cap_flushing,
2994 i_flushing_item)->vfs_inode); 2994 struct ceph_inode_info,
2995 i_flushing_item)->vfs_inode);
2996 }
2997 }
2995 mdsc->num_cap_flushing--; 2998 mdsc->num_cap_flushing--;
2996 dout(" inode %p now !flushing\n", inode); 2999 dout(" inode %p now !flushing\n", inode);
2997 3000
@@ -3039,7 +3042,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
3039 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 3042 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
3040 u64 follows = le64_to_cpu(m->snap_follows); 3043 u64 follows = le64_to_cpu(m->snap_follows);
3041 struct ceph_cap_snap *capsnap; 3044 struct ceph_cap_snap *capsnap;
3042 int drop = 0; 3045 int flushed = 0;
3043 3046
3044 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", 3047 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
3045 inode, ci, session->s_mds, follows); 3048 inode, ci, session->s_mds, follows);
@@ -3047,30 +3050,47 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
3047 spin_lock(&ci->i_ceph_lock); 3050 spin_lock(&ci->i_ceph_lock);
3048 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 3051 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
3049 if (capsnap->follows == follows) { 3052 if (capsnap->follows == follows) {
3050 if (capsnap->flush_tid != flush_tid) { 3053 if (capsnap->cap_flush.tid != flush_tid) {
3051 dout(" cap_snap %p follows %lld tid %lld !=" 3054 dout(" cap_snap %p follows %lld tid %lld !="
3052 " %lld\n", capsnap, follows, 3055 " %lld\n", capsnap, follows,
3053 flush_tid, capsnap->flush_tid); 3056 flush_tid, capsnap->cap_flush.tid);
3054 break; 3057 break;
3055 } 3058 }
3056 WARN_ON(capsnap->dirty_pages || capsnap->writing); 3059 flushed = 1;
3057 dout(" removing %p cap_snap %p follows %lld\n",
3058 inode, capsnap, follows);
3059 ceph_put_snap_context(capsnap->context);
3060 list_del(&capsnap->ci_item);
3061 list_del(&capsnap->flushing_item);
3062 ceph_put_cap_snap(capsnap);
3063 wake_up_all(&mdsc->cap_flushing_wq);
3064 drop = 1;
3065 break; 3060 break;
3066 } else { 3061 } else {
3067 dout(" skipping cap_snap %p follows %lld\n", 3062 dout(" skipping cap_snap %p follows %lld\n",
3068 capsnap, capsnap->follows); 3063 capsnap, capsnap->follows);
3069 } 3064 }
3070 } 3065 }
3066 if (flushed) {
3067 u64 oldest_flush_tid;
3068 WARN_ON(capsnap->dirty_pages || capsnap->writing);
3069 dout(" removing %p cap_snap %p follows %lld\n",
3070 inode, capsnap, follows);
3071 list_del(&capsnap->ci_item);
3072 list_del(&capsnap->cap_flush.i_list);
3073
3074 spin_lock(&mdsc->cap_dirty_lock);
3075
3076 if (list_empty(&ci->i_cap_flush_list))
3077 list_del_init(&ci->i_flushing_item);
3078
3079 list_del(&capsnap->cap_flush.g_list);
3080
3081 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
3082 if (oldest_flush_tid == 0 || oldest_flush_tid > flush_tid)
3083 wake_up_all(&mdsc->cap_flushing_wq);
3084
3085 spin_unlock(&mdsc->cap_dirty_lock);
3086 wake_up_all(&ci->i_cap_wq);
3087 }
3071 spin_unlock(&ci->i_ceph_lock); 3088 spin_unlock(&ci->i_ceph_lock);
3072 if (drop) 3089 if (flushed) {
3090 ceph_put_snap_context(capsnap->context);
3091 ceph_put_cap_snap(capsnap);
3073 iput(inode); 3092 iput(inode);
3093 }
3074} 3094}
3075 3095
3076/* 3096/*
@@ -3175,7 +3195,8 @@ retry:
3175 tcap->implemented |= issued; 3195 tcap->implemented |= issued;
3176 if (cap == ci->i_auth_cap) 3196 if (cap == ci->i_auth_cap)
3177 ci->i_auth_cap = tcap; 3197 ci->i_auth_cap = tcap;
3178 if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { 3198 if (!list_empty(&ci->i_cap_flush_list) &&
3199 ci->i_auth_cap == tcap) {
3179 spin_lock(&mdsc->cap_dirty_lock); 3200 spin_lock(&mdsc->cap_dirty_lock);
3180 list_move_tail(&ci->i_flushing_item, 3201 list_move_tail(&ci->i_flushing_item,
3181 &tcap->session->s_cap_flushing); 3202 &tcap->session->s_cap_flushing);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7cd6b861c2f3..fa9036af5445 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -472,7 +472,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
472 s->s_cap_iterator = NULL; 472 s->s_cap_iterator = NULL;
473 INIT_LIST_HEAD(&s->s_cap_releases); 473 INIT_LIST_HEAD(&s->s_cap_releases);
474 INIT_LIST_HEAD(&s->s_cap_flushing); 474 INIT_LIST_HEAD(&s->s_cap_flushing);
475 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
476 475
477 dout("register_session mds%d\n", mds); 476 dout("register_session mds%d\n", mds);
478 if (mds >= mdsc->max_sessions) { 477 if (mds >= mdsc->max_sessions) {
@@ -1479,21 +1478,6 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1479 return 0; 1478 return 0;
1480} 1479}
1481 1480
1482static int check_capsnap_flush(struct ceph_inode_info *ci,
1483 u64 want_snap_seq)
1484{
1485 int ret = 1;
1486 spin_lock(&ci->i_ceph_lock);
1487 if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
1488 struct ceph_cap_snap *capsnap =
1489 list_first_entry(&ci->i_cap_snaps,
1490 struct ceph_cap_snap, ci_item);
1491 ret = capsnap->follows >= want_snap_seq;
1492 }
1493 spin_unlock(&ci->i_ceph_lock);
1494 return ret;
1495}
1496
1497static int check_caps_flush(struct ceph_mds_client *mdsc, 1481static int check_caps_flush(struct ceph_mds_client *mdsc,
1498 u64 want_flush_tid) 1482 u64 want_flush_tid)
1499{ 1483{
@@ -1520,54 +1504,9 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
1520 * returns true if we've flushed through want_flush_tid 1504 * returns true if we've flushed through want_flush_tid
1521 */ 1505 */
1522static void wait_caps_flush(struct ceph_mds_client *mdsc, 1506static void wait_caps_flush(struct ceph_mds_client *mdsc,
1523 u64 want_flush_tid, u64 want_snap_seq) 1507 u64 want_flush_tid)
1524{ 1508{
1525 int mds; 1509 dout("check_caps_flush want %llu\n", want_flush_tid);
1526
1527 dout("check_caps_flush want %llu snap want %llu\n",
1528 want_flush_tid, want_snap_seq);
1529 mutex_lock(&mdsc->mutex);
1530 for (mds = 0; mds < mdsc->max_sessions; ) {
1531 struct ceph_mds_session *session = mdsc->sessions[mds];
1532 struct inode *inode = NULL;
1533
1534 if (!session) {
1535 mds++;
1536 continue;
1537 }
1538 get_session(session);
1539 mutex_unlock(&mdsc->mutex);
1540
1541 mutex_lock(&session->s_mutex);
1542 if (!list_empty(&session->s_cap_snaps_flushing)) {
1543 struct ceph_cap_snap *capsnap =
1544 list_first_entry(&session->s_cap_snaps_flushing,
1545 struct ceph_cap_snap,
1546 flushing_item);
1547 struct ceph_inode_info *ci = capsnap->ci;
1548 if (!check_capsnap_flush(ci, want_snap_seq)) {
1549 dout("check_cap_flush still flushing snap %p "
1550 "follows %lld <= %lld to mds%d\n",
1551 &ci->vfs_inode, capsnap->follows,
1552 want_snap_seq, mds);
1553 inode = igrab(&ci->vfs_inode);
1554 }
1555 }
1556 mutex_unlock(&session->s_mutex);
1557 ceph_put_mds_session(session);
1558
1559 if (inode) {
1560 wait_event(mdsc->cap_flushing_wq,
1561 check_capsnap_flush(ceph_inode(inode),
1562 want_snap_seq));
1563 iput(inode);
1564 } else {
1565 mds++;
1566 }
1567
1568 mutex_lock(&mdsc->mutex);
1569 }
1570 mutex_unlock(&mdsc->mutex);
1571 1510
1572 wait_event(mdsc->cap_flushing_wq, 1511 wait_event(mdsc->cap_flushing_wq,
1573 check_caps_flush(mdsc, want_flush_tid)); 1512 check_caps_flush(mdsc, want_flush_tid));
@@ -3584,7 +3523,7 @@ restart:
3584 3523
3585void ceph_mdsc_sync(struct ceph_mds_client *mdsc) 3524void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3586{ 3525{
3587 u64 want_tid, want_flush, want_snap; 3526 u64 want_tid, want_flush;
3588 3527
3589 if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) 3528 if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
3590 return; 3529 return;
@@ -3599,15 +3538,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3599 want_flush = mdsc->last_cap_flush_tid; 3538 want_flush = mdsc->last_cap_flush_tid;
3600 spin_unlock(&mdsc->cap_dirty_lock); 3539 spin_unlock(&mdsc->cap_dirty_lock);
3601 3540
3602 down_read(&mdsc->snap_rwsem); 3541 dout("sync want tid %lld flush_seq %lld\n",
3603 want_snap = mdsc->last_snap_seq; 3542 want_tid, want_flush);
3604 up_read(&mdsc->snap_rwsem);
3605
3606 dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
3607 want_tid, want_flush, want_snap);
3608 3543
3609 wait_unsafe_requests(mdsc, want_tid); 3544 wait_unsafe_requests(mdsc, want_tid);
3610 wait_caps_flush(mdsc, want_flush, want_snap); 3545 wait_caps_flush(mdsc, want_flush);
3611} 3546}
3612 3547
3613/* 3548/*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 93170b4b5d75..6b3679737d4a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -152,7 +152,6 @@ struct ceph_mds_session {
152 152
153 /* protected by mutex */ 153 /* protected by mutex */
154 struct list_head s_cap_flushing; /* inodes w/ flushing caps */ 154 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
155 struct list_head s_cap_snaps_flushing;
156 unsigned long s_renew_requested; /* last time we sent a renew req */ 155 unsigned long s_renew_requested; /* last time we sent a renew req */
157 u64 s_renew_seq; 156 u64 s_renew_seq;
158 157
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index eadf2c33edc6..20d5b0cdf655 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -520,9 +520,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
520 ihold(inode); 520 ihold(inode);
521 521
522 atomic_set(&capsnap->nref, 1); 522 atomic_set(&capsnap->nref, 1);
523 capsnap->ci = ci;
524 INIT_LIST_HEAD(&capsnap->ci_item); 523 INIT_LIST_HEAD(&capsnap->ci_item);
525 INIT_LIST_HEAD(&capsnap->flushing_item);
526 524
527 capsnap->follows = old_snapc->seq; 525 capsnap->follows = old_snapc->seq;
528 capsnap->issued = __ceph_caps_issued(ci, NULL); 526 capsnap->issued = __ceph_caps_issued(ci, NULL);
@@ -800,7 +798,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
800 ihold(inode); 798 ihold(inode);
801 spin_unlock(&mdsc->snap_flush_lock); 799 spin_unlock(&mdsc->snap_flush_lock);
802 spin_lock(&ci->i_ceph_lock); 800 spin_lock(&ci->i_ceph_lock);
803 __ceph_flush_snaps(ci, &session, 0); 801 __ceph_flush_snaps(ci, &session);
804 spin_unlock(&ci->i_ceph_lock); 802 spin_unlock(&ci->i_ceph_lock);
805 iput(inode); 803 iput(inode);
806 spin_lock(&mdsc->snap_flush_lock); 804 spin_lock(&mdsc->snap_flush_lock);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 29e8b7bd9413..08ed51299f9f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -147,6 +147,13 @@ struct ceph_cap {
147#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ 147#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
148#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ 148#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
149 149
150struct ceph_cap_flush {
151 u64 tid;
152 int caps; /* 0 means capsnap */
153 struct list_head g_list; // global
154 struct list_head i_list; // per inode
155};
156
150/* 157/*
151 * Snapped cap state that is pending flush to mds. When a snapshot occurs, 158 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
152 * we first complete any in-process sync writes and writeback any dirty 159 * we first complete any in-process sync writes and writeback any dirty
@@ -154,10 +161,11 @@ struct ceph_cap {
154 */ 161 */
155struct ceph_cap_snap { 162struct ceph_cap_snap {
156 atomic_t nref; 163 atomic_t nref;
157 struct ceph_inode_info *ci; 164 struct list_head ci_item;
158 struct list_head ci_item, flushing_item;
159 165
160 u64 follows, flush_tid; 166 struct ceph_cap_flush cap_flush;
167
168 u64 follows;
161 int issued, dirty; 169 int issued, dirty;
162 struct ceph_snap_context *context; 170 struct ceph_snap_context *context;
163 171
@@ -186,13 +194,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
186 } 194 }
187} 195}
188 196
189struct ceph_cap_flush {
190 u64 tid;
191 int caps;
192 struct list_head g_list; // global
193 struct list_head i_list; // per inode
194};
195
196/* 197/*
197 * The frag tree describes how a directory is fragmented, potentially across 198 * The frag tree describes how a directory is fragmented, potentially across
198 * multiple metadata servers. It is also used to indicate points where 199 * multiple metadata servers. It is also used to indicate points where
@@ -888,8 +889,7 @@ extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
888extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 889extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
889 struct ceph_snap_context *snapc); 890 struct ceph_snap_context *snapc);
890extern void __ceph_flush_snaps(struct ceph_inode_info *ci, 891extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
891 struct ceph_mds_session **psession, 892 struct ceph_mds_session **psession);
892 int again);
893extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, 893extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
894 struct ceph_mds_session *session); 894 struct ceph_mds_session *session);
895extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); 895extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);