about | summary | refs | log | tree | commit | diff | stats
path: root/fs/ceph
diff options
context:
space:
mode:
author: Yan, Zheng <zyan@redhat.com> 2015-06-09 23:09:32 -0400
committer: Ilya Dryomov <idryomov@gmail.com> 2015-06-25 04:49:31 -0400
commit: a2971c8ccb9bd7677a6c43cdbed9aacfef5e9f26 (patch)
tree: 6566e4d5bb1903328c9b7d453e660c7c2beeb72e /fs/ceph
parent: 8310b08913eca8aee98744c9aff1ec0d1f603b19 (diff)
ceph: send TID of the oldest pending caps flush to MDS
Using this information, the MDS can trim its list of completed cap flushes (which is used to detect duplicate cap flushes).

Signed-off-by: Yan, Zheng <zyan@redhat.com>
Diffstat (limited to 'fs/ceph')
-rw-r--r-- fs/ceph/caps.c 67
1 files changed, 49 insertions, 18 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0295048724d2..420272788e01 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -986,8 +986,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
986static int send_cap_msg(struct ceph_mds_session *session, 986static int send_cap_msg(struct ceph_mds_session *session,
987 u64 ino, u64 cid, int op, 987 u64 ino, u64 cid, int op,
988 int caps, int wanted, int dirty, 988 int caps, int wanted, int dirty,
989 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, 989 u32 seq, u64 flush_tid, u64 oldest_flush_tid,
990 u64 size, u64 max_size, 990 u32 issue_seq, u32 mseq, u64 size, u64 max_size,
991 struct timespec *mtime, struct timespec *atime, 991 struct timespec *mtime, struct timespec *atime,
992 u64 time_warp_seq, 992 u64 time_warp_seq,
993 kuid_t uid, kgid_t gid, umode_t mode, 993 kuid_t uid, kgid_t gid, umode_t mode,
@@ -1001,20 +1001,23 @@ static int send_cap_msg(struct ceph_mds_session *session,
1001 size_t extra_len; 1001 size_t extra_len;
1002 1002
1003 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" 1003 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
1004 " seq %u/%u mseq %u follows %lld size %llu/%llu" 1004 " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
1005 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), 1005 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
1006 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), 1006 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
1007 ceph_cap_string(dirty), 1007 ceph_cap_string(dirty),
1008 seq, issue_seq, mseq, follows, size, max_size, 1008 seq, issue_seq, flush_tid, oldest_flush_tid,
1009 mseq, follows, size, max_size,
1009 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 1010 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
1010 1011
1011 /* flock buffer size + inline version + inline data size */ 1012 /* flock buffer size + inline version + inline data size +
1012 extra_len = 4 + 8 + 4; 1013 * osd_epoch_barrier + oldest_flush_tid */
1014 extra_len = 4 + 8 + 4 + 4 + 8;
1013 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, 1015 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
1014 GFP_NOFS, false); 1016 GFP_NOFS, false);
1015 if (!msg) 1017 if (!msg)
1016 return -ENOMEM; 1018 return -ENOMEM;
1017 1019
1020 msg->hdr.version = cpu_to_le16(6);
1018 msg->hdr.tid = cpu_to_le64(flush_tid); 1021 msg->hdr.tid = cpu_to_le64(flush_tid);
1019 1022
1020 fc = msg->front.iov_base; 1023 fc = msg->front.iov_base;
@@ -1050,6 +1053,10 @@ static int send_cap_msg(struct ceph_mds_session *session,
1050 ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); 1053 ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE);
1051 /* inline data size */ 1054 /* inline data size */
1052 ceph_encode_32(&p, 0); 1055 ceph_encode_32(&p, 0);
1056 /* osd_epoch_barrier */
1057 ceph_encode_32(&p, 0);
1058 /* oldest_flush_tid */
1059 ceph_encode_64(&p, oldest_flush_tid);
1053 1060
1054 fc->xattr_version = cpu_to_le64(xattr_version); 1061 fc->xattr_version = cpu_to_le64(xattr_version);
1055 if (xattrs_buf) { 1062 if (xattrs_buf) {
@@ -1098,7 +1105,7 @@ void ceph_queue_caps_release(struct inode *inode)
1098 */ 1105 */
1099static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, 1106static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1100 int op, int used, int want, int retain, int flushing, 1107 int op, int used, int want, int retain, int flushing,
1101 u64 flush_tid) 1108 u64 flush_tid, u64 oldest_flush_tid)
1102 __releases(cap->ci->i_ceph_lock) 1109 __releases(cap->ci->i_ceph_lock)
1103{ 1110{
1104 struct ceph_inode_info *ci = cap->ci; 1111 struct ceph_inode_info *ci = cap->ci;
@@ -1187,7 +1194,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1187 spin_unlock(&ci->i_ceph_lock); 1194 spin_unlock(&ci->i_ceph_lock);
1188 1195
1189 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1196 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1190 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, 1197 op, keep, want, flushing, seq,
1198 flush_tid, oldest_flush_tid, issue_seq, mseq,
1191 size, max_size, &mtime, &atime, time_warp_seq, 1199 size, max_size, &mtime, &atime, time_warp_seq,
1192 uid, gid, mode, xattr_version, xattr_blob, 1200 uid, gid, mode, xattr_version, xattr_blob,
1193 follows, inline_data); 1201 follows, inline_data);
@@ -1307,8 +1315,8 @@ retry:
1307 inode, capsnap, capsnap->follows, capsnap->flush_tid); 1315 inode, capsnap, capsnap->follows, capsnap->flush_tid);
1308 send_cap_msg(session, ceph_vino(inode).ino, 0, 1316 send_cap_msg(session, ceph_vino(inode).ino, 0,
1309 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, 1317 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1310 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, 1318 capsnap->dirty, 0, capsnap->flush_tid, 0,
1311 capsnap->size, 0, 1319 0, mseq, capsnap->size, 0,
1312 &capsnap->mtime, &capsnap->atime, 1320 &capsnap->mtime, &capsnap->atime,
1313 capsnap->time_warp_seq, 1321 capsnap->time_warp_seq,
1314 capsnap->uid, capsnap->gid, capsnap->mode, 1322 capsnap->uid, capsnap->gid, capsnap->mode,
@@ -1438,6 +1446,17 @@ static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
1438 rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); 1446 rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
1439} 1447}
1440 1448
1449static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
1450{
1451 struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
1452 if (n) {
1453 struct ceph_cap_flush *cf =
1454 rb_entry(n, struct ceph_cap_flush, g_node);
1455 return cf->tid;
1456 }
1457 return 0;
1458}
1459
1441/* 1460/*
1442 * Add dirty inode to the flushing list. Assigned a seq number so we 1461 * Add dirty inode to the flushing list. Assigned a seq number so we
1443 * can wait for caps to flush without starving. 1462 * can wait for caps to flush without starving.
@@ -1446,7 +1465,7 @@ static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
1446 */ 1465 */
1447static int __mark_caps_flushing(struct inode *inode, 1466static int __mark_caps_flushing(struct inode *inode,
1448 struct ceph_mds_session *session, 1467 struct ceph_mds_session *session,
1449 u64 *flush_tid) 1468 u64 *flush_tid, u64 *oldest_flush_tid)
1450{ 1469{
1451 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1470 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1452 struct ceph_inode_info *ci = ceph_inode(inode); 1471 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1473,6 +1492,7 @@ static int __mark_caps_flushing(struct inode *inode,
1473 1492
1474 cf->tid = ++mdsc->last_cap_flush_tid; 1493 cf->tid = ++mdsc->last_cap_flush_tid;
1475 __add_cap_flushing_to_mdsc(mdsc, cf); 1494 __add_cap_flushing_to_mdsc(mdsc, cf);
1495 *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1476 1496
1477 if (list_empty(&ci->i_flushing_item)) { 1497 if (list_empty(&ci->i_flushing_item)) {
1478 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1498 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
@@ -1533,7 +1553,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1533 struct ceph_mds_client *mdsc = fsc->mdsc; 1553 struct ceph_mds_client *mdsc = fsc->mdsc;
1534 struct inode *inode = &ci->vfs_inode; 1554 struct inode *inode = &ci->vfs_inode;
1535 struct ceph_cap *cap; 1555 struct ceph_cap *cap;
1536 u64 flush_tid; 1556 u64 flush_tid, oldest_flush_tid;
1537 int file_wanted, used, cap_used; 1557 int file_wanted, used, cap_used;
1538 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ 1558 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1539 int issued, implemented, want, retain, revoking, flushing = 0; 1559 int issued, implemented, want, retain, revoking, flushing = 0;
@@ -1754,10 +1774,14 @@ ack:
1754 1774
1755 if (cap == ci->i_auth_cap && ci->i_dirty_caps) { 1775 if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
1756 flushing = __mark_caps_flushing(inode, session, 1776 flushing = __mark_caps_flushing(inode, session,
1757 &flush_tid); 1777 &flush_tid,
1778 &oldest_flush_tid);
1758 } else { 1779 } else {
1759 flushing = 0; 1780 flushing = 0;
1760 flush_tid = 0; 1781 flush_tid = 0;
1782 spin_lock(&mdsc->cap_dirty_lock);
1783 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1784 spin_unlock(&mdsc->cap_dirty_lock);
1761 } 1785 }
1762 1786
1763 mds = cap->mds; /* remember mds, so we don't repeat */ 1787 mds = cap->mds; /* remember mds, so we don't repeat */
@@ -1765,7 +1789,8 @@ ack:
1765 1789
1766 /* __send_cap drops i_ceph_lock */ 1790 /* __send_cap drops i_ceph_lock */
1767 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, 1791 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
1768 want, retain, flushing, flush_tid); 1792 want, retain, flushing,
1793 flush_tid, oldest_flush_tid);
1769 goto retry; /* retake i_ceph_lock and restart our cap scan. */ 1794 goto retry; /* retake i_ceph_lock and restart our cap scan. */
1770 } 1795 }
1771 1796
@@ -1800,7 +1825,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
1800 struct ceph_inode_info *ci = ceph_inode(inode); 1825 struct ceph_inode_info *ci = ceph_inode(inode);
1801 struct ceph_mds_session *session = NULL; 1826 struct ceph_mds_session *session = NULL;
1802 int flushing = 0; 1827 int flushing = 0;
1803 u64 flush_tid = 0; 1828 u64 flush_tid = 0, oldest_flush_tid = 0;
1804 1829
1805retry: 1830retry:
1806 spin_lock(&ci->i_ceph_lock); 1831 spin_lock(&ci->i_ceph_lock);
@@ -1825,12 +1850,13 @@ retry:
1825 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) 1850 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1826 goto out; 1851 goto out;
1827 1852
1828 flushing = __mark_caps_flushing(inode, session, &flush_tid); 1853 flushing = __mark_caps_flushing(inode, session, &flush_tid,
1854 &oldest_flush_tid);
1829 1855
1830 /* __send_cap drops i_ceph_lock */ 1856 /* __send_cap drops i_ceph_lock */
1831 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, 1857 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1832 (cap->issued | cap->implemented), 1858 (cap->issued | cap->implemented),
1833 flushing, flush_tid); 1859 flushing, flush_tid, oldest_flush_tid);
1834 1860
1835 if (delayed) { 1861 if (delayed) {
1836 spin_lock(&ci->i_ceph_lock); 1862 spin_lock(&ci->i_ceph_lock);
@@ -2083,6 +2109,11 @@ static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
2083 struct rb_node *n; 2109 struct rb_node *n;
2084 int delayed = 0; 2110 int delayed = 0;
2085 u64 first_tid = 0; 2111 u64 first_tid = 0;
2112 u64 oldest_flush_tid;
2113
2114 spin_lock(&mdsc->cap_dirty_lock);
2115 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2116 spin_unlock(&mdsc->cap_dirty_lock);
2086 2117
2087 while (true) { 2118 while (true) {
2088 spin_lock(&ci->i_ceph_lock); 2119 spin_lock(&ci->i_ceph_lock);
@@ -2113,7 +2144,7 @@ static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
2113 __ceph_caps_used(ci), 2144 __ceph_caps_used(ci),
2114 __ceph_caps_wanted(ci), 2145 __ceph_caps_wanted(ci),
2115 cap->issued | cap->implemented, 2146 cap->issued | cap->implemented,
2116 cf->caps, cf->tid); 2147 cf->caps, cf->tid, oldest_flush_tid);
2117 } 2148 }
2118 return delayed; 2149 return delayed;
2119} 2150}