aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
authorYan, Zheng <zyan@redhat.com>2015-06-09 05:20:12 -0400
committerIlya Dryomov <idryomov@gmail.com>2015-06-25 04:49:31 -0400
commit8310b08913eca8aee98744c9aff1ec0d1f603b19 (patch)
treeac58f8ea0c44c98a675242c1a2a338f6333f03ba /fs/ceph
parent553adfd941f8ca622965ef809553d918ea039929 (diff)
ceph: track pending caps flushing globally
Track pending cap flushes globally so that we know the TID of the oldest pending cap flush. A later patch will send this information to the MDS, so that the MDS can trim its list of completed cap flushes. Tracking pending cap flushes globally also simplifies the syncfs code. Signed-off-by: Yan, Zheng <zyan@redhat.com>
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/caps.c50
-rw-r--r--fs/ceph/inode.c1
-rw-r--r--fs/ceph/mds_client.c93
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/super.h2
5 files changed, 91 insertions, 57 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9a25f8d66fbc..0295048724d2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1415,6 +1415,29 @@ static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
1415 rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); 1415 rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
1416} 1416}
1417 1417
/*
 * Insert @cf into the client-wide tree of pending cap flushes
 * (mdsc->cap_flush_tree), linked through cf->g_node and ordered by
 * cf->tid, so that the leftmost node is the flush with the smallest tid.
 *
 * Two entries with the same tid are treated as a fatal inconsistency
 * (BUG()).  At the call site visible in this patch the tid is taken from
 * ++mdsc->last_cap_flush_tid immediately before the insert.
 *
 * This function takes no lock itself.
 * NOTE(review): the visible caller drops mdsc->cap_dirty_lock after the
 * insert, and the matching rb_erase() sites run under that lock, so the
 * caller presumably must hold mdsc->cap_dirty_lock -- confirm.
 */
1418static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
1419 struct ceph_cap_flush *cf)
1420{
1421 struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
1422 struct rb_node *parent = NULL;
1423 struct ceph_cap_flush *other = NULL;
1424
/* Descend to the empty link where a node keyed by cf->tid belongs. */
1425 while (*p) {
1426 parent = *p;
1427 other = rb_entry(parent, struct ceph_cap_flush, g_node);
1428
1429 if (cf->tid < other->tid)
1430 p = &(*p)->rb_left;
1431 else if (cf->tid > other->tid)
1432 p = &(*p)->rb_right;
1433 else
1434 BUG();
1435 }
1436
1437 rb_link_node(&cf->g_node, parent, p);
1438 rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
1439}
1440
1418/* 1441/*
1419 * Add dirty inode to the flushing list. Assigned a seq number so we 1442 * Add dirty inode to the flushing list. Assigned a seq number so we
1420 * can wait for caps to flush without starving. 1443 * can wait for caps to flush without starving.
@@ -1449,17 +1472,16 @@ static int __mark_caps_flushing(struct inode *inode,
1449 list_del_init(&ci->i_dirty_item); 1472 list_del_init(&ci->i_dirty_item);
1450 1473
1451 cf->tid = ++mdsc->last_cap_flush_tid; 1474 cf->tid = ++mdsc->last_cap_flush_tid;
1475 __add_cap_flushing_to_mdsc(mdsc, cf);
1452 1476
1453 if (list_empty(&ci->i_flushing_item)) { 1477 if (list_empty(&ci->i_flushing_item)) {
1454 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1455 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1478 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1456 mdsc->num_cap_flushing++; 1479 mdsc->num_cap_flushing++;
1457 dout(" inode %p now flushing seq %lld\n", inode, 1480 dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
1458 ci->i_cap_flush_seq);
1459 } else { 1481 } else {
1460 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1482 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1461 dout(" inode %p now flushing (more) seq %lld\n", inode, 1483 dout(" inode %p now flushing (more) tid %llu\n",
1462 ci->i_cap_flush_seq); 1484 inode, cf->tid);
1463 } 1485 }
1464 spin_unlock(&mdsc->cap_dirty_lock); 1486 spin_unlock(&mdsc->cap_dirty_lock);
1465 1487
@@ -2123,8 +2145,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
2123 2145
2124 spin_lock(&ci->i_ceph_lock); 2146 spin_lock(&ci->i_ceph_lock);
2125 cap = ci->i_auth_cap; 2147 cap = ci->i_auth_cap;
2126 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, 2148 dout("kick_flushing_inode_caps %p flushing %s\n", inode,
2127 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); 2149 ceph_cap_string(ci->i_flushing_caps));
2128 2150
2129 __ceph_flush_snaps(ci, &session, 1); 2151 __ceph_flush_snaps(ci, &session, 1);
2130 2152
@@ -2921,12 +2943,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2921 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), 2943 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2922 ceph_cap_string(ci->i_flushing_caps & ~cleaned)); 2944 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2923 2945
2924 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) 2946 if (list_empty(&to_remove) && !cleaned)
2925 goto out; 2947 goto out;
2926 2948
2927 ci->i_flushing_caps &= ~cleaned; 2949 ci->i_flushing_caps &= ~cleaned;
2928 2950
2929 spin_lock(&mdsc->cap_dirty_lock); 2951 spin_lock(&mdsc->cap_dirty_lock);
2952
2953 if (!list_empty(&to_remove)) {
2954 list_for_each_entry(cf, &to_remove, list)
2955 rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
2956
2957 n = rb_first(&mdsc->cap_flush_tree);
2958 cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
2959 if (!cf || cf->tid > flush_tid)
2960 wake_up_all(&mdsc->cap_flushing_wq);
2961 }
2962
2930 if (ci->i_flushing_caps == 0) { 2963 if (ci->i_flushing_caps == 0) {
2931 list_del_init(&ci->i_flushing_item); 2964 list_del_init(&ci->i_flushing_item);
2932 if (!list_empty(&session->s_cap_flushing)) 2965 if (!list_empty(&session->s_cap_flushing))
@@ -2936,7 +2969,6 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2936 struct ceph_inode_info, 2969 struct ceph_inode_info,
2937 i_flushing_item)->vfs_inode); 2970 i_flushing_item)->vfs_inode);
2938 mdsc->num_cap_flushing--; 2971 mdsc->num_cap_flushing--;
2939 wake_up_all(&mdsc->cap_flushing_wq);
2940 dout(" inode %p now !flushing\n", inode); 2972 dout(" inode %p now !flushing\n", inode);
2941 2973
2942 if (ci->i_dirty_caps == 0) { 2974 if (ci->i_dirty_caps == 0) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 6d3f19db8c8a..3326302f5884 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -416,7 +416,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
416 ci->i_flushing_caps = 0; 416 ci->i_flushing_caps = 0;
417 INIT_LIST_HEAD(&ci->i_dirty_item); 417 INIT_LIST_HEAD(&ci->i_dirty_item);
418 INIT_LIST_HEAD(&ci->i_flushing_item); 418 INIT_LIST_HEAD(&ci->i_flushing_item);
419 ci->i_cap_flush_seq = 0;
420 ci->i_cap_flush_tree = RB_ROOT; 419 ci->i_cap_flush_tree = RB_ROOT;
421 init_waitqueue_head(&ci->i_cap_wq); 420 init_waitqueue_head(&ci->i_cap_wq);
422 ci->i_hold_caps_min = 0; 421 ci->i_hold_caps_min = 0;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 839901f51512..31f6a78caa0a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1164,6 +1164,10 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1164 } 1164 }
1165 1165
1166 spin_lock(&mdsc->cap_dirty_lock); 1166 spin_lock(&mdsc->cap_dirty_lock);
1167
1168 list_for_each_entry(cf, &to_remove, list)
1169 rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
1170
1167 if (!list_empty(&ci->i_dirty_item)) { 1171 if (!list_empty(&ci->i_dirty_item)) {
1168 pr_warn_ratelimited( 1172 pr_warn_ratelimited(
1169 " dropping dirty %s state for %p %lld\n", 1173 " dropping dirty %s state for %p %lld\n",
@@ -1467,39 +1471,56 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1467 return 0; 1471 return 0;
1468} 1472}
1469 1473
1470static int check_cap_flush(struct ceph_inode_info *ci, 1474static int check_capsnap_flush(struct ceph_inode_info *ci,
1471 u64 want_flush_seq, u64 want_snap_seq) 1475 u64 want_snap_seq)
1472{ 1476{
1473 int ret1 = 1, ret2 = 1; 1477 int ret = 1;
1474 spin_lock(&ci->i_ceph_lock); 1478 spin_lock(&ci->i_ceph_lock);
1475 if (want_flush_seq > 0 && ci->i_flushing_caps)
1476 ret1 = ci->i_cap_flush_seq >= want_flush_seq;
1477
1478 if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { 1479 if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
1479 struct ceph_cap_snap *capsnap = 1480 struct ceph_cap_snap *capsnap =
1480 list_first_entry(&ci->i_cap_snaps, 1481 list_first_entry(&ci->i_cap_snaps,
1481 struct ceph_cap_snap, ci_item); 1482 struct ceph_cap_snap, ci_item);
1482 ret2 = capsnap->follows >= want_snap_seq; 1483 ret = capsnap->follows >= want_snap_seq;
1483 } 1484 }
1484 spin_unlock(&ci->i_ceph_lock); 1485 spin_unlock(&ci->i_ceph_lock);
1485 return ret1 && ret2; 1486 return ret;
1487}
1488
/*
 * Check whether every cap flush with tid <= @want_flush_tid has left the
 * client-wide tree of pending flushes (mdsc->cap_flush_tree).
 *
 * Only the first (leftmost) node of the tree is examined; the insert
 * helper keys the tree by tid, so that node carries the smallest pending
 * tid.
 *
 * Returns 1 if the tree is empty or its first entry has
 * tid > @want_flush_tid, and 0 if a flush with tid <= @want_flush_tid is
 * still pending.  Takes and releases mdsc->cap_dirty_lock around the
 * lookup.  Used in this patch as the wait_event() condition on
 * mdsc->cap_flushing_wq.
 */
1489static int check_caps_flush(struct ceph_mds_client *mdsc,
1490 u64 want_flush_tid)
1491{
1492 struct rb_node *n;
1493 struct ceph_cap_flush *cf;
1494 int ret = 1;
1495
1496 spin_lock(&mdsc->cap_dirty_lock);
1497 n = rb_first(&mdsc->cap_flush_tree);
1498 cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
1499 if (cf && cf->tid <= want_flush_tid) {
1500 dout("check_caps_flush still flushing tid %llu <= %llu\n",
1501 cf->tid, want_flush_tid);
1502 ret = 0;
1503 }
1504 spin_unlock(&mdsc->cap_dirty_lock);
1505 return ret;
1486} 1506}
1487 1507
1488/* 1508/*
1489 * flush all dirty inode data to disk. 1509 * flush all dirty inode data to disk.
1490 * 1510 *
1491 * returns true if we've flushed through want_flush_seq 1511 * returns true if we've flushed through want_flush_tid
1492 */ 1512 */
1493static void wait_caps_flush(struct ceph_mds_client *mdsc, 1513static void wait_caps_flush(struct ceph_mds_client *mdsc,
1494 u64 want_flush_seq, u64 want_snap_seq) 1514 u64 want_flush_tid, u64 want_snap_seq)
1495{ 1515{
1496 int mds; 1516 int mds;
1497 1517
1498 dout("check_cap_flush want %lld\n", want_flush_seq); 1518 dout("check_caps_flush want %llu snap want %llu\n",
1519 want_flush_tid, want_snap_seq);
1499 mutex_lock(&mdsc->mutex); 1520 mutex_lock(&mdsc->mutex);
1500 for (mds = 0; mds < mdsc->max_sessions; ) { 1521 for (mds = 0; mds < mdsc->max_sessions; ) {
1501 struct ceph_mds_session *session = mdsc->sessions[mds]; 1522 struct ceph_mds_session *session = mdsc->sessions[mds];
1502 struct inode *inode1 = NULL, *inode2 = NULL; 1523 struct inode *inode = NULL;
1503 1524
1504 if (!session) { 1525 if (!session) {
1505 mds++; 1526 mds++;
@@ -1509,58 +1530,40 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
1509 mutex_unlock(&mdsc->mutex); 1530 mutex_unlock(&mdsc->mutex);
1510 1531
1511 mutex_lock(&session->s_mutex); 1532 mutex_lock(&session->s_mutex);
1512 if (!list_empty(&session->s_cap_flushing)) {
1513 struct ceph_inode_info *ci =
1514 list_first_entry(&session->s_cap_flushing,
1515 struct ceph_inode_info,
1516 i_flushing_item);
1517
1518 if (!check_cap_flush(ci, want_flush_seq, 0)) {
1519 dout("check_cap_flush still flushing %p "
1520 "seq %lld <= %lld to mds%d\n",
1521 &ci->vfs_inode, ci->i_cap_flush_seq,
1522 want_flush_seq, mds);
1523 inode1 = igrab(&ci->vfs_inode);
1524 }
1525 }
1526 if (!list_empty(&session->s_cap_snaps_flushing)) { 1533 if (!list_empty(&session->s_cap_snaps_flushing)) {
1527 struct ceph_cap_snap *capsnap = 1534 struct ceph_cap_snap *capsnap =
1528 list_first_entry(&session->s_cap_snaps_flushing, 1535 list_first_entry(&session->s_cap_snaps_flushing,
1529 struct ceph_cap_snap, 1536 struct ceph_cap_snap,
1530 flushing_item); 1537 flushing_item);
1531 struct ceph_inode_info *ci = capsnap->ci; 1538 struct ceph_inode_info *ci = capsnap->ci;
1532 if (!check_cap_flush(ci, 0, want_snap_seq)) { 1539 if (!check_capsnap_flush(ci, want_snap_seq)) {
1533 dout("check_cap_flush still flushing snap %p " 1540 dout("check_cap_flush still flushing snap %p "
1534 "follows %lld <= %lld to mds%d\n", 1541 "follows %lld <= %lld to mds%d\n",
1535 &ci->vfs_inode, capsnap->follows, 1542 &ci->vfs_inode, capsnap->follows,
1536 want_snap_seq, mds); 1543 want_snap_seq, mds);
1537 inode2 = igrab(&ci->vfs_inode); 1544 inode = igrab(&ci->vfs_inode);
1538 } 1545 }
1539 } 1546 }
1540 mutex_unlock(&session->s_mutex); 1547 mutex_unlock(&session->s_mutex);
1541 ceph_put_mds_session(session); 1548 ceph_put_mds_session(session);
1542 1549
1543 if (inode1) { 1550 if (inode) {
1544 wait_event(mdsc->cap_flushing_wq,
1545 check_cap_flush(ceph_inode(inode1),
1546 want_flush_seq, 0));
1547 iput(inode1);
1548 }
1549 if (inode2) {
1550 wait_event(mdsc->cap_flushing_wq, 1551 wait_event(mdsc->cap_flushing_wq,
1551 check_cap_flush(ceph_inode(inode2), 1552 check_capsnap_flush(ceph_inode(inode),
1552 0, want_snap_seq)); 1553 want_snap_seq));
1553 iput(inode2); 1554 iput(inode);
1554 } 1555 } else {
1555
1556 if (!inode1 && !inode2)
1557 mds++; 1556 mds++;
1557 }
1558 1558
1559 mutex_lock(&mdsc->mutex); 1559 mutex_lock(&mdsc->mutex);
1560 } 1560 }
1561
1562 mutex_unlock(&mdsc->mutex); 1561 mutex_unlock(&mdsc->mutex);
1563 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); 1562
1563 wait_event(mdsc->cap_flushing_wq,
1564 check_caps_flush(mdsc, want_flush_tid));
1565
1566 dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
1564} 1567}
1565 1568
1566/* 1569/*
@@ -3426,8 +3429,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3426 spin_lock_init(&mdsc->cap_delay_lock); 3429 spin_lock_init(&mdsc->cap_delay_lock);
3427 INIT_LIST_HEAD(&mdsc->snap_flush_list); 3430 INIT_LIST_HEAD(&mdsc->snap_flush_list);
3428 spin_lock_init(&mdsc->snap_flush_lock); 3431 spin_lock_init(&mdsc->snap_flush_lock);
3429 mdsc->cap_flush_seq = 0;
3430 mdsc->last_cap_flush_tid = 1; 3432 mdsc->last_cap_flush_tid = 1;
3433 mdsc->cap_flush_tree = RB_ROOT;
3431 INIT_LIST_HEAD(&mdsc->cap_dirty); 3434 INIT_LIST_HEAD(&mdsc->cap_dirty);
3432 INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); 3435 INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
3433 mdsc->num_cap_flushing = 0; 3436 mdsc->num_cap_flushing = 0;
@@ -3554,7 +3557,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3554 3557
3555 ceph_flush_dirty_caps(mdsc); 3558 ceph_flush_dirty_caps(mdsc);
3556 spin_lock(&mdsc->cap_dirty_lock); 3559 spin_lock(&mdsc->cap_dirty_lock);
3557 want_flush = mdsc->cap_flush_seq; 3560 want_flush = mdsc->last_cap_flush_tid;
3558 spin_unlock(&mdsc->cap_dirty_lock); 3561 spin_unlock(&mdsc->cap_dirty_lock);
3559 3562
3560 down_read(&mdsc->snap_rwsem); 3563 down_read(&mdsc->snap_rwsem);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 19f6084203f0..470be4eb25f3 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -306,8 +306,8 @@ struct ceph_mds_client {
306 struct list_head snap_flush_list; /* cap_snaps ready to flush */ 306 struct list_head snap_flush_list; /* cap_snaps ready to flush */
307 spinlock_t snap_flush_lock; 307 spinlock_t snap_flush_lock;
308 308
309 u64 cap_flush_seq;
310 u64 last_cap_flush_tid; 309 u64 last_cap_flush_tid;
310 struct rb_root cap_flush_tree;
311 struct list_head cap_dirty; /* inodes with dirty caps */ 311 struct list_head cap_dirty; /* inodes with dirty caps */
312 struct list_head cap_dirty_migrating; /* ...that are migration... */ 312 struct list_head cap_dirty_migrating; /* ...that are migration... */
313 int num_cap_flushing; /* # caps we are flushing */ 313 int num_cap_flushing; /* # caps we are flushing */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cc597f52e046..94d91471165f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -189,6 +189,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
189struct ceph_cap_flush { 189struct ceph_cap_flush {
190 u64 tid; 190 u64 tid;
191 int caps; 191 int caps;
192 struct rb_node g_node;
192 union { 193 union {
193 struct rb_node i_node; 194 struct rb_node i_node;
194 struct list_head list; 195 struct list_head list;
@@ -304,7 +305,6 @@ struct ceph_inode_info {
304 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ 305 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
305 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ 306 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
306 struct list_head i_dirty_item, i_flushing_item; 307 struct list_head i_dirty_item, i_flushing_item;
307 u64 i_cap_flush_seq;
308 /* we need to track cap writeback on a per-cap-bit basis, to allow 308 /* we need to track cap writeback on a per-cap-bit basis, to allow
309 * overlapping, pipelined cap flushes to the mds. we can probably 309 * overlapping, pipelined cap flushes to the mds. we can probably
310 * reduce the tid to 8 bits if we're concerned about inode size. */ 310 * reduce the tid to 8 bits if we're concerned about inode size. */