about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- fs/ceph/caps.c 50
-rw-r--r-- fs/ceph/inode.c 1
-rw-r--r-- fs/ceph/mds_client.c 93
-rw-r--r-- fs/ceph/mds_client.h 2
-rw-r--r-- fs/ceph/super.h 2
5 files changed, 91 insertions, 57 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9a25f8d66fbc..0295048724d2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1415,6 +1415,29 @@ static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
1415 rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); 1415 rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
1416} 1416}
1417 1417
/*
 * Link @cf into mdsc->cap_flush_tree through its g_node, ordered by
 * cf->tid: a smaller tid descends to the left child, a larger tid to
 * the right child.  Finding a node with an equal tid triggers BUG().
 *
 * NOTE(review): the call site visible in this patch runs before
 * spin_unlock(&mdsc->cap_dirty_lock), so this presumably expects
 * cap_dirty_lock to be held -- confirm against the full file.
 */
1418static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
1419 struct ceph_cap_flush *cf)
1420{
1421 struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
1422 struct rb_node *parent = NULL;
1423 struct ceph_cap_flush *other = NULL;
1424
/* walk down from the root until an empty child slot is reached */
1425 while (*p) {
1426 parent = *p;
1427 other = rb_entry(parent, struct ceph_cap_flush, g_node);
1428
1429 if (cf->tid < other->tid)
1430 p = &(*p)->rb_left;
1431 else if (cf->tid > other->tid)
1432 p = &(*p)->rb_right;
1433 else
/* same tid already present in the tree */
1434 BUG();
1435 }
1436
/* attach at the slot found above, then rebalance/recolor the tree */
1437 rb_link_node(&cf->g_node, parent, p);
1438 rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
1439}
1440
1418/* 1441/*
1419 * Add dirty inode to the flushing list. Assigned a seq number so we 1442 * Add dirty inode to the flushing list. Assigned a seq number so we
1420 * can wait for caps to flush without starving. 1443 * can wait for caps to flush without starving.
@@ -1449,17 +1472,16 @@ static int __mark_caps_flushing(struct inode *inode,
1449 list_del_init(&ci->i_dirty_item); 1472 list_del_init(&ci->i_dirty_item);
1450 1473
1451 cf->tid = ++mdsc->last_cap_flush_tid; 1474 cf->tid = ++mdsc->last_cap_flush_tid;
1475 __add_cap_flushing_to_mdsc(mdsc, cf);
1452 1476
1453 if (list_empty(&ci->i_flushing_item)) { 1477 if (list_empty(&ci->i_flushing_item)) {
1454 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1455 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1478 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1456 mdsc->num_cap_flushing++; 1479 mdsc->num_cap_flushing++;
1457 dout(" inode %p now flushing seq %lld\n", inode, 1480 dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
1458 ci->i_cap_flush_seq);
1459 } else { 1481 } else {
1460 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1482 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1461 dout(" inode %p now flushing (more) seq %lld\n", inode, 1483 dout(" inode %p now flushing (more) tid %llu\n",
1462 ci->i_cap_flush_seq); 1484 inode, cf->tid);
1463 } 1485 }
1464 spin_unlock(&mdsc->cap_dirty_lock); 1486 spin_unlock(&mdsc->cap_dirty_lock);
1465 1487
@@ -2123,8 +2145,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
2123 2145
2124 spin_lock(&ci->i_ceph_lock); 2146 spin_lock(&ci->i_ceph_lock);
2125 cap = ci->i_auth_cap; 2147 cap = ci->i_auth_cap;
2126 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, 2148 dout("kick_flushing_inode_caps %p flushing %s\n", inode,
2127 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); 2149 ceph_cap_string(ci->i_flushing_caps));
2128 2150
2129 __ceph_flush_snaps(ci, &session, 1); 2151 __ceph_flush_snaps(ci, &session, 1);
2130 2152
@@ -2921,12 +2943,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2921 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), 2943 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2922 ceph_cap_string(ci->i_flushing_caps & ~cleaned)); 2944 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2923 2945
2924 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) 2946 if (list_empty(&to_remove) && !cleaned)
2925 goto out; 2947 goto out;
2926 2948
2927 ci->i_flushing_caps &= ~cleaned; 2949 ci->i_flushing_caps &= ~cleaned;
2928 2950
2929 spin_lock(&mdsc->cap_dirty_lock); 2951 spin_lock(&mdsc->cap_dirty_lock);
2952
2953 if (!list_empty(&to_remove)) {
2954 list_for_each_entry(cf, &to_remove, list)
2955 rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
2956
2957 n = rb_first(&mdsc->cap_flush_tree);
2958 cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
2959 if (!cf || cf->tid > flush_tid)
2960 wake_up_all(&mdsc->cap_flushing_wq);
2961 }
2962
2930 if (ci->i_flushing_caps == 0) { 2963 if (ci->i_flushing_caps == 0) {
2931 list_del_init(&ci->i_flushing_item); 2964 list_del_init(&ci->i_flushing_item);
2932 if (!list_empty(&session->s_cap_flushing)) 2965 if (!list_empty(&session->s_cap_flushing))
@@ -2936,7 +2969,6 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2936 struct ceph_inode_info, 2969 struct ceph_inode_info,
2937 i_flushing_item)->vfs_inode); 2970 i_flushing_item)->vfs_inode);
2938 mdsc->num_cap_flushing--; 2971 mdsc->num_cap_flushing--;
2939 wake_up_all(&mdsc->cap_flushing_wq);
2940 dout(" inode %p now !flushing\n", inode); 2972 dout(" inode %p now !flushing\n", inode);
2941 2973
2942 if (ci->i_dirty_caps == 0) { 2974 if (ci->i_dirty_caps == 0) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 6d3f19db8c8a..3326302f5884 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -416,7 +416,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
416 ci->i_flushing_caps = 0; 416 ci->i_flushing_caps = 0;
417 INIT_LIST_HEAD(&ci->i_dirty_item); 417 INIT_LIST_HEAD(&ci->i_dirty_item);
418 INIT_LIST_HEAD(&ci->i_flushing_item); 418 INIT_LIST_HEAD(&ci->i_flushing_item);
419 ci->i_cap_flush_seq = 0;
420 ci->i_cap_flush_tree = RB_ROOT; 419 ci->i_cap_flush_tree = RB_ROOT;
421 init_waitqueue_head(&ci->i_cap_wq); 420 init_waitqueue_head(&ci->i_cap_wq);
422 ci->i_hold_caps_min = 0; 421 ci->i_hold_caps_min = 0;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 839901f51512..31f6a78caa0a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1164,6 +1164,10 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1164 } 1164 }
1165 1165
1166 spin_lock(&mdsc->cap_dirty_lock); 1166 spin_lock(&mdsc->cap_dirty_lock);
1167
1168 list_for_each_entry(cf, &to_remove, list)
1169 rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
1170
1167 if (!list_empty(&ci->i_dirty_item)) { 1171 if (!list_empty(&ci->i_dirty_item)) {
1168 pr_warn_ratelimited( 1172 pr_warn_ratelimited(
1169 " dropping dirty %s state for %p %lld\n", 1173 " dropping dirty %s state for %p %lld\n",
@@ -1467,39 +1471,56 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1467 return 0; 1471 return 0;
1468} 1472}
1469 1473
1470static int check_cap_flush(struct ceph_inode_info *ci, 1474static int check_capsnap_flush(struct ceph_inode_info *ci,
1471 u64 want_flush_seq, u64 want_snap_seq) 1475 u64 want_snap_seq)
1472{ 1476{
1473 int ret1 = 1, ret2 = 1; 1477 int ret = 1;
1474 spin_lock(&ci->i_ceph_lock); 1478 spin_lock(&ci->i_ceph_lock);
1475 if (want_flush_seq > 0 && ci->i_flushing_caps)
1476 ret1 = ci->i_cap_flush_seq >= want_flush_seq;
1477
1478 if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { 1479 if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
1479 struct ceph_cap_snap *capsnap = 1480 struct ceph_cap_snap *capsnap =
1480 list_first_entry(&ci->i_cap_snaps, 1481 list_first_entry(&ci->i_cap_snaps,
1481 struct ceph_cap_snap, ci_item); 1482 struct ceph_cap_snap, ci_item);
1482 ret2 = capsnap->follows >= want_snap_seq; 1483 ret = capsnap->follows >= want_snap_seq;
1483 } 1484 }
1484 spin_unlock(&ci->i_ceph_lock); 1485 spin_unlock(&ci->i_ceph_lock);
1485 return ret1 && ret2; 1486 return ret;
1487}
1488
/*
 * Report whether no flush with tid <= @want_flush_tid is still recorded
 * in mdsc->cap_flush_tree.
 *
 * Looks only at the first (leftmost) node of the tree, under
 * mdsc->cap_dirty_lock.  Returns 0 if that node exists and its tid is
 * <= @want_flush_tid; returns 1 if the tree is empty or the first tid
 * is greater than @want_flush_tid.
 */
1489static int check_caps_flush(struct ceph_mds_client *mdsc,
1490 u64 want_flush_tid)
1491{
1492 struct rb_node *n;
1493 struct ceph_cap_flush *cf;
1494 int ret = 1;
1495
1496 spin_lock(&mdsc->cap_dirty_lock);
1497 n = rb_first(&mdsc->cap_flush_tree);
/* n is NULL when the tree is empty; cf is then NULL as well */
1498 cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
1499 if (cf && cf->tid <= want_flush_tid) {
1500 dout("check_caps_flush still flushing tid %llu <= %llu\n",
1501 cf->tid, want_flush_tid);
1502 ret = 0;
1503 }
1504 spin_unlock(&mdsc->cap_dirty_lock);
1505 return ret;
1486} 1506}
1487 1507
1488/* 1508/*
1489 * flush all dirty inode data to disk. 1509 * flush all dirty inode data to disk.
1490 * 1510 *
1491 * returns true if we've flushed through want_flush_seq 1511 * returns true if we've flushed through want_flush_tid
1492 */ 1512 */
1493static void wait_caps_flush(struct ceph_mds_client *mdsc, 1513static void wait_caps_flush(struct ceph_mds_client *mdsc,
1494 u64 want_flush_seq, u64 want_snap_seq) 1514 u64 want_flush_tid, u64 want_snap_seq)
1495{ 1515{
1496 int mds; 1516 int mds;
1497 1517
1498 dout("check_cap_flush want %lld\n", want_flush_seq); 1518 dout("check_caps_flush want %llu snap want %llu\n",
1519 want_flush_tid, want_snap_seq);
1499 mutex_lock(&mdsc->mutex); 1520 mutex_lock(&mdsc->mutex);
1500 for (mds = 0; mds < mdsc->max_sessions; ) { 1521 for (mds = 0; mds < mdsc->max_sessions; ) {
1501 struct ceph_mds_session *session = mdsc->sessions[mds]; 1522 struct ceph_mds_session *session = mdsc->sessions[mds];
1502 struct inode *inode1 = NULL, *inode2 = NULL; 1523 struct inode *inode = NULL;
1503 1524
1504 if (!session) { 1525 if (!session) {
1505 mds++; 1526 mds++;
@@ -1509,58 +1530,40 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
1509 mutex_unlock(&mdsc->mutex); 1530 mutex_unlock(&mdsc->mutex);
1510 1531
1511 mutex_lock(&session->s_mutex); 1532 mutex_lock(&session->s_mutex);
1512 if (!list_empty(&session->s_cap_flushing)) {
1513 struct ceph_inode_info *ci =
1514 list_first_entry(&session->s_cap_flushing,
1515 struct ceph_inode_info,
1516 i_flushing_item);
1517
1518 if (!check_cap_flush(ci, want_flush_seq, 0)) {
1519 dout("check_cap_flush still flushing %p "
1520 "seq %lld <= %lld to mds%d\n",
1521 &ci->vfs_inode, ci->i_cap_flush_seq,
1522 want_flush_seq, mds);
1523 inode1 = igrab(&ci->vfs_inode);
1524 }
1525 }
1526 if (!list_empty(&session->s_cap_snaps_flushing)) { 1533 if (!list_empty(&session->s_cap_snaps_flushing)) {
1527 struct ceph_cap_snap *capsnap = 1534 struct ceph_cap_snap *capsnap =
1528 list_first_entry(&session->s_cap_snaps_flushing, 1535 list_first_entry(&session->s_cap_snaps_flushing,
1529 struct ceph_cap_snap, 1536 struct ceph_cap_snap,
1530 flushing_item); 1537 flushing_item);
1531 struct ceph_inode_info *ci = capsnap->ci; 1538 struct ceph_inode_info *ci = capsnap->ci;
1532 if (!check_cap_flush(ci, 0, want_snap_seq)) { 1539 if (!check_capsnap_flush(ci, want_snap_seq)) {
1533 dout("check_cap_flush still flushing snap %p " 1540 dout("check_cap_flush still flushing snap %p "
1534 "follows %lld <= %lld to mds%d\n", 1541 "follows %lld <= %lld to mds%d\n",
1535 &ci->vfs_inode, capsnap->follows, 1542 &ci->vfs_inode, capsnap->follows,
1536 want_snap_seq, mds); 1543 want_snap_seq, mds);
1537 inode2 = igrab(&ci->vfs_inode); 1544 inode = igrab(&ci->vfs_inode);
1538 } 1545 }
1539 } 1546 }
1540 mutex_unlock(&session->s_mutex); 1547 mutex_unlock(&session->s_mutex);
1541 ceph_put_mds_session(session); 1548 ceph_put_mds_session(session);
1542 1549
1543 if (inode1) { 1550 if (inode) {
1544 wait_event(mdsc->cap_flushing_wq,
1545 check_cap_flush(ceph_inode(inode1),
1546 want_flush_seq, 0));
1547 iput(inode1);
1548 }
1549 if (inode2) {
1550 wait_event(mdsc->cap_flushing_wq, 1551 wait_event(mdsc->cap_flushing_wq,
1551 check_cap_flush(ceph_inode(inode2), 1552 check_capsnap_flush(ceph_inode(inode),
1552 0, want_snap_seq)); 1553 want_snap_seq));
1553 iput(inode2); 1554 iput(inode);
1554 } 1555 } else {
1555
1556 if (!inode1 && !inode2)
1557 mds++; 1556 mds++;
1557 }
1558 1558
1559 mutex_lock(&mdsc->mutex); 1559 mutex_lock(&mdsc->mutex);
1560 } 1560 }
1561
1562 mutex_unlock(&mdsc->mutex); 1561 mutex_unlock(&mdsc->mutex);
1563 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); 1562
1563 wait_event(mdsc->cap_flushing_wq,
1564 check_caps_flush(mdsc, want_flush_tid));
1565
1566 dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
1564} 1567}
1565 1568
1566/* 1569/*
@@ -3426,8 +3429,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3426 spin_lock_init(&mdsc->cap_delay_lock); 3429 spin_lock_init(&mdsc->cap_delay_lock);
3427 INIT_LIST_HEAD(&mdsc->snap_flush_list); 3430 INIT_LIST_HEAD(&mdsc->snap_flush_list);
3428 spin_lock_init(&mdsc->snap_flush_lock); 3431 spin_lock_init(&mdsc->snap_flush_lock);
3429 mdsc->cap_flush_seq = 0;
3430 mdsc->last_cap_flush_tid = 1; 3432 mdsc->last_cap_flush_tid = 1;
3433 mdsc->cap_flush_tree = RB_ROOT;
3431 INIT_LIST_HEAD(&mdsc->cap_dirty); 3434 INIT_LIST_HEAD(&mdsc->cap_dirty);
3432 INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); 3435 INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
3433 mdsc->num_cap_flushing = 0; 3436 mdsc->num_cap_flushing = 0;
@@ -3554,7 +3557,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3554 3557
3555 ceph_flush_dirty_caps(mdsc); 3558 ceph_flush_dirty_caps(mdsc);
3556 spin_lock(&mdsc->cap_dirty_lock); 3559 spin_lock(&mdsc->cap_dirty_lock);
3557 want_flush = mdsc->cap_flush_seq; 3560 want_flush = mdsc->last_cap_flush_tid;
3558 spin_unlock(&mdsc->cap_dirty_lock); 3561 spin_unlock(&mdsc->cap_dirty_lock);
3559 3562
3560 down_read(&mdsc->snap_rwsem); 3563 down_read(&mdsc->snap_rwsem);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 19f6084203f0..470be4eb25f3 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -306,8 +306,8 @@ struct ceph_mds_client {
306 struct list_head snap_flush_list; /* cap_snaps ready to flush */ 306 struct list_head snap_flush_list; /* cap_snaps ready to flush */
307 spinlock_t snap_flush_lock; 307 spinlock_t snap_flush_lock;
308 308
309 u64 cap_flush_seq;
310 u64 last_cap_flush_tid; 309 u64 last_cap_flush_tid;
310 struct rb_root cap_flush_tree;
311 struct list_head cap_dirty; /* inodes with dirty caps */ 311 struct list_head cap_dirty; /* inodes with dirty caps */
312 struct list_head cap_dirty_migrating; /* ...that are migration... */ 312 struct list_head cap_dirty_migrating; /* ...that are migration... */
313 int num_cap_flushing; /* # caps we are flushing */ 313 int num_cap_flushing; /* # caps we are flushing */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cc597f52e046..94d91471165f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -189,6 +189,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
189struct ceph_cap_flush { 189struct ceph_cap_flush {
190 u64 tid; 190 u64 tid;
191 int caps; 191 int caps;
192 struct rb_node g_node;
192 union { 193 union {
193 struct rb_node i_node; 194 struct rb_node i_node;
194 struct list_head list; 195 struct list_head list;
@@ -304,7 +305,6 @@ struct ceph_inode_info {
304 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ 305 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
305 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ 306 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
306 struct list_head i_dirty_item, i_flushing_item; 307 struct list_head i_dirty_item, i_flushing_item;
307 u64 i_cap_flush_seq;
308 /* we need to track cap writeback on a per-cap-bit basis, to allow 308 /* we need to track cap writeback on a per-cap-bit basis, to allow
309 * overlapping, pipelined cap flushes to the mds. we can probably 309 * overlapping, pipelined cap flushes to the mds. we can probably
310 * reduce the tid to 8 bits if we're concerned about inode size. */ 310 * reduce the tid to 8 bits if we're concerned about inode size. */