diff options
-rw-r--r-- | fs/ceph/caps.c | 50 | ||||
-rw-r--r-- | fs/ceph/inode.c | 1 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 93 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 2 | ||||
-rw-r--r-- | fs/ceph/super.h | 2 |
5 files changed, 91 insertions, 57 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9a25f8d66fbc..0295048724d2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1415,6 +1415,29 @@ static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci, | |||
1415 | rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); | 1415 | rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); |
1416 | } | 1416 | } |
1417 | 1417 | ||
1418 | static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc, | ||
1419 | struct ceph_cap_flush *cf) | ||
1420 | { | ||
1421 | struct rb_node **p = &mdsc->cap_flush_tree.rb_node; | ||
1422 | struct rb_node *parent = NULL; | ||
1423 | struct ceph_cap_flush *other = NULL; | ||
1424 | |||
1425 | while (*p) { | ||
1426 | parent = *p; | ||
1427 | other = rb_entry(parent, struct ceph_cap_flush, g_node); | ||
1428 | |||
1429 | if (cf->tid < other->tid) | ||
1430 | p = &(*p)->rb_left; | ||
1431 | else if (cf->tid > other->tid) | ||
1432 | p = &(*p)->rb_right; | ||
1433 | else | ||
1434 | BUG(); | ||
1435 | } | ||
1436 | |||
1437 | rb_link_node(&cf->g_node, parent, p); | ||
1438 | rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); | ||
1439 | } | ||
1440 | |||
1418 | /* | 1441 | /* |
1419 | * Add dirty inode to the flushing list. Assigned a seq number so we | 1442 | * Add dirty inode to the flushing list. Assigned a seq number so we |
1420 | * can wait for caps to flush without starving. | 1443 | * can wait for caps to flush without starving. |
@@ -1449,17 +1472,16 @@ static int __mark_caps_flushing(struct inode *inode, | |||
1449 | list_del_init(&ci->i_dirty_item); | 1472 | list_del_init(&ci->i_dirty_item); |
1450 | 1473 | ||
1451 | cf->tid = ++mdsc->last_cap_flush_tid; | 1474 | cf->tid = ++mdsc->last_cap_flush_tid; |
1475 | __add_cap_flushing_to_mdsc(mdsc, cf); | ||
1452 | 1476 | ||
1453 | if (list_empty(&ci->i_flushing_item)) { | 1477 | if (list_empty(&ci->i_flushing_item)) { |
1454 | ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; | ||
1455 | list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); | 1478 | list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); |
1456 | mdsc->num_cap_flushing++; | 1479 | mdsc->num_cap_flushing++; |
1457 | dout(" inode %p now flushing seq %lld\n", inode, | 1480 | dout(" inode %p now flushing tid %llu\n", inode, cf->tid); |
1458 | ci->i_cap_flush_seq); | ||
1459 | } else { | 1481 | } else { |
1460 | list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); | 1482 | list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); |
1461 | dout(" inode %p now flushing (more) seq %lld\n", inode, | 1483 | dout(" inode %p now flushing (more) tid %llu\n", |
1462 | ci->i_cap_flush_seq); | 1484 | inode, cf->tid); |
1463 | } | 1485 | } |
1464 | spin_unlock(&mdsc->cap_dirty_lock); | 1486 | spin_unlock(&mdsc->cap_dirty_lock); |
1465 | 1487 | ||
@@ -2123,8 +2145,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, | |||
2123 | 2145 | ||
2124 | spin_lock(&ci->i_ceph_lock); | 2146 | spin_lock(&ci->i_ceph_lock); |
2125 | cap = ci->i_auth_cap; | 2147 | cap = ci->i_auth_cap; |
2126 | dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, | 2148 | dout("kick_flushing_inode_caps %p flushing %s\n", inode, |
2127 | ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); | 2149 | ceph_cap_string(ci->i_flushing_caps)); |
2128 | 2150 | ||
2129 | __ceph_flush_snaps(ci, &session, 1); | 2151 | __ceph_flush_snaps(ci, &session, 1); |
2130 | 2152 | ||
@@ -2921,12 +2943,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
2921 | ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), | 2943 | ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), |
2922 | ceph_cap_string(ci->i_flushing_caps & ~cleaned)); | 2944 | ceph_cap_string(ci->i_flushing_caps & ~cleaned)); |
2923 | 2945 | ||
2924 | if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) | 2946 | if (list_empty(&to_remove) && !cleaned) |
2925 | goto out; | 2947 | goto out; |
2926 | 2948 | ||
2927 | ci->i_flushing_caps &= ~cleaned; | 2949 | ci->i_flushing_caps &= ~cleaned; |
2928 | 2950 | ||
2929 | spin_lock(&mdsc->cap_dirty_lock); | 2951 | spin_lock(&mdsc->cap_dirty_lock); |
2952 | |||
2953 | if (!list_empty(&to_remove)) { | ||
2954 | list_for_each_entry(cf, &to_remove, list) | ||
2955 | rb_erase(&cf->g_node, &mdsc->cap_flush_tree); | ||
2956 | |||
2957 | n = rb_first(&mdsc->cap_flush_tree); | ||
2958 | cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; | ||
2959 | if (!cf || cf->tid > flush_tid) | ||
2960 | wake_up_all(&mdsc->cap_flushing_wq); | ||
2961 | } | ||
2962 | |||
2930 | if (ci->i_flushing_caps == 0) { | 2963 | if (ci->i_flushing_caps == 0) { |
2931 | list_del_init(&ci->i_flushing_item); | 2964 | list_del_init(&ci->i_flushing_item); |
2932 | if (!list_empty(&session->s_cap_flushing)) | 2965 | if (!list_empty(&session->s_cap_flushing)) |
@@ -2936,7 +2969,6 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
2936 | struct ceph_inode_info, | 2969 | struct ceph_inode_info, |
2937 | i_flushing_item)->vfs_inode); | 2970 | i_flushing_item)->vfs_inode); |
2938 | mdsc->num_cap_flushing--; | 2971 | mdsc->num_cap_flushing--; |
2939 | wake_up_all(&mdsc->cap_flushing_wq); | ||
2940 | dout(" inode %p now !flushing\n", inode); | 2972 | dout(" inode %p now !flushing\n", inode); |
2941 | 2973 | ||
2942 | if (ci->i_dirty_caps == 0) { | 2974 | if (ci->i_dirty_caps == 0) { |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 6d3f19db8c8a..3326302f5884 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -416,7 +416,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) | |||
416 | ci->i_flushing_caps = 0; | 416 | ci->i_flushing_caps = 0; |
417 | INIT_LIST_HEAD(&ci->i_dirty_item); | 417 | INIT_LIST_HEAD(&ci->i_dirty_item); |
418 | INIT_LIST_HEAD(&ci->i_flushing_item); | 418 | INIT_LIST_HEAD(&ci->i_flushing_item); |
419 | ci->i_cap_flush_seq = 0; | ||
420 | ci->i_cap_flush_tree = RB_ROOT; | 419 | ci->i_cap_flush_tree = RB_ROOT; |
421 | init_waitqueue_head(&ci->i_cap_wq); | 420 | init_waitqueue_head(&ci->i_cap_wq); |
422 | ci->i_hold_caps_min = 0; | 421 | ci->i_hold_caps_min = 0; |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 839901f51512..31f6a78caa0a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1164,6 +1164,10 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1164 | } | 1164 | } |
1165 | 1165 | ||
1166 | spin_lock(&mdsc->cap_dirty_lock); | 1166 | spin_lock(&mdsc->cap_dirty_lock); |
1167 | |||
1168 | list_for_each_entry(cf, &to_remove, list) | ||
1169 | rb_erase(&cf->g_node, &mdsc->cap_flush_tree); | ||
1170 | |||
1167 | if (!list_empty(&ci->i_dirty_item)) { | 1171 | if (!list_empty(&ci->i_dirty_item)) { |
1168 | pr_warn_ratelimited( | 1172 | pr_warn_ratelimited( |
1169 | " dropping dirty %s state for %p %lld\n", | 1173 | " dropping dirty %s state for %p %lld\n", |
@@ -1467,39 +1471,56 @@ static int trim_caps(struct ceph_mds_client *mdsc, | |||
1467 | return 0; | 1471 | return 0; |
1468 | } | 1472 | } |
1469 | 1473 | ||
1470 | static int check_cap_flush(struct ceph_inode_info *ci, | 1474 | static int check_capsnap_flush(struct ceph_inode_info *ci, |
1471 | u64 want_flush_seq, u64 want_snap_seq) | 1475 | u64 want_snap_seq) |
1472 | { | 1476 | { |
1473 | int ret1 = 1, ret2 = 1; | 1477 | int ret = 1; |
1474 | spin_lock(&ci->i_ceph_lock); | 1478 | spin_lock(&ci->i_ceph_lock); |
1475 | if (want_flush_seq > 0 && ci->i_flushing_caps) | ||
1476 | ret1 = ci->i_cap_flush_seq >= want_flush_seq; | ||
1477 | |||
1478 | if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { | 1479 | if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { |
1479 | struct ceph_cap_snap *capsnap = | 1480 | struct ceph_cap_snap *capsnap = |
1480 | list_first_entry(&ci->i_cap_snaps, | 1481 | list_first_entry(&ci->i_cap_snaps, |
1481 | struct ceph_cap_snap, ci_item); | 1482 | struct ceph_cap_snap, ci_item); |
1482 | ret2 = capsnap->follows >= want_snap_seq; | 1483 | ret = capsnap->follows >= want_snap_seq; |
1483 | } | 1484 | } |
1484 | spin_unlock(&ci->i_ceph_lock); | 1485 | spin_unlock(&ci->i_ceph_lock); |
1485 | return ret1 && ret2; | 1486 | return ret; |
1487 | } | ||
1488 | |||
1489 | static int check_caps_flush(struct ceph_mds_client *mdsc, | ||
1490 | u64 want_flush_tid) | ||
1491 | { | ||
1492 | struct rb_node *n; | ||
1493 | struct ceph_cap_flush *cf; | ||
1494 | int ret = 1; | ||
1495 | |||
1496 | spin_lock(&mdsc->cap_dirty_lock); | ||
1497 | n = rb_first(&mdsc->cap_flush_tree); | ||
1498 | cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; | ||
1499 | if (cf && cf->tid <= want_flush_tid) { | ||
1500 | dout("check_caps_flush still flushing tid %llu <= %llu\n", | ||
1501 | cf->tid, want_flush_tid); | ||
1502 | ret = 0; | ||
1503 | } | ||
1504 | spin_unlock(&mdsc->cap_dirty_lock); | ||
1505 | return ret; | ||
1486 | } | 1506 | } |
1487 | 1507 | ||
1488 | /* | 1508 | /* |
1489 | * flush all dirty inode data to disk. | 1509 | * flush all dirty inode data to disk. |
1490 | * | 1510 | * |
1491 | * returns true if we've flushed through want_flush_seq | 1511 | * returns true if we've flushed through want_flush_tid |
1492 | */ | 1512 | */ |
1493 | static void wait_caps_flush(struct ceph_mds_client *mdsc, | 1513 | static void wait_caps_flush(struct ceph_mds_client *mdsc, |
1494 | u64 want_flush_seq, u64 want_snap_seq) | 1514 | u64 want_flush_tid, u64 want_snap_seq) |
1495 | { | 1515 | { |
1496 | int mds; | 1516 | int mds; |
1497 | 1517 | ||
1498 | dout("check_cap_flush want %lld\n", want_flush_seq); | 1518 | dout("check_caps_flush want %llu snap want %llu\n", |
1519 | want_flush_tid, want_snap_seq); | ||
1499 | mutex_lock(&mdsc->mutex); | 1520 | mutex_lock(&mdsc->mutex); |
1500 | for (mds = 0; mds < mdsc->max_sessions; ) { | 1521 | for (mds = 0; mds < mdsc->max_sessions; ) { |
1501 | struct ceph_mds_session *session = mdsc->sessions[mds]; | 1522 | struct ceph_mds_session *session = mdsc->sessions[mds]; |
1502 | struct inode *inode1 = NULL, *inode2 = NULL; | 1523 | struct inode *inode = NULL; |
1503 | 1524 | ||
1504 | if (!session) { | 1525 | if (!session) { |
1505 | mds++; | 1526 | mds++; |
@@ -1509,58 +1530,40 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, | |||
1509 | mutex_unlock(&mdsc->mutex); | 1530 | mutex_unlock(&mdsc->mutex); |
1510 | 1531 | ||
1511 | mutex_lock(&session->s_mutex); | 1532 | mutex_lock(&session->s_mutex); |
1512 | if (!list_empty(&session->s_cap_flushing)) { | ||
1513 | struct ceph_inode_info *ci = | ||
1514 | list_first_entry(&session->s_cap_flushing, | ||
1515 | struct ceph_inode_info, | ||
1516 | i_flushing_item); | ||
1517 | |||
1518 | if (!check_cap_flush(ci, want_flush_seq, 0)) { | ||
1519 | dout("check_cap_flush still flushing %p " | ||
1520 | "seq %lld <= %lld to mds%d\n", | ||
1521 | &ci->vfs_inode, ci->i_cap_flush_seq, | ||
1522 | want_flush_seq, mds); | ||
1523 | inode1 = igrab(&ci->vfs_inode); | ||
1524 | } | ||
1525 | } | ||
1526 | if (!list_empty(&session->s_cap_snaps_flushing)) { | 1533 | if (!list_empty(&session->s_cap_snaps_flushing)) { |
1527 | struct ceph_cap_snap *capsnap = | 1534 | struct ceph_cap_snap *capsnap = |
1528 | list_first_entry(&session->s_cap_snaps_flushing, | 1535 | list_first_entry(&session->s_cap_snaps_flushing, |
1529 | struct ceph_cap_snap, | 1536 | struct ceph_cap_snap, |
1530 | flushing_item); | 1537 | flushing_item); |
1531 | struct ceph_inode_info *ci = capsnap->ci; | 1538 | struct ceph_inode_info *ci = capsnap->ci; |
1532 | if (!check_cap_flush(ci, 0, want_snap_seq)) { | 1539 | if (!check_capsnap_flush(ci, want_snap_seq)) { |
1533 | dout("check_cap_flush still flushing snap %p " | 1540 | dout("check_cap_flush still flushing snap %p " |
1534 | "follows %lld <= %lld to mds%d\n", | 1541 | "follows %lld <= %lld to mds%d\n", |
1535 | &ci->vfs_inode, capsnap->follows, | 1542 | &ci->vfs_inode, capsnap->follows, |
1536 | want_snap_seq, mds); | 1543 | want_snap_seq, mds); |
1537 | inode2 = igrab(&ci->vfs_inode); | 1544 | inode = igrab(&ci->vfs_inode); |
1538 | } | 1545 | } |
1539 | } | 1546 | } |
1540 | mutex_unlock(&session->s_mutex); | 1547 | mutex_unlock(&session->s_mutex); |
1541 | ceph_put_mds_session(session); | 1548 | ceph_put_mds_session(session); |
1542 | 1549 | ||
1543 | if (inode1) { | 1550 | if (inode) { |
1544 | wait_event(mdsc->cap_flushing_wq, | ||
1545 | check_cap_flush(ceph_inode(inode1), | ||
1546 | want_flush_seq, 0)); | ||
1547 | iput(inode1); | ||
1548 | } | ||
1549 | if (inode2) { | ||
1550 | wait_event(mdsc->cap_flushing_wq, | 1551 | wait_event(mdsc->cap_flushing_wq, |
1551 | check_cap_flush(ceph_inode(inode2), | 1552 | check_capsnap_flush(ceph_inode(inode), |
1552 | 0, want_snap_seq)); | 1553 | want_snap_seq)); |
1553 | iput(inode2); | 1554 | iput(inode); |
1554 | } | 1555 | } else { |
1555 | |||
1556 | if (!inode1 && !inode2) | ||
1557 | mds++; | 1556 | mds++; |
1557 | } | ||
1558 | 1558 | ||
1559 | mutex_lock(&mdsc->mutex); | 1559 | mutex_lock(&mdsc->mutex); |
1560 | } | 1560 | } |
1561 | |||
1562 | mutex_unlock(&mdsc->mutex); | 1561 | mutex_unlock(&mdsc->mutex); |
1563 | dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); | 1562 | |
1563 | wait_event(mdsc->cap_flushing_wq, | ||
1564 | check_caps_flush(mdsc, want_flush_tid)); | ||
1565 | |||
1566 | dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid); | ||
1564 | } | 1567 | } |
1565 | 1568 | ||
1566 | /* | 1569 | /* |
@@ -3426,8 +3429,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) | |||
3426 | spin_lock_init(&mdsc->cap_delay_lock); | 3429 | spin_lock_init(&mdsc->cap_delay_lock); |
3427 | INIT_LIST_HEAD(&mdsc->snap_flush_list); | 3430 | INIT_LIST_HEAD(&mdsc->snap_flush_list); |
3428 | spin_lock_init(&mdsc->snap_flush_lock); | 3431 | spin_lock_init(&mdsc->snap_flush_lock); |
3429 | mdsc->cap_flush_seq = 0; | ||
3430 | mdsc->last_cap_flush_tid = 1; | 3432 | mdsc->last_cap_flush_tid = 1; |
3433 | mdsc->cap_flush_tree = RB_ROOT; | ||
3431 | INIT_LIST_HEAD(&mdsc->cap_dirty); | 3434 | INIT_LIST_HEAD(&mdsc->cap_dirty); |
3432 | INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); | 3435 | INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); |
3433 | mdsc->num_cap_flushing = 0; | 3436 | mdsc->num_cap_flushing = 0; |
@@ -3554,7 +3557,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) | |||
3554 | 3557 | ||
3555 | ceph_flush_dirty_caps(mdsc); | 3558 | ceph_flush_dirty_caps(mdsc); |
3556 | spin_lock(&mdsc->cap_dirty_lock); | 3559 | spin_lock(&mdsc->cap_dirty_lock); |
3557 | want_flush = mdsc->cap_flush_seq; | 3560 | want_flush = mdsc->last_cap_flush_tid; |
3558 | spin_unlock(&mdsc->cap_dirty_lock); | 3561 | spin_unlock(&mdsc->cap_dirty_lock); |
3559 | 3562 | ||
3560 | down_read(&mdsc->snap_rwsem); | 3563 | down_read(&mdsc->snap_rwsem); |
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 19f6084203f0..470be4eb25f3 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -306,8 +306,8 @@ struct ceph_mds_client { | |||
306 | struct list_head snap_flush_list; /* cap_snaps ready to flush */ | 306 | struct list_head snap_flush_list; /* cap_snaps ready to flush */ |
307 | spinlock_t snap_flush_lock; | 307 | spinlock_t snap_flush_lock; |
308 | 308 | ||
309 | u64 cap_flush_seq; | ||
310 | u64 last_cap_flush_tid; | 309 | u64 last_cap_flush_tid; |
310 | struct rb_root cap_flush_tree; | ||
311 | struct list_head cap_dirty; /* inodes with dirty caps */ | 311 | struct list_head cap_dirty; /* inodes with dirty caps */ |
312 | struct list_head cap_dirty_migrating; /* ...that are migration... */ | 312 | struct list_head cap_dirty_migrating; /* ...that are migration... */ |
313 | int num_cap_flushing; /* # caps we are flushing */ | 313 | int num_cap_flushing; /* # caps we are flushing */ |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cc597f52e046..94d91471165f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -189,6 +189,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) | |||
189 | struct ceph_cap_flush { | 189 | struct ceph_cap_flush { |
190 | u64 tid; | 190 | u64 tid; |
191 | int caps; | 191 | int caps; |
192 | struct rb_node g_node; | ||
192 | union { | 193 | union { |
193 | struct rb_node i_node; | 194 | struct rb_node i_node; |
194 | struct list_head list; | 195 | struct list_head list; |
@@ -304,7 +305,6 @@ struct ceph_inode_info { | |||
304 | struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ | 305 | struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ |
305 | unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ | 306 | unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ |
306 | struct list_head i_dirty_item, i_flushing_item; | 307 | struct list_head i_dirty_item, i_flushing_item; |
307 | u64 i_cap_flush_seq; | ||
308 | /* we need to track cap writeback on a per-cap-bit basis, to allow | 308 | /* we need to track cap writeback on a per-cap-bit basis, to allow |
309 | * overlapping, pipelined cap flushes to the mds. we can probably | 309 | * overlapping, pipelined cap flushes to the mds. we can probably |
310 | * reduce the tid to 8 bits if we're concerned about inode size. */ | 310 | * reduce the tid to 8 bits if we're concerned about inode size. */ |