author     Yan, Zheng <zheng.z.yan@intel.com>    2013-08-13 00:42:15 -0400
committer  Sage Weil <sage@inktank.com>          2013-08-15 14:12:06 -0400
commit     b0d7c2231015b331b942746610a05b6ea72977ab (patch)
tree       a13e3f015fc3144371550b4f5c25363bd66bdf1f
parent     b150f5c1c759d551da9146435d3dc9df5f7e15ef (diff)
ceph: introduce i_truncate_mutex
I encountered the deadlock below when running fsstress:

  vmtruncate work            truncate                    MDS
  ---------------            ------------------          --------------------------
                             lock i_mutex
                                                         <- truncate file
  lock i_mutex (blocked)
                                                         <- revoking Fcb (filelock to MIX)
                             send request ->
                                                         handle request (xlock filelock)

At the initial time, there are some dirty pages in the page cache.
When the kclient receives the truncate message, it reduces the inode size
and creates some 'out of i_size' dirty pages. The vmtruncate work can't
truncate these dirty pages because it's blocked by the i_mutex. Later,
when the kclient receives the cap message that revokes Fcb caps, it
can't flush all dirty pages because writepages() only flushes dirty
pages within the inode size.

When the MDS handles the 'truncate' request from the kclient, it waits
for the filelock to become stable. But the filelock is stuck in an
unstable state because it can't finish revoking the kclient's Fcb caps.

The truncate pagecache locking has already caused lots of trouble
for us. I think it's time to simplify it by introducing a new mutex.
We use the new mutex to prevent concurrent truncate_inode_pages().
There is no need to worry about a race between buffered write and
truncate_inode_pages(), because our "get caps" mechanism prevents
them from executing concurrently.

Reviewed-by: Sage Weil <sage@inktank.com>
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
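For illustration only, the serialization the new mutex provides can be modeled in plain userspace C: two workers (standing in for __ceph_do_pending_vmtruncate() and ceph_invalidate_work()) both take a dedicated truncate mutex instead of i_mutex, so page-cache truncation never runs concurrently while buffered writers stay off this lock entirely. This is a minimal sketch of the locking pattern, not kernel code; all names below (fake_inode, do_pending_truncate, truncate_worker) are hypothetical stand-ins, not real ceph symbols.

/* Userspace model of the i_truncate_mutex pattern (build with -pthread). */
#include <pthread.h>
#include <stdio.h>

struct fake_inode {
        pthread_mutex_t truncate_mutex;  /* plays the role of i_truncate_mutex */
        pthread_mutex_t state_lock;      /* plays the role of i_ceph_lock */
        int truncate_pending;            /* plays the role of i_truncate_pending */
};

static void do_pending_truncate(struct fake_inode *fi)
{
        /* The dedicated mutex serializes page-cache truncation without i_mutex. */
        pthread_mutex_lock(&fi->truncate_mutex);
        for (;;) {
                pthread_mutex_lock(&fi->state_lock);
                if (fi->truncate_pending == 0) {
                        pthread_mutex_unlock(&fi->state_lock);
                        break;
                }
                fi->truncate_pending--;
                pthread_mutex_unlock(&fi->state_lock);

                /* truncate_inode_pages() would run here, protected only by
                 * truncate_mutex, so a writer holding i_mutex cannot block it. */
                printf("truncating page cache\n");
        }
        pthread_mutex_unlock(&fi->truncate_mutex);
}

static void *truncate_worker(void *arg)
{
        do_pending_truncate(arg);
        return NULL;
}

int main(void)
{
        struct fake_inode fi = { .truncate_pending = 2 };
        pthread_t a, b;

        pthread_mutex_init(&fi.truncate_mutex, NULL);
        pthread_mutex_init(&fi.state_lock, NULL);

        /* Two concurrent workers; the truncate mutex keeps them serialized. */
        pthread_create(&a, NULL, truncate_worker, &fi);
        pthread_create(&b, NULL, truncate_worker, &fi);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}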
-rw-r--r--    fs/ceph/caps.c     4
-rw-r--r--    fs/ceph/file.c     8
-rw-r--r--    fs/ceph/inode.c   39
-rw-r--r--    fs/ceph/super.h    1
4 files changed, 30 insertions, 22 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 430121a795bd..0e94d27fa284 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2072,11 +2072,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 	/* finish pending truncate */
 	while (ci->i_truncate_pending) {
 		spin_unlock(&ci->i_ceph_lock);
-		if (!(need & CEPH_CAP_FILE_WR))
-			mutex_lock(&inode->i_mutex);
 		__ceph_do_pending_vmtruncate(inode);
-		if (!(need & CEPH_CAP_FILE_WR))
-			mutex_unlock(&inode->i_mutex);
 		spin_lock(&ci->i_ceph_lock);
 	}
 
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index bc0735498d29..abc0e0759bdc 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -773,6 +773,13 @@ retry_snap:
			goto retry_snap;
		}
	} else {
+		/*
+		 * No need to acquire the i_truncate_mutex. Because
+		 * the MDS revokes Fwb caps before sending truncate
+		 * message to us. We can't get Fwb cap while there
+		 * are pending vmtruncate. So write and vmtruncate
+		 * can not run at the same time
+		 */
		written = generic_file_buffered_write(iocb, iov, nr_segs,
						      pos, &iocb->ki_pos,
						      count, 0);
@@ -819,7 +826,6 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 	int ret;
 
 	mutex_lock(&inode->i_mutex);
-	__ceph_do_pending_vmtruncate(inode);
 
 	if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
 		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 98b6e50bde04..602ccd8e06b7 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -352,6 +352,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
 		ci->i_nr_by_mode[i] = 0;
 
+	mutex_init(&ci->i_truncate_mutex);
 	ci->i_truncate_seq = 0;
 	ci->i_truncate_size = 0;
 	ci->i_truncate_pending = 0;
@@ -463,16 +464,20 @@ int ceph_fill_file_size(struct inode *inode, int issued,
			dout("truncate_seq %u -> %u\n",
			     ci->i_truncate_seq, truncate_seq);
			ci->i_truncate_seq = truncate_seq;
+
+			/* the MDS should have revoked these caps */
+			WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
+					       CEPH_CAP_FILE_RD |
+					       CEPH_CAP_FILE_WR |
+					       CEPH_CAP_FILE_LAZYIO));
			/*
			 * If we hold relevant caps, or in the case where we're
			 * not the only client referencing this file and we
			 * don't hold those caps, then we need to check whether
			 * the file is either opened or mmaped
			 */
-			if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
-				       CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
-				       CEPH_CAP_FILE_EXCL|
-				       CEPH_CAP_FILE_LAZYIO)) ||
+			if ((issued & (CEPH_CAP_FILE_CACHE|
+				       CEPH_CAP_FILE_BUFFER)) ||
			    mapping_mapped(inode->i_mapping) ||
			    __ceph_caps_file_wanted(ci)) {
				ci->i_truncate_pending++;
@@ -1427,18 +1432,20 @@ static void ceph_invalidate_work(struct work_struct *work)
 	u32 orig_gen;
 	int check = 0;
 
+	mutex_lock(&ci->i_truncate_mutex);
 	spin_lock(&ci->i_ceph_lock);
 	dout("invalidate_pages %p gen %d revoking %d\n", inode,
 	     ci->i_rdcache_gen, ci->i_rdcache_revoking);
 	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
 		/* nevermind! */
 		spin_unlock(&ci->i_ceph_lock);
+		mutex_unlock(&ci->i_truncate_mutex);
 		goto out;
 	}
 	orig_gen = ci->i_rdcache_gen;
 	spin_unlock(&ci->i_ceph_lock);
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages(inode->i_mapping, 0);
 
 	spin_lock(&ci->i_ceph_lock);
 	if (orig_gen == ci->i_rdcache_gen &&
@@ -1453,6 +1460,7 @@ static void ceph_invalidate_work(struct work_struct *work)
			 ci->i_rdcache_revoking);
	}
	spin_unlock(&ci->i_ceph_lock);
+	mutex_unlock(&ci->i_truncate_mutex);
 
	if (check)
		ceph_check_caps(ci, 0, NULL);
@@ -1473,16 +1481,7 @@ static void ceph_vmtruncate_work(struct work_struct *work)
 	struct inode *inode = &ci->vfs_inode;
 
 	dout("vmtruncate_work %p\n", inode);
-	if (!mutex_trylock(&inode->i_mutex)) {
-		/*
-		 * the i_mutex can be hold by a writer who is waiting for
-		 * caps. wake up waiters, they will do pending vmtruncate.
-		 */
-		wake_up_all(&ci->i_cap_wq);
-		mutex_lock(&inode->i_mutex);
-	}
 	__ceph_do_pending_vmtruncate(inode);
-	mutex_unlock(&inode->i_mutex);
 	iput(inode);
 }
 
@@ -1515,11 +1514,13 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
 	u64 to;
 	int wrbuffer_refs, finish = 0;
 
+	mutex_lock(&ci->i_truncate_mutex);
retry:
 	spin_lock(&ci->i_ceph_lock);
 	if (ci->i_truncate_pending == 0) {
 		dout("__do_pending_vmtruncate %p none pending\n", inode);
 		spin_unlock(&ci->i_ceph_lock);
+		mutex_unlock(&ci->i_truncate_mutex);
 		return;
 	}
 
@@ -1536,6 +1537,9 @@ retry:
 		goto retry;
 	}
 
+	/* there should be no reader or writer */
+	WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
+
 	to = ci->i_truncate_size;
 	wrbuffer_refs = ci->i_wrbuffer_ref;
 	dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
@@ -1553,6 +1557,8 @@ retry:
 	if (!finish)
 		goto retry;
 
+	mutex_unlock(&ci->i_truncate_mutex);
+
 	if (wrbuffer_refs == 0)
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 
@@ -1601,8 +1607,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
-	__ceph_do_pending_vmtruncate(inode);
-
 	err = inode_change_ok(inode, attr);
 	if (err != 0)
 		return err;
@@ -1783,7 +1787,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
	     ceph_cap_string(dirtied), mask);
 
	ceph_mdsc_put_request(req);
-	__ceph_do_pending_vmtruncate(inode);
+	if (mask & CEPH_SETATTR_SIZE)
+		__ceph_do_pending_vmtruncate(inode);
	return err;
out:
	spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index afcd62a68916..f1e4e4766ea2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -288,6 +288,7 @@ struct ceph_inode_info {
 
	int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */
 
+	struct mutex i_truncate_mutex;
	u32 i_truncate_seq;        /* last truncate to smaller size */
	u64 i_truncate_size;       /* and the size we last truncated down to */
	int i_truncate_pending;    /* still need to call vmtruncate */