aboutsummaryrefslogtreecommitdiffstats
path: root/fs/gfs2/inode.c
diff options
context:
space:
mode:
authorBob Peterson <rpeterso@redhat.com>2019-08-30 13:31:02 -0400
committerAndreas Gruenbacher <agruenba@redhat.com>2019-09-04 14:22:17 -0400
commitad26967b9afa7faee22c3b79370cb5d9ab553493 (patch)
tree5062d7135c924b2fade3f01828750e4196382838 /fs/gfs2/inode.c
parent01123cf17cfa7c8c30109bdcf2f913f1e63ff97b (diff)
gfs2: Use async glocks for rename
Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes end up waiting for each other. There can be several layers of directory dependencies across many nodes. This patch fixes the problem by acquiring all gfs2_rename's inode glocks asynchronously and waiting for all glocks to be acquired. That way all inodes are locked regardless of the order. The timeout value for multiple asynchronous glocks is calculated to be the total of the individual wait times for each glock times two. Since gfs2_exchange is very similar to gfs2_rename, both functions are patched in the same way. A new async glock wait queue, sd_async_glock_wait, keeps a list of waiters for these events. If gfs2's holder_wake function detects an async holder, it wakes up any waiters for the event. The waiter only tests whether any of its requests are still pending. Since the glocks are sent to dlm asynchronously, the wait function needs to check to see which glocks, if any, were granted. If a glock is granted by dlm (and therefore held), its minimum hold time is checked and adjusted as necessary, as other glock grants do. If the event times out, all glocks held thus far must be dequeued to resolve any existing deadlocks. Then, if there are any outstanding locking requests, we need to loop around and wait for dlm to respond to those requests too. After we release all requests, we return -ESTALE to the caller (vfs rename) which loops around and retries the request. Node1 Node2 --------- --------- 1. Enqueue A Enqueue B 2. Enqueue B Enqueue A 3. A granted 6. B granted 7. Wait for B 8. Wait for A 9. A times out (since Node 1 holds A) 10. Dequeue B (since it was granted) 11. Wait for all requests from DLM 12. B Granted (since Node2 released it in step 10) 13. Rename 14. Dequeue A 15. DLM Grants A 16. 
Dequeue A (due to the timeout and since we no longer have B held for our task). 17. Dequeue B 18. Return -ESTALE to vfs 19. VFS retries the operation, goto step 1. This release-all-locks / acquire-all-locks may slow rename / exchange down as both nodes struggle in the same way and do the same thing. However, this will only happen when there is contention for the same inodes, which ought to be rare. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Diffstat (limited to 'fs/gfs2/inode.c')
-rw-r--r--fs/gfs2/inode.c34
1 files changed, 23 insertions, 11 deletions
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 50eeb15c6f4f..e1e18fb587eb 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1388,16 +1388,18 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1388 } 1388 }
1389 1389
1390 num_gh = 1; 1390 num_gh = 1;
1391 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 1391 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs);
1392 if (odip != ndip) { 1392 if (odip != ndip) {
1393 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); 1393 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE,GL_ASYNC,
1394 ghs + num_gh);
1394 num_gh++; 1395 num_gh++;
1395 } 1396 }
1396 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); 1397 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh);
1397 num_gh++; 1398 num_gh++;
1398 1399
1399 if (nip) { 1400 if (nip) {
1400 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); 1401 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC,
1402 ghs + num_gh);
1401 num_gh++; 1403 num_gh++;
1402 } 1404 }
1403 1405
@@ -1406,6 +1408,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1406 if (error) 1408 if (error)
1407 goto out_gunlock; 1409 goto out_gunlock;
1408 } 1410 }
1411 error = gfs2_glock_async_wait(num_gh, ghs);
1412 if (error)
1413 goto out_gunlock;
1409 1414
1410 if (nip) { 1415 if (nip) {
1411 /* Grab the resource group glock for unlink flag twiddling. 1416 /* Grab the resource group glock for unlink flag twiddling.
@@ -1555,7 +1560,8 @@ out_gunlock:
1555 gfs2_glock_dq_uninit(&rd_gh); 1560 gfs2_glock_dq_uninit(&rd_gh);
1556 1561
1557 while (x--) { 1562 while (x--) {
1558 gfs2_glock_dq(ghs + x); 1563 if (gfs2_holder_queued(ghs + x))
1564 gfs2_glock_dq(ghs + x);
1559 gfs2_holder_uninit(ghs + x); 1565 gfs2_holder_uninit(ghs + x);
1560 } 1566 }
1561out_gunlock_r: 1567out_gunlock_r:
@@ -1585,7 +1591,7 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry,
1585 struct gfs2_inode *oip = GFS2_I(odentry->d_inode); 1591 struct gfs2_inode *oip = GFS2_I(odentry->d_inode);
1586 struct gfs2_inode *nip = GFS2_I(ndentry->d_inode); 1592 struct gfs2_inode *nip = GFS2_I(ndentry->d_inode);
1587 struct gfs2_sbd *sdp = GFS2_SB(odir); 1593 struct gfs2_sbd *sdp = GFS2_SB(odir);
1588 struct gfs2_holder ghs[5], r_gh; 1594 struct gfs2_holder ghs[4], r_gh;
1589 unsigned int num_gh; 1595 unsigned int num_gh;
1590 unsigned int x; 1596 unsigned int x;
1591 umode_t old_mode = oip->i_inode.i_mode; 1597 umode_t old_mode = oip->i_inode.i_mode;
@@ -1619,15 +1625,16 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry,
1619 } 1625 }
1620 1626
1621 num_gh = 1; 1627 num_gh = 1;
1622 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 1628 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs);
1623 if (odip != ndip) { 1629 if (odip != ndip) {
1624 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); 1630 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC,
1631 ghs + num_gh);
1625 num_gh++; 1632 num_gh++;
1626 } 1633 }
1627 gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); 1634 gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh);
1628 num_gh++; 1635 num_gh++;
1629 1636
1630 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); 1637 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh);
1631 num_gh++; 1638 num_gh++;
1632 1639
1633 for (x = 0; x < num_gh; x++) { 1640 for (x = 0; x < num_gh; x++) {
@@ -1636,6 +1643,10 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry,
1636 goto out_gunlock; 1643 goto out_gunlock;
1637 } 1644 }
1638 1645
1646 error = gfs2_glock_async_wait(num_gh, ghs);
1647 if (error)
1648 goto out_gunlock;
1649
1639 error = -ENOENT; 1650 error = -ENOENT;
1640 if (oip->i_inode.i_nlink == 0 || nip->i_inode.i_nlink == 0) 1651 if (oip->i_inode.i_nlink == 0 || nip->i_inode.i_nlink == 0)
1641 goto out_gunlock; 1652 goto out_gunlock;
@@ -1696,7 +1707,8 @@ out_end_trans:
1696 gfs2_trans_end(sdp); 1707 gfs2_trans_end(sdp);
1697out_gunlock: 1708out_gunlock:
1698 while (x--) { 1709 while (x--) {
1699 gfs2_glock_dq(ghs + x); 1710 if (gfs2_holder_queued(ghs + x))
1711 gfs2_glock_dq(ghs + x);
1700 gfs2_holder_uninit(ghs + x); 1712 gfs2_holder_uninit(ghs + x);
1701 } 1713 }
1702out_gunlock_r: 1714out_gunlock_r: