summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGang He <ghe@suse.com>2019-03-05 18:41:45 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2019-03-06 00:07:13 -0500
commit5500ab4ed3b8f0749ec584d8c5e2738bc01ea52e (patch)
tree9d7d80510e0cef57c6e2fa117985e2c570e27c71
parentcc725ef3cb202ef2019a3c67c8913efa05c3cce6 (diff)
ocfs2: fix the application IO timeout when fstrim is running
A user reported that upper-layer application IO timed out while fstrim was running on an ocfs2 partition. The application-monitoring resource agent concluded that the application was not working, and the node was then fenced by the cluster brain (e.g. pacemaker). The root cause is that the fstrim thread holds the main_bm meta-file related locks until all the cluster groups are trimmed. This patch makes the fstrim thread release the main_bm meta-file related locks after each cluster group is trimmed, which gives concurrent application IO a chance to claim clusters from the main_bm meta-file. Link: http://lkml.kernel.org/r/20190111090014.31645-1-ghe@suse.com Signed-off-by: Gang He <ghe@suse.com> Reviewed-by: Changwei Ge <ge.changwei@h3c.com> Cc: Mark Fasheh <mfasheh@versity.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Joseph Qi <joseph.qi@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/ocfs2/alloc.c159
-rw-r--r--fs/ocfs2/dlmglue.c5
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/super.c2
5 files changed, 106 insertions, 63 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d1cbb27808e2..6f0999015a44 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
7532 return count; 7532 return count;
7533} 7533}
7534 7534
7535int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) 7535static
7536int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
7536{ 7537{
7537 struct ocfs2_super *osb = OCFS2_SB(sb); 7538 struct ocfs2_super *osb = OCFS2_SB(sb);
7538 u64 start, len, trimmed, first_group, last_group, group; 7539 u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
7539 int ret, cnt; 7540 int ret, cnt;
7540 u32 first_bit, last_bit, minlen; 7541 u32 first_bit, last_bit, minlen;
7541 struct buffer_head *main_bm_bh = NULL; 7542 struct buffer_head *main_bm_bh = NULL;
@@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7543 struct buffer_head *gd_bh = NULL; 7544 struct buffer_head *gd_bh = NULL;
7544 struct ocfs2_dinode *main_bm; 7545 struct ocfs2_dinode *main_bm;
7545 struct ocfs2_group_desc *gd = NULL; 7546 struct ocfs2_group_desc *gd = NULL;
7546 struct ocfs2_trim_fs_info info, *pinfo = NULL;
7547 7547
7548 start = range->start >> osb->s_clustersize_bits; 7548 start = range->start >> osb->s_clustersize_bits;
7549 len = range->len >> osb->s_clustersize_bits; 7549 len = range->len >> osb->s_clustersize_bits;
@@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7552 if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) 7552 if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
7553 return -EINVAL; 7553 return -EINVAL;
7554 7554
7555 trace_ocfs2_trim_mainbm(start, len, minlen);
7556
7557next_group:
7555 main_bm_inode = ocfs2_get_system_file_inode(osb, 7558 main_bm_inode = ocfs2_get_system_file_inode(osb,
7556 GLOBAL_BITMAP_SYSTEM_INODE, 7559 GLOBAL_BITMAP_SYSTEM_INODE,
7557 OCFS2_INVALID_SLOT); 7560 OCFS2_INVALID_SLOT);
@@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7570 } 7573 }
7571 main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; 7574 main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
7572 7575
7573 if (start >= le32_to_cpu(main_bm->i_clusters)) { 7576 /*
7574 ret = -EINVAL; 7577 * Do some check before trim the first group.
7575 goto out_unlock; 7578 */
7576 } 7579 if (!group) {
7577 7580 if (start >= le32_to_cpu(main_bm->i_clusters)) {
7578 len = range->len >> osb->s_clustersize_bits; 7581 ret = -EINVAL;
7579 if (start + len > le32_to_cpu(main_bm->i_clusters))
7580 len = le32_to_cpu(main_bm->i_clusters) - start;
7581
7582 trace_ocfs2_trim_fs(start, len, minlen);
7583
7584 ocfs2_trim_fs_lock_res_init(osb);
7585 ret = ocfs2_trim_fs_lock(osb, NULL, 1);
7586 if (ret < 0) {
7587 if (ret != -EAGAIN) {
7588 mlog_errno(ret);
7589 ocfs2_trim_fs_lock_res_uninit(osb);
7590 goto out_unlock; 7582 goto out_unlock;
7591 } 7583 }
7592 7584
7593 mlog(ML_NOTICE, "Wait for trim on device (%s) to " 7585 if (start + len > le32_to_cpu(main_bm->i_clusters))
7594 "finish, which is running from another node.\n", 7586 len = le32_to_cpu(main_bm->i_clusters) - start;
7595 osb->dev_str);
7596 ret = ocfs2_trim_fs_lock(osb, &info, 0);
7597 if (ret < 0) {
7598 mlog_errno(ret);
7599 ocfs2_trim_fs_lock_res_uninit(osb);
7600 goto out_unlock;
7601 }
7602 7587
7603 if (info.tf_valid && info.tf_success && 7588 /*
7604 info.tf_start == start && info.tf_len == len && 7589 * Determine first and last group to examine based on
7605 info.tf_minlen == minlen) { 7590 * start and len
7606 /* Avoid sending duplicated trim to a shared device */ 7591 */
7607 mlog(ML_NOTICE, "The same trim on device (%s) was " 7592 first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7608 "just done from node (%u), return.\n", 7593 if (first_group == osb->first_cluster_group_blkno)
7609 osb->dev_str, info.tf_nodenum); 7594 first_bit = start;
7610 range->len = info.tf_trimlen; 7595 else
7611 goto out_trimunlock; 7596 first_bit = start - ocfs2_blocks_to_clusters(sb,
7612 } 7597 first_group);
7598 last_group = ocfs2_which_cluster_group(main_bm_inode,
7599 start + len - 1);
7600 group = first_group;
7613 } 7601 }
7614 7602
7615 info.tf_nodenum = osb->node_num; 7603 do {
7616 info.tf_start = start;
7617 info.tf_len = len;
7618 info.tf_minlen = minlen;
7619
7620 /* Determine first and last group to examine based on start and len */
7621 first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7622 if (first_group == osb->first_cluster_group_blkno)
7623 first_bit = start;
7624 else
7625 first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
7626 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7627 last_bit = osb->bitmap_cpg;
7628
7629 trimmed = 0;
7630 for (group = first_group; group <= last_group;) {
7631 if (first_bit + len >= osb->bitmap_cpg) 7604 if (first_bit + len >= osb->bitmap_cpg)
7632 last_bit = osb->bitmap_cpg; 7605 last_bit = osb->bitmap_cpg;
7633 else 7606 else
@@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7659 group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); 7632 group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7660 else 7633 else
7661 group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); 7634 group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7662 } 7635 } while (0);
7663 range->len = trimmed * sb->s_blocksize;
7664 7636
7665 info.tf_trimlen = range->len;
7666 info.tf_success = (ret ? 0 : 1);
7667 pinfo = &info;
7668out_trimunlock:
7669 ocfs2_trim_fs_unlock(osb, pinfo);
7670 ocfs2_trim_fs_lock_res_uninit(osb);
7671out_unlock: 7637out_unlock:
7672 ocfs2_inode_unlock(main_bm_inode, 0); 7638 ocfs2_inode_unlock(main_bm_inode, 0);
7673 brelse(main_bm_bh); 7639 brelse(main_bm_bh);
7640 main_bm_bh = NULL;
7674out_mutex: 7641out_mutex:
7675 inode_unlock(main_bm_inode); 7642 inode_unlock(main_bm_inode);
7676 iput(main_bm_inode); 7643 iput(main_bm_inode);
7644
7645 /*
7646 * If all the groups trim are not done or failed, but we should release
7647 * main_bm related locks for avoiding the current IO starve, then go to
7648 * trim the next group
7649 */
7650 if (ret >= 0 && group <= last_group)
7651 goto next_group;
7677out: 7652out:
7653 range->len = trimmed * sb->s_blocksize;
7654 return ret;
7655}
7656
7657int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7658{
7659 int ret;
7660 struct ocfs2_super *osb = OCFS2_SB(sb);
7661 struct ocfs2_trim_fs_info info, *pinfo = NULL;
7662
7663 ocfs2_trim_fs_lock_res_init(osb);
7664
7665 trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
7666
7667 ret = ocfs2_trim_fs_lock(osb, NULL, 1);
7668 if (ret < 0) {
7669 if (ret != -EAGAIN) {
7670 mlog_errno(ret);
7671 ocfs2_trim_fs_lock_res_uninit(osb);
7672 return ret;
7673 }
7674
7675 mlog(ML_NOTICE, "Wait for trim on device (%s) to "
7676 "finish, which is running from another node.\n",
7677 osb->dev_str);
7678 ret = ocfs2_trim_fs_lock(osb, &info, 0);
7679 if (ret < 0) {
7680 mlog_errno(ret);
7681 ocfs2_trim_fs_lock_res_uninit(osb);
7682 return ret;
7683 }
7684
7685 if (info.tf_valid && info.tf_success &&
7686 info.tf_start == range->start &&
7687 info.tf_len == range->len &&
7688 info.tf_minlen == range->minlen) {
7689 /* Avoid sending duplicated trim to a shared device */
7690 mlog(ML_NOTICE, "The same trim on device (%s) was "
7691 "just done from node (%u), return.\n",
7692 osb->dev_str, info.tf_nodenum);
7693 range->len = info.tf_trimlen;
7694 goto out;
7695 }
7696 }
7697
7698 info.tf_nodenum = osb->node_num;
7699 info.tf_start = range->start;
7700 info.tf_len = range->len;
7701 info.tf_minlen = range->minlen;
7702
7703 ret = ocfs2_trim_mainbm(sb, range);
7704
7705 info.tf_trimlen = range->len;
7706 info.tf_success = (ret < 0 ? 0 : 1);
7707 pinfo = &info;
7708out:
7709 ocfs2_trim_fs_unlock(osb, pinfo);
7710 ocfs2_trim_fs_lock_res_uninit(osb);
7678 return ret; 7711 return ret;
7679} 7712}
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7c835824247e..af405586c5b1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
686{ 686{
687 struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; 687 struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
688 688
689 /* Only one trimfs thread are allowed to work at the same time. */
690 mutex_lock(&osb->obs_trim_fs_mutex);
691
689 ocfs2_lock_res_init_once(lockres); 692 ocfs2_lock_res_init_once(lockres);
690 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); 693 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
691 ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, 694 ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
698 701
699 ocfs2_simple_drop_lockres(osb, lockres); 702 ocfs2_simple_drop_lockres(osb, lockres);
700 ocfs2_lock_res_free(lockres); 703 ocfs2_lock_res_free(lockres);
704
705 mutex_unlock(&osb->obs_trim_fs_mutex);
701} 706}
702 707
703static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, 708static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4f86ac0027b5..1f029fbe8b8d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -407,6 +407,7 @@ struct ocfs2_super
407 struct ocfs2_lock_res osb_rename_lockres; 407 struct ocfs2_lock_res osb_rename_lockres;
408 struct ocfs2_lock_res osb_nfs_sync_lockres; 408 struct ocfs2_lock_res osb_nfs_sync_lockres;
409 struct ocfs2_lock_res osb_trim_fs_lockres; 409 struct ocfs2_lock_res osb_trim_fs_lockres;
410 struct mutex obs_trim_fs_mutex;
410 struct ocfs2_dlm_debug *osb_dlm_debug; 411 struct ocfs2_dlm_debug *osb_dlm_debug;
411 412
412 struct dentry *osb_debug_root; 413 struct dentry *osb_debug_root;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 2ee76a90ba8f..dc4bce1649c1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
712 712
713DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); 713DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
714 714
715DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
716
715DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); 717DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
716 718
717/* End of trace events for fs/ocfs2/alloc.c. */ 719/* End of trace events for fs/ocfs2/alloc.c. */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3415e0b09398..96ae7cedd487 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
1847 if (ocfs2_is_hard_readonly(osb)) 1847 if (ocfs2_is_hard_readonly(osb))
1848 goto leave; 1848 goto leave;
1849 1849
1850 mutex_init(&osb->obs_trim_fs_mutex);
1851
1850 status = ocfs2_dlm_init(osb); 1852 status = ocfs2_dlm_init(osb);
1851 if (status < 0) { 1853 if (status < 0) {
1852 mlog_errno(status); 1854 mlog_errno(status);