aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/linux-2.6/xfs_trace.h
diff options
context:
space:
mode:
authorDave Chinner <david@fromorbit.com>2010-05-20 22:07:08 -0400
committerAlex Elder <aelder@sgi.com>2010-05-24 11:34:00 -0400
commited3b4d6cdc81e8feefdbfa3c584614be301b6d39 (patch)
tree5b8cd5735dfbc5eb834f96d25a8eb587186715be /fs/xfs/linux-2.6/xfs_trace.h
parent955833cf2ad0aa39b336e853cad212d867199984 (diff)
xfs: Improve scalability of busy extent tracking
When we free a metadata extent, we record it in the per-AG busy extent array so that it is not re-used before the freeing transaction hits the disk. This array is fixed size, so when it overflows we make further allocation transactions synchronous because we cannot track more freed extents until those transactions hit the disk and are completed. Under heavy mixed allocation and freeing workloads with large log buffers, we can overflow this array quite easily. Further, the array is sparsely populated, which means that inserts need to search for a free slot, and array searches often have to search many more slots that are actually used to check all the busy extents. Quite inefficient, really. To enable this aspect of extent freeing to scale better, we need a structure that can grow dynamically. While in other areas of XFS we have used radix trees, the extents being freed are at random locations on disk so are better suited to being indexed by an rbtree. So, use a per-AG rbtree indexed by block number to track busy extents. This incures a memory allocation when marking an extent busy, but should not occur too often in low memory situations. This should scale to an arbitrary number of extents so should not be a limitation for features such as in-memory aggregation of transactions. However, there are still situations where we can't avoid allocating busy extents (such as allocation from the AGFL). To minimise the overhead of such occurences, we need to avoid doing a synchronous log force while holding the AGF locked to ensure that the previous transactions are safely on disk before we use the extent. We can do this by marking the transaction doing the allocation as synchronous rather issuing a log force. Because of the locking involved and the ordering of transactions, the synchronous transaction provides the same guarantees as a synchronous log force because it ensures that all the prior transactions are already on disk when the synchronous transaction hits the disk. i.e. it preserves the free->allocate order of the extent correctly in recovery. By doing this, we avoid holding the AGF locked while log writes are in progress, hence reducing the length of time the lock is held and therefore we increase the rate at which we can allocate and free from the allocation group, thereby increasing overall throughput. The only problem with this approach is that when a metadata buffer is marked stale (e.g. a directory block is removed), then buffer remains pinned and locked until the log goes to disk. The issue here is that if that stale buffer is reallocated in a subsequent transaction, the attempt to lock that buffer in the transaction will hang waiting the log to go to disk to unlock and unpin the buffer. Hence if someone tries to lock a pinned, stale, locked buffer we need to push on the log to get it unlocked ASAP. Effectively we are trading off a guaranteed log force for a much less common trigger for log force to occur. Ideally we should not reallocate busy extents. That is a much more complex fix to the problem as it involves direct intervention in the allocation btree searches in many places. This is left to a future set of modifications. Finally, now that we track busy extents in allocated memory, we don't need the descriptors in the transaction structure to point to them. We can replace the complex busy chunk infrastructure with a simple linked list of busy extents. This allows us to remove a large chunk of code, making the overall change a net reduction in code size. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_trace.h')
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h83
1 files changed, 56 insertions, 27 deletions
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 8a319cfd2901..ff6bc797baf2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
1059 1059
1060); 1060);
1061 1061
1062#define XFS_BUSY_SYNC \
1063 { 0, "async" }, \
1064 { 1, "sync" }
1065
1062TRACE_EVENT(xfs_alloc_busy, 1066TRACE_EVENT(xfs_alloc_busy,
1063 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1067 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1064 xfs_extlen_t len, int slot), 1068 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1065 TP_ARGS(mp, agno, agbno, len, slot), 1069 TP_ARGS(trans, agno, agbno, len, sync),
1066 TP_STRUCT__entry( 1070 TP_STRUCT__entry(
1067 __field(dev_t, dev) 1071 __field(dev_t, dev)
1072 __field(struct xfs_trans *, tp)
1073 __field(int, tid)
1068 __field(xfs_agnumber_t, agno) 1074 __field(xfs_agnumber_t, agno)
1069 __field(xfs_agblock_t, agbno) 1075 __field(xfs_agblock_t, agbno)
1070 __field(xfs_extlen_t, len) 1076 __field(xfs_extlen_t, len)
1071 __field(int, slot) 1077 __field(int, sync)
1072 ), 1078 ),
1073 TP_fast_assign( 1079 TP_fast_assign(
1074 __entry->dev = mp->m_super->s_dev; 1080 __entry->dev = trans->t_mountp->m_super->s_dev;
1081 __entry->tp = trans;
1082 __entry->tid = trans->t_ticket->t_tid;
1075 __entry->agno = agno; 1083 __entry->agno = agno;
1076 __entry->agbno = agbno; 1084 __entry->agbno = agbno;
1077 __entry->len = len; 1085 __entry->len = len;
1078 __entry->slot = slot; 1086 __entry->sync = sync;
1079 ), 1087 ),
1080 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", 1088 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1081 MAJOR(__entry->dev), MINOR(__entry->dev), 1089 MAJOR(__entry->dev), MINOR(__entry->dev),
1090 __entry->tp,
1091 __entry->tid,
1082 __entry->agno, 1092 __entry->agno,
1083 __entry->agbno, 1093 __entry->agbno,
1084 __entry->len, 1094 __entry->len,
1085 __entry->slot) 1095 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1086 1096
1087); 1097);
1088 1098
1089#define XFS_BUSY_STATES \
1090 { 0, "found" }, \
1091 { 1, "missing" }
1092
1093TRACE_EVENT(xfs_alloc_unbusy, 1099TRACE_EVENT(xfs_alloc_unbusy,
1094 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1100 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1095 int slot, int found), 1101 xfs_agblock_t agbno, xfs_extlen_t len),
1096 TP_ARGS(mp, agno, slot, found), 1102 TP_ARGS(mp, agno, agbno, len),
1097 TP_STRUCT__entry( 1103 TP_STRUCT__entry(
1098 __field(dev_t, dev) 1104 __field(dev_t, dev)
1099 __field(xfs_agnumber_t, agno) 1105 __field(xfs_agnumber_t, agno)
1100 __field(int, slot) 1106 __field(xfs_agblock_t, agbno)
1101 __field(int, found) 1107 __field(xfs_extlen_t, len)
1102 ), 1108 ),
1103 TP_fast_assign( 1109 TP_fast_assign(
1104 __entry->dev = mp->m_super->s_dev; 1110 __entry->dev = mp->m_super->s_dev;
1105 __entry->agno = agno; 1111 __entry->agno = agno;
1106 __entry->slot = slot; 1112 __entry->agbno = agbno;
1107 __entry->found = found; 1113 __entry->len = len;
1108 ), 1114 ),
1109 TP_printk("dev %d:%d agno %u slot %d %s", 1115 TP_printk("dev %d:%d agno %u agbno %u len %u",
1110 MAJOR(__entry->dev), MINOR(__entry->dev), 1116 MAJOR(__entry->dev), MINOR(__entry->dev),
1111 __entry->agno, 1117 __entry->agno,
1112 __entry->slot, 1118 __entry->agbno,
1113 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1119 __entry->len)
1114); 1120);
1115 1121
1122#define XFS_BUSY_STATES \
1123 { 0, "missing" }, \
1124 { 1, "found" }
1125
1116TRACE_EVENT(xfs_alloc_busysearch, 1126TRACE_EVENT(xfs_alloc_busysearch,
1117 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1127 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1118 xfs_extlen_t len, xfs_lsn_t lsn), 1128 xfs_agblock_t agbno, xfs_extlen_t len, int found),
1119 TP_ARGS(mp, agno, agbno, len, lsn), 1129 TP_ARGS(mp, agno, agbno, len, found),
1120 TP_STRUCT__entry( 1130 TP_STRUCT__entry(
1121 __field(dev_t, dev) 1131 __field(dev_t, dev)
1122 __field(xfs_agnumber_t, agno) 1132 __field(xfs_agnumber_t, agno)
1123 __field(xfs_agblock_t, agbno) 1133 __field(xfs_agblock_t, agbno)
1124 __field(xfs_extlen_t, len) 1134 __field(xfs_extlen_t, len)
1125 __field(xfs_lsn_t, lsn) 1135 __field(int, found)
1126 ), 1136 ),
1127 TP_fast_assign( 1137 TP_fast_assign(
1128 __entry->dev = mp->m_super->s_dev; 1138 __entry->dev = mp->m_super->s_dev;
1129 __entry->agno = agno; 1139 __entry->agno = agno;
1130 __entry->agbno = agbno; 1140 __entry->agbno = agbno;
1131 __entry->len = len; 1141 __entry->len = len;
1132 __entry->lsn = lsn; 1142 __entry->found = found;
1133 ), 1143 ),
1134 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", 1144 TP_printk("dev %d:%d agno %u agbno %u len %u %s",
1135 MAJOR(__entry->dev), MINOR(__entry->dev), 1145 MAJOR(__entry->dev), MINOR(__entry->dev),
1136 __entry->agno, 1146 __entry->agno,
1137 __entry->agbno, 1147 __entry->agbno,
1138 __entry->len, 1148 __entry->len,
1149 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1150);
1151
1152TRACE_EVENT(xfs_trans_commit_lsn,
1153 TP_PROTO(struct xfs_trans *trans),
1154 TP_ARGS(trans),
1155 TP_STRUCT__entry(
1156 __field(dev_t, dev)
1157 __field(struct xfs_trans *, tp)
1158 __field(xfs_lsn_t, lsn)
1159 ),
1160 TP_fast_assign(
1161 __entry->dev = trans->t_mountp->m_super->s_dev;
1162 __entry->tp = trans;
1163 __entry->lsn = trans->t_commit_lsn;
1164 ),
1165 TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
1166 MAJOR(__entry->dev), MINOR(__entry->dev),
1167 __entry->tp,
1139 __entry->lsn) 1168 __entry->lsn)
1140); 1169);
1141 1170