diff options
author | David Chinner <dgc@sgi.com> | 2008-04-09 22:18:39 -0400 |
---|---|---|
committer | Lachlan McIlroy <lachlan@redback.melbourne.sgi.com> | 2008-04-17 21:50:22 -0400 |
commit | 114d23aae51233b2bc62d8e2a632bcb55de1953d (patch) | |
tree | 39aa3e7e6dd32c39a416e34c4a663cb329315685 | |
parent | 2abdb8c88110bab78bfe17e51346e735560daa02 (diff) |
[XFS] Per iclog callback chain lock
Rather than use the icloglock for protecting the iclog completion callback
chain, use a new per-iclog lock so that walking the callback chain doesn't
require holding a global lock.
This reduces contention on the icloglock during transaction commit and log
I/O completion by reducing the number of times we need to hold the global
icloglock during these operations.
SGI-PV: 978729
SGI-Modid: xfs-linux-melb:xfs-kern:30770a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
-rw-r--r-- | fs/xfs/xfs_log.c | 35 | ||||
-rw-r--r-- | fs/xfs/xfs_log_priv.h | 33 |
2 files changed, 45 insertions, 23 deletions
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 1fa980933895..7a5b12d93537 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c | |||
@@ -397,12 +397,10 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ | |||
397 | void *iclog_hndl, /* iclog to hang callback off */ | 397 | void *iclog_hndl, /* iclog to hang callback off */ |
398 | xfs_log_callback_t *cb) | 398 | xfs_log_callback_t *cb) |
399 | { | 399 | { |
400 | xlog_t *log = mp->m_log; | ||
401 | xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; | 400 | xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; |
402 | int abortflg; | 401 | int abortflg; |
403 | 402 | ||
404 | cb->cb_next = NULL; | 403 | spin_lock(&iclog->ic_callback_lock); |
405 | spin_lock(&log->l_icloglock); | ||
406 | abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); | 404 | abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); |
407 | if (!abortflg) { | 405 | if (!abortflg) { |
408 | ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || | 406 | ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || |
@@ -411,7 +409,7 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ | |||
411 | *(iclog->ic_callback_tail) = cb; | 409 | *(iclog->ic_callback_tail) = cb; |
412 | iclog->ic_callback_tail = &(cb->cb_next); | 410 | iclog->ic_callback_tail = &(cb->cb_next); |
413 | } | 411 | } |
414 | spin_unlock(&log->l_icloglock); | 412 | spin_unlock(&iclog->ic_callback_lock); |
415 | return abortflg; | 413 | return abortflg; |
416 | } /* xfs_log_notify */ | 414 | } /* xfs_log_notify */ |
417 | 415 | ||
@@ -1257,6 +1255,8 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
1257 | iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; | 1255 | iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; |
1258 | iclog->ic_state = XLOG_STATE_ACTIVE; | 1256 | iclog->ic_state = XLOG_STATE_ACTIVE; |
1259 | iclog->ic_log = log; | 1257 | iclog->ic_log = log; |
1258 | atomic_set(&iclog->ic_refcnt, 0); | ||
1259 | spin_lock_init(&iclog->ic_callback_lock); | ||
1260 | iclog->ic_callback_tail = &(iclog->ic_callback); | 1260 | iclog->ic_callback_tail = &(iclog->ic_callback); |
1261 | iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; | 1261 | iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; |
1262 | 1262 | ||
@@ -1987,7 +1987,7 @@ xlog_state_clean_log(xlog_t *log) | |||
1987 | if (iclog->ic_state == XLOG_STATE_DIRTY) { | 1987 | if (iclog->ic_state == XLOG_STATE_DIRTY) { |
1988 | iclog->ic_state = XLOG_STATE_ACTIVE; | 1988 | iclog->ic_state = XLOG_STATE_ACTIVE; |
1989 | iclog->ic_offset = 0; | 1989 | iclog->ic_offset = 0; |
1990 | iclog->ic_callback = NULL; /* don't need to free */ | 1990 | ASSERT(iclog->ic_callback == NULL); |
1991 | /* | 1991 | /* |
1992 | * If the number of ops in this iclog indicate it just | 1992 | * If the number of ops in this iclog indicate it just |
1993 | * contains the dummy transaction, we can | 1993 | * contains the dummy transaction, we can |
@@ -2190,37 +2190,40 @@ xlog_state_do_callback( | |||
2190 | be64_to_cpu(iclog->ic_header.h_lsn); | 2190 | be64_to_cpu(iclog->ic_header.h_lsn); |
2191 | spin_unlock(&log->l_grant_lock); | 2191 | spin_unlock(&log->l_grant_lock); |
2192 | 2192 | ||
2193 | /* | ||
2194 | * Keep processing entries in the callback list | ||
2195 | * until we come around and it is empty. We | ||
2196 | * need to atomically see that the list is | ||
2197 | * empty and change the state to DIRTY so that | ||
2198 | * we don't miss any more callbacks being added. | ||
2199 | */ | ||
2200 | spin_lock(&log->l_icloglock); | ||
2201 | } else { | 2193 | } else { |
2194 | spin_unlock(&log->l_icloglock); | ||
2202 | ioerrors++; | 2195 | ioerrors++; |
2203 | } | 2196 | } |
2204 | cb = iclog->ic_callback; | ||
2205 | 2197 | ||
2198 | /* | ||
2199 | * Keep processing entries in the callback list until | ||
2200 | * we come around and it is empty. We need to | ||
2201 | * atomically see that the list is empty and change the | ||
2202 | * state to DIRTY so that we don't miss any more | ||
2203 | * callbacks being added. | ||
2204 | */ | ||
2205 | spin_lock(&iclog->ic_callback_lock); | ||
2206 | cb = iclog->ic_callback; | ||
2206 | while (cb) { | 2207 | while (cb) { |
2207 | iclog->ic_callback_tail = &(iclog->ic_callback); | 2208 | iclog->ic_callback_tail = &(iclog->ic_callback); |
2208 | iclog->ic_callback = NULL; | 2209 | iclog->ic_callback = NULL; |
2209 | spin_unlock(&log->l_icloglock); | 2210 | spin_unlock(&iclog->ic_callback_lock); |
2210 | 2211 | ||
2211 | /* perform callbacks in the order given */ | 2212 | /* perform callbacks in the order given */ |
2212 | for (; cb; cb = cb_next) { | 2213 | for (; cb; cb = cb_next) { |
2213 | cb_next = cb->cb_next; | 2214 | cb_next = cb->cb_next; |
2214 | cb->cb_func(cb->cb_arg, aborted); | 2215 | cb->cb_func(cb->cb_arg, aborted); |
2215 | } | 2216 | } |
2216 | spin_lock(&log->l_icloglock); | 2217 | spin_lock(&iclog->ic_callback_lock); |
2217 | cb = iclog->ic_callback; | 2218 | cb = iclog->ic_callback; |
2218 | } | 2219 | } |
2219 | 2220 | ||
2220 | loopdidcallbacks++; | 2221 | loopdidcallbacks++; |
2221 | funcdidcallbacks++; | 2222 | funcdidcallbacks++; |
2222 | 2223 | ||
2224 | spin_lock(&log->l_icloglock); | ||
2223 | ASSERT(iclog->ic_callback == NULL); | 2225 | ASSERT(iclog->ic_callback == NULL); |
2226 | spin_unlock(&iclog->ic_callback_lock); | ||
2224 | if (!(iclog->ic_state & XLOG_STATE_IOERROR)) | 2227 | if (!(iclog->ic_state & XLOG_STATE_IOERROR)) |
2225 | iclog->ic_state = XLOG_STATE_DIRTY; | 2228 | iclog->ic_state = XLOG_STATE_DIRTY; |
2226 | 2229 | ||
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 01c63db25a1d..104b623aa082 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h | |||
@@ -324,6 +324,19 @@ typedef struct xlog_rec_ext_header { | |||
324 | * - ic_offset is the current number of bytes written to in this iclog. | 324 | * - ic_offset is the current number of bytes written to in this iclog. |
325 | * - ic_refcnt is bumped when someone is writing to the log. | 325 | * - ic_refcnt is bumped when someone is writing to the log. |
326 | * - ic_state is the state of the iclog. | 326 | * - ic_state is the state of the iclog. |
327 | * | ||
328 | * Because of cacheline contention on large machines, we need to separate | ||
329 | * various resources onto different cachelines. To start with, make the | ||
330 | * structure cacheline aligned. The following fields can be contended on | ||
331 | * by independent processes: | ||
332 | * | ||
333 | * - ic_callback_* | ||
334 | * - ic_refcnt | ||
335 | * - fields protected by the global l_icloglock | ||
336 | * | ||
337 | * so we need to ensure that these fields are located in separate cachelines. | ||
338 | * We'll put all the read-only and l_icloglock fields in the first cacheline, | ||
339 | * and move everything else out to subsequent cachelines. | ||
327 | */ | 340 | */ |
328 | typedef struct xlog_iclog_fields { | 341 | typedef struct xlog_iclog_fields { |
329 | sv_t ic_forcesema; | 342 | sv_t ic_forcesema; |
@@ -332,18 +345,23 @@ typedef struct xlog_iclog_fields { | |||
332 | struct xlog_in_core *ic_prev; | 345 | struct xlog_in_core *ic_prev; |
333 | struct xfs_buf *ic_bp; | 346 | struct xfs_buf *ic_bp; |
334 | struct log *ic_log; | 347 | struct log *ic_log; |
335 | xfs_log_callback_t *ic_callback; | ||
336 | xfs_log_callback_t **ic_callback_tail; | ||
337 | #ifdef XFS_LOG_TRACE | ||
338 | struct ktrace *ic_trace; | ||
339 | #endif | ||
340 | int ic_size; | 348 | int ic_size; |
341 | int ic_offset; | 349 | int ic_offset; |
342 | atomic_t ic_refcnt; | ||
343 | int ic_bwritecnt; | 350 | int ic_bwritecnt; |
344 | ushort_t ic_state; | 351 | ushort_t ic_state; |
345 | char *ic_datap; /* pointer to iclog data */ | 352 | char *ic_datap; /* pointer to iclog data */ |
346 | } xlog_iclog_fields_t; | 353 | #ifdef XFS_LOG_TRACE |
354 | struct ktrace *ic_trace; | ||
355 | #endif | ||
356 | |||
357 | /* Callback structures need their own cacheline */ | ||
358 | spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; | ||
359 | xfs_log_callback_t *ic_callback; | ||
360 | xfs_log_callback_t **ic_callback_tail; | ||
361 | |||
362 | /* reference counts need their own cacheline */ | ||
363 | atomic_t ic_refcnt ____cacheline_aligned_in_smp; | ||
364 | } xlog_iclog_fields_t ____cacheline_aligned_in_smp; | ||
347 | 365 | ||
348 | typedef union xlog_in_core2 { | 366 | typedef union xlog_in_core2 { |
349 | xlog_rec_header_t hic_header; | 367 | xlog_rec_header_t hic_header; |
@@ -366,6 +384,7 @@ typedef struct xlog_in_core { | |||
366 | #define ic_bp hic_fields.ic_bp | 384 | #define ic_bp hic_fields.ic_bp |
367 | #define ic_log hic_fields.ic_log | 385 | #define ic_log hic_fields.ic_log |
368 | #define ic_callback hic_fields.ic_callback | 386 | #define ic_callback hic_fields.ic_callback |
387 | #define ic_callback_lock hic_fields.ic_callback_lock | ||
369 | #define ic_callback_tail hic_fields.ic_callback_tail | 388 | #define ic_callback_tail hic_fields.ic_callback_tail |
370 | #define ic_trace hic_fields.ic_trace | 389 | #define ic_trace hic_fields.ic_trace |
371 | #define ic_size hic_fields.ic_size | 390 | #define ic_size hic_fields.ic_size |