author     Linus Torvalds <torvalds@linux-foundation.org>  2012-05-24 17:14:46 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-05-24 17:14:46 -0400
commit     9978306e31a8f89bd81fbc4c49fd9aefb1d30d10 (patch)
tree       85bbd03336a82d20a00761ed35eb05536936b881
parent     abe81e25f08abbac493754a043f7a91a1b3e0f93 (diff)
parent     14c26c6a05de138a4fd9a0c05ff8e7435a618324 (diff)
Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs
Pull XFS update from Ben Myers:
 - Removal of xfsbufd
 - Background CIL flushes have been moved to a workqueue.
 - Fix to xfs_check_page_type applicable to filesystems where blocksize < page size
 - Fix for stale data exposure when extsize hints are used.
 - A series of xfs_buf cache cleanups.
 - Fix for XFS_IOC_ALLOCSP
 - Cleanups for includes and removal of xfs_lrw.[ch].
 - Moved all busy extent handling to its own file so that it is easier to merge with userspace.
 - Fix for log mount failure.
 - Fix to enable inode reclaim during quotacheck at mount time.
 - Fix for delalloc quota accounting.
 - Fix for memory reclaim deadlock on agi buffer.
 - Fixes for failed writes and to clean up stale delalloc blocks.
 - Fix to use GFP_NOFS in blkdev_issue_flush
 - SEEK_DATA/SEEK_HOLE support

* 'for-linus' of git://oss.sgi.com/xfs/xfs: (57 commits)
  xfs: add trace points for log forces
  xfs: fix memory reclaim deadlock on agi buffer
  xfs: fix delalloc quota accounting on failure
  xfs: protect xfs_sync_worker with s_umount semaphore
  xfs: introduce SEEK_DATA/SEEK_HOLE support
  xfs: make xfs_extent_busy_trim not static
  xfs: make XBF_MAPPED the default behaviour
  xfs: flush outstanding buffers on log mount failure
  xfs: Properly exclude IO type flags from buffer flags
  xfs: clean up xfs_bit.h includes
  xfs: move xfs_do_force_shutdown() and kill xfs_rw.c
  xfs: move xfs_get_extsz_hint() and kill xfs_rw.h
  xfs: move xfs_fsb_to_db to xfs_bmap.h
  xfs: clean up busy extent naming
  xfs: move busy extent handling to it's own file
  xfs: move xfsagino_t to xfs_types.h
  xfs: use iolock on XFS_IOC_ALLOCSP calls
  xfs: kill XBF_DONTBLOCK
  xfs: kill xfs_read_buf()
  xfs: kill XBF_LOCK
  ...
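For context on the SEEK_DATA/SEEK_HOLE item above: the series only wires XFS into the existing lseek() whence values, so the userspace interface is unchanged. A minimal sketch of how a program might walk the data extents of a sparse file with that interface is shown below (the file name and output format are illustrative, not taken from this series; SEEK_DATA/SEEK_HOLE need _GNU_SOURCE on glibc):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t off = 0, end, data, hole;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	end = lseek(fd, 0, SEEK_END);
	while (off < end) {
		data = lseek(fd, off, SEEK_DATA);	/* start of next data extent */
		if (data < 0)
			break;				/* only holes remain (ENXIO) */
		hole = lseek(fd, data, SEEK_HOLE);	/* end of that data extent */
		printf("data %lld..%lld\n", (long long)data, (long long)hole);
		off = hole;
	}
	close(fd);
	return 0;
}

On filesystems without native support the kernel falls back to treating the whole file as one data extent, so the loop above degrades gracefully.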
-rw-r--r--MAINTAINERS2
-rw-r--r--fs/xfs/Makefile2
-rw-r--r--fs/xfs/xfs_ag.h18
-rw-r--r--fs/xfs/xfs_alloc.c585
-rw-r--r--fs/xfs/xfs_alloc.h28
-rw-r--r--fs/xfs/xfs_alloc_btree.c9
-rw-r--r--fs/xfs/xfs_aops.c218
-rw-r--r--fs/xfs/xfs_attr.c25
-rw-r--r--fs/xfs/xfs_attr_leaf.c3
-rw-r--r--fs/xfs/xfs_bmap.c32
-rw-r--r--fs/xfs/xfs_bmap.h3
-rw-r--r--fs/xfs/xfs_bmap_btree.c1
-rw-r--r--fs/xfs/xfs_btree.c1
-rw-r--r--fs/xfs/xfs_buf.c593
-rw-r--r--fs/xfs/xfs_buf.h96
-rw-r--r--fs/xfs/xfs_buf_item.c123
-rw-r--r--fs/xfs/xfs_da_btree.c17
-rw-r--r--fs/xfs/xfs_dfrag.c2
-rw-r--r--fs/xfs/xfs_dir2.c1
-rw-r--r--fs/xfs/xfs_dir2_block.c1
-rw-r--r--fs/xfs/xfs_dir2_data.c1
-rw-r--r--fs/xfs/xfs_dir2_leaf.c1
-rw-r--r--fs/xfs/xfs_dir2_node.c1
-rw-r--r--fs/xfs/xfs_dir2_sf.c1
-rw-r--r--fs/xfs/xfs_discard.c6
-rw-r--r--fs/xfs/xfs_dquot.c91
-rw-r--r--fs/xfs/xfs_dquot.h3
-rw-r--r--fs/xfs/xfs_dquot_item.c162
-rw-r--r--fs/xfs/xfs_error.c1
-rw-r--r--fs/xfs/xfs_export.c1
-rw-r--r--fs/xfs/xfs_extent_busy.c603
-rw-r--r--fs/xfs/xfs_extent_busy.h69
-rw-r--r--fs/xfs/xfs_extfree_item.c59
-rw-r--r--fs/xfs/xfs_file.c327
-rw-r--r--fs/xfs/xfs_fsops.c82
-rw-r--r--fs/xfs/xfs_ialloc.c10
-rw-r--r--fs/xfs/xfs_ialloc.h9
-rw-r--r--fs/xfs/xfs_ialloc_btree.c1
-rw-r--r--fs/xfs/xfs_iget.c24
-rw-r--r--fs/xfs/xfs_inode.c132
-rw-r--r--fs/xfs/xfs_inode.h5
-rw-r--r--fs/xfs/xfs_inode_item.c176
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_inum.h5
-rw-r--r--fs/xfs/xfs_ioctl.c2
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c59
-rw-r--r--fs/xfs/xfs_iops.c15
-rw-r--r--fs/xfs/xfs_itable.c1
-rw-r--r--fs/xfs/xfs_log.c49
-rw-r--r--fs/xfs/xfs_log.h1
-rw-r--r--fs/xfs/xfs_log_cil.c253
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c103
-rw-r--r--fs/xfs/xfs_message.c1
-rw-r--r--fs/xfs/xfs_mount.c77
-rw-r--r--fs/xfs/xfs_mount.h2
-rw-r--r--fs/xfs/xfs_qm.c196
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c1
-rw-r--r--fs/xfs/xfs_quotaops.c1
-rw-r--r--fs/xfs/xfs_rename.c1
-rw-r--r--fs/xfs/xfs_rtalloc.c10
-rw-r--r--fs/xfs/xfs_rw.c156
-rw-r--r--fs/xfs/xfs_rw.h47
-rw-r--r--fs/xfs/xfs_super.c49
-rw-r--r--fs/xfs/xfs_sync.c281
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h53
-rw-r--r--fs/xfs/xfs_trans.c7
-rw-r--r--fs/xfs/xfs_trans.h18
-rw-r--r--fs/xfs/xfs_trans_ail.c207
-rw-r--r--fs/xfs/xfs_trans_buf.c126
-rw-r--r--fs/xfs/xfs_trans_dquot.c2
-rw-r--r--fs/xfs/xfs_trans_extfree.c1
-rw-r--r--fs/xfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/xfs_trans_priv.h12
-rw-r--r--fs/xfs/xfs_types.h5
-rw-r--r--fs/xfs/xfs_utils.c2
-rw-r--r--fs/xfs/xfs_vnodeops.c31
80 files changed, 2459 insertions, 2852 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index eaff0392eb32..150a29f3cd33 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7623,7 +7623,7 @@ XFS FILESYSTEM
7623P: Silicon Graphics Inc 7623P: Silicon Graphics Inc
7624M: Ben Myers <bpm@sgi.com> 7624M: Ben Myers <bpm@sgi.com>
7625M: Alex Elder <elder@kernel.org> 7625M: Alex Elder <elder@kernel.org>
7626M: xfs-masters@oss.sgi.com 7626M: xfs@oss.sgi.com
7627L: xfs@oss.sgi.com 7627L: xfs@oss.sgi.com
7628W: http://oss.sgi.com/projects/xfs 7628W: http://oss.sgi.com/projects/xfs
7629T: git git://oss.sgi.com/xfs/xfs.git 7629T: git git://oss.sgi.com/xfs/xfs.git
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0a9977983f92..d2bf974b1a2f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-y += xfs_aops.o \
33 xfs_discard.o \ 33 xfs_discard.o \
34 xfs_error.o \ 34 xfs_error.o \
35 xfs_export.o \ 35 xfs_export.o \
36 xfs_extent_busy.o \
36 xfs_file.o \ 37 xfs_file.o \
37 xfs_filestream.o \ 38 xfs_filestream.o \
38 xfs_fsops.o \ 39 xfs_fsops.o \
@@ -49,7 +50,6 @@ xfs-y += xfs_aops.o \
49 xfs_sync.o \ 50 xfs_sync.o \
50 xfs_xattr.o \ 51 xfs_xattr.o \
51 xfs_rename.o \ 52 xfs_rename.o \
52 xfs_rw.o \
53 xfs_utils.o \ 53 xfs_utils.o \
54 xfs_vnodeops.o \ 54 xfs_vnodeops.o \
55 kmem.o \ 55 kmem.o \
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4805f009f923..44d65c1533c0 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,24 +175,6 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
183 */
184struct xfs_busy_extent {
185 struct rb_node rb_node; /* ag by-bno indexed search tree */
186 struct list_head list; /* transaction busy extent list */
187 xfs_agnumber_t agno;
188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 unsigned int flags;
191#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
192#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
193};
194
195/*
196 * Per-ag incore structure, copies of information in agf and agi, 178 * Per-ag incore structure, copies of information in agf and agi,
197 * to improve the performance of allocation group selection. 179 * to improve the performance of allocation group selection.
198 */ 180 */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 0f0df2759b09..229641fb8e67 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -20,7 +20,6 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -32,6 +31,7 @@
32#include "xfs_inode.h" 31#include "xfs_inode.h"
33#include "xfs_btree.h" 32#include "xfs_btree.h"
34#include "xfs_alloc.h" 33#include "xfs_alloc.h"
34#include "xfs_extent_busy.h"
35#include "xfs_error.h" 35#include "xfs_error.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37 37
@@ -47,8 +47,6 @@ STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
47STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); 47STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
48STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, 48STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
49 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); 49 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
50STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
51 xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
52 50
53/* 51/*
54 * Lookup the record equal to [bno, len] in the btree given by cur. 52 * Lookup the record equal to [bno, len] in the btree given by cur.
@@ -152,7 +150,7 @@ xfs_alloc_compute_aligned(
152 xfs_extlen_t len; 150 xfs_extlen_t len;
153 151
154 /* Trim busy sections out of found extent */ 152 /* Trim busy sections out of found extent */
155 xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); 153 xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
156 154
157 if (args->alignment > 1 && len >= args->minlen) { 155 if (args->alignment > 1 && len >= args->minlen) {
158 xfs_agblock_t aligned_bno = roundup(bno, args->alignment); 156 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
@@ -536,7 +534,7 @@ xfs_alloc_ag_vextent(
536 if (error) 534 if (error)
537 return error; 535 return error;
538 536
539 ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, 537 ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
540 args->agbno, args->len)); 538 args->agbno, args->len));
541 } 539 }
542 540
@@ -603,7 +601,7 @@ xfs_alloc_ag_vextent_exact(
603 /* 601 /*
604 * Check for overlapping busy extents. 602 * Check for overlapping busy extents.
605 */ 603 */
606 xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); 604 xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
607 605
608 /* 606 /*
609 * Give up if the start of the extent is busy, or the freespace isn't 607 * Give up if the start of the extent is busy, or the freespace isn't
@@ -1391,7 +1389,7 @@ xfs_alloc_ag_vextent_small(
1391 if (error) 1389 if (error)
1392 goto error0; 1390 goto error0;
1393 if (fbno != NULLAGBLOCK) { 1391 if (fbno != NULLAGBLOCK) {
1394 xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, 1392 xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
1395 args->userdata); 1393 args->userdata);
1396 1394
1397 if (args->userdata) { 1395 if (args->userdata) {
@@ -2496,579 +2494,8 @@ xfs_free_extent(
2496 2494
2497 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2495 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2498 if (!error) 2496 if (!error)
2499 xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); 2497 xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
2500error0: 2498error0:
2501 xfs_perag_put(args.pag); 2499 xfs_perag_put(args.pag);
2502 return error; 2500 return error;
2503} 2501}
2504
2505void
2506xfs_alloc_busy_insert(
2507 struct xfs_trans *tp,
2508 xfs_agnumber_t agno,
2509 xfs_agblock_t bno,
2510 xfs_extlen_t len,
2511 unsigned int flags)
2512{
2513 struct xfs_busy_extent *new;
2514 struct xfs_busy_extent *busyp;
2515 struct xfs_perag *pag;
2516 struct rb_node **rbp;
2517 struct rb_node *parent = NULL;
2518
2519 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2520 if (!new) {
2521 /*
2522 * No Memory! Since it is now not possible to track the free
2523 * block, make this a synchronous transaction to insure that
2524 * the block is not reused before this transaction commits.
2525 */
2526 trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
2527 xfs_trans_set_sync(tp);
2528 return;
2529 }
2530
2531 new->agno = agno;
2532 new->bno = bno;
2533 new->length = len;
2534 INIT_LIST_HEAD(&new->list);
2535 new->flags = flags;
2536
2537 /* trace before insert to be able to see failed inserts */
2538 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
2539
2540 pag = xfs_perag_get(tp->t_mountp, new->agno);
2541 spin_lock(&pag->pagb_lock);
2542 rbp = &pag->pagb_tree.rb_node;
2543 while (*rbp) {
2544 parent = *rbp;
2545 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2546
2547 if (new->bno < busyp->bno) {
2548 rbp = &(*rbp)->rb_left;
2549 ASSERT(new->bno + new->length <= busyp->bno);
2550 } else if (new->bno > busyp->bno) {
2551 rbp = &(*rbp)->rb_right;
2552 ASSERT(bno >= busyp->bno + busyp->length);
2553 } else {
2554 ASSERT(0);
2555 }
2556 }
2557
2558 rb_link_node(&new->rb_node, parent, rbp);
2559 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2560
2561 list_add(&new->list, &tp->t_busy);
2562 spin_unlock(&pag->pagb_lock);
2563 xfs_perag_put(pag);
2564}
2565
2566/*
2567 * Search for a busy extent within the range of the extent we are about to
2568 * allocate. You need to be holding the busy extent tree lock when calling
2569 * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
2570 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
2571 * match. This is done so that a non-zero return indicates an overlap that
2572 * will require a synchronous transaction, but it can still be
2573 * used to distinguish between a partial or exact match.
2574 */
2575int
2576xfs_alloc_busy_search(
2577 struct xfs_mount *mp,
2578 xfs_agnumber_t agno,
2579 xfs_agblock_t bno,
2580 xfs_extlen_t len)
2581{
2582 struct xfs_perag *pag;
2583 struct rb_node *rbp;
2584 struct xfs_busy_extent *busyp;
2585 int match = 0;
2586
2587 pag = xfs_perag_get(mp, agno);
2588 spin_lock(&pag->pagb_lock);
2589
2590 rbp = pag->pagb_tree.rb_node;
2591
2592 /* find closest start bno overlap */
2593 while (rbp) {
2594 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2595 if (bno < busyp->bno) {
2596 /* may overlap, but exact start block is lower */
2597 if (bno + len > busyp->bno)
2598 match = -1;
2599 rbp = rbp->rb_left;
2600 } else if (bno > busyp->bno) {
2601 /* may overlap, but exact start block is higher */
2602 if (bno < busyp->bno + busyp->length)
2603 match = -1;
2604 rbp = rbp->rb_right;
2605 } else {
2606 /* bno matches busyp, length determines exact match */
2607 match = (busyp->length == len) ? 1 : -1;
2608 break;
2609 }
2610 }
2611 spin_unlock(&pag->pagb_lock);
2612 xfs_perag_put(pag);
2613 return match;
2614}
2615
2616/*
2617 * The found free extent [fbno, fend] overlaps part or all of the given busy
2618 * extent. If the overlap covers the beginning, the end, or all of the busy
2619 * extent, the overlapping portion can be made unbusy and used for the
2620 * allocation. We can't split a busy extent because we can't modify a
2621 * transaction/CIL context busy list, but we can update an entries block
2622 * number or length.
2623 *
2624 * Returns true if the extent can safely be reused, or false if the search
2625 * needs to be restarted.
2626 */
2627STATIC bool
2628xfs_alloc_busy_update_extent(
2629 struct xfs_mount *mp,
2630 struct xfs_perag *pag,
2631 struct xfs_busy_extent *busyp,
2632 xfs_agblock_t fbno,
2633 xfs_extlen_t flen,
2634 bool userdata)
2635{
2636 xfs_agblock_t fend = fbno + flen;
2637 xfs_agblock_t bbno = busyp->bno;
2638 xfs_agblock_t bend = bbno + busyp->length;
2639
2640 /*
2641 * This extent is currently being discarded. Give the thread
2642 * performing the discard a chance to mark the extent unbusy
2643 * and retry.
2644 */
2645 if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
2646 spin_unlock(&pag->pagb_lock);
2647 delay(1);
2648 spin_lock(&pag->pagb_lock);
2649 return false;
2650 }
2651
2652 /*
2653 * If there is a busy extent overlapping a user allocation, we have
2654 * no choice but to force the log and retry the search.
2655 *
2656 * Fortunately this does not happen during normal operation, but
2657 * only if the filesystem is very low on space and has to dip into
2658 * the AGFL for normal allocations.
2659 */
2660 if (userdata)
2661 goto out_force_log;
2662
2663 if (bbno < fbno && bend > fend) {
2664 /*
2665 * Case 1:
2666 * bbno bend
2667 * +BBBBBBBBBBBBBBBBB+
2668 * +---------+
2669 * fbno fend
2670 */
2671
2672 /*
2673 * We would have to split the busy extent to be able to track
2674 * it correct, which we cannot do because we would have to
2675 * modify the list of busy extents attached to the transaction
2676 * or CIL context, which is immutable.
2677 *
2678 * Force out the log to clear the busy extent and retry the
2679 * search.
2680 */
2681 goto out_force_log;
2682 } else if (bbno >= fbno && bend <= fend) {
2683 /*
2684 * Case 2:
2685 * bbno bend
2686 * +BBBBBBBBBBBBBBBBB+
2687 * +-----------------+
2688 * fbno fend
2689 *
2690 * Case 3:
2691 * bbno bend
2692 * +BBBBBBBBBBBBBBBBB+
2693 * +--------------------------+
2694 * fbno fend
2695 *
2696 * Case 4:
2697 * bbno bend
2698 * +BBBBBBBBBBBBBBBBB+
2699 * +--------------------------+
2700 * fbno fend
2701 *
2702 * Case 5:
2703 * bbno bend
2704 * +BBBBBBBBBBBBBBBBB+
2705 * +-----------------------------------+
2706 * fbno fend
2707 *
2708 */
2709
2710 /*
2711 * The busy extent is fully covered by the extent we are
2712 * allocating, and can simply be removed from the rbtree.
2713 * However we cannot remove it from the immutable list
2714 * tracking busy extents in the transaction or CIL context,
2715 * so set the length to zero to mark it invalid.
2716 *
2717 * We also need to restart the busy extent search from the
2718 * tree root, because erasing the node can rearrange the
2719 * tree topology.
2720 */
2721 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2722 busyp->length = 0;
2723 return false;
2724 } else if (fend < bend) {
2725 /*
2726 * Case 6:
2727 * bbno bend
2728 * +BBBBBBBBBBBBBBBBB+
2729 * +---------+
2730 * fbno fend
2731 *
2732 * Case 7:
2733 * bbno bend
2734 * +BBBBBBBBBBBBBBBBB+
2735 * +------------------+
2736 * fbno fend
2737 *
2738 */
2739 busyp->bno = fend;
2740 } else if (bbno < fbno) {
2741 /*
2742 * Case 8:
2743 * bbno bend
2744 * +BBBBBBBBBBBBBBBBB+
2745 * +-------------+
2746 * fbno fend
2747 *
2748 * Case 9:
2749 * bbno bend
2750 * +BBBBBBBBBBBBBBBBB+
2751 * +----------------------+
2752 * fbno fend
2753 */
2754 busyp->length = fbno - busyp->bno;
2755 } else {
2756 ASSERT(0);
2757 }
2758
2759 trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
2760 return true;
2761
2762out_force_log:
2763 spin_unlock(&pag->pagb_lock);
2764 xfs_log_force(mp, XFS_LOG_SYNC);
2765 trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
2766 spin_lock(&pag->pagb_lock);
2767 return false;
2768}
2769
2770
2771/*
2772 * For a given extent [fbno, flen], make sure we can reuse it safely.
2773 */
2774void
2775xfs_alloc_busy_reuse(
2776 struct xfs_mount *mp,
2777 xfs_agnumber_t agno,
2778 xfs_agblock_t fbno,
2779 xfs_extlen_t flen,
2780 bool userdata)
2781{
2782 struct xfs_perag *pag;
2783 struct rb_node *rbp;
2784
2785 ASSERT(flen > 0);
2786
2787 pag = xfs_perag_get(mp, agno);
2788 spin_lock(&pag->pagb_lock);
2789restart:
2790 rbp = pag->pagb_tree.rb_node;
2791 while (rbp) {
2792 struct xfs_busy_extent *busyp =
2793 rb_entry(rbp, struct xfs_busy_extent, rb_node);
2794 xfs_agblock_t bbno = busyp->bno;
2795 xfs_agblock_t bend = bbno + busyp->length;
2796
2797 if (fbno + flen <= bbno) {
2798 rbp = rbp->rb_left;
2799 continue;
2800 } else if (fbno >= bend) {
2801 rbp = rbp->rb_right;
2802 continue;
2803 }
2804
2805 if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
2806 userdata))
2807 goto restart;
2808 }
2809 spin_unlock(&pag->pagb_lock);
2810 xfs_perag_put(pag);
2811}
2812
2813/*
2814 * For a given extent [fbno, flen], search the busy extent list to find a
2815 * subset of the extent that is not busy. If *rlen is smaller than
2816 * args->minlen no suitable extent could be found, and the higher level
2817 * code needs to force out the log and retry the allocation.
2818 */
2819STATIC void
2820xfs_alloc_busy_trim(
2821 struct xfs_alloc_arg *args,
2822 xfs_agblock_t bno,
2823 xfs_extlen_t len,
2824 xfs_agblock_t *rbno,
2825 xfs_extlen_t *rlen)
2826{
2827 xfs_agblock_t fbno;
2828 xfs_extlen_t flen;
2829 struct rb_node *rbp;
2830
2831 ASSERT(len > 0);
2832
2833 spin_lock(&args->pag->pagb_lock);
2834restart:
2835 fbno = bno;
2836 flen = len;
2837 rbp = args->pag->pagb_tree.rb_node;
2838 while (rbp && flen >= args->minlen) {
2839 struct xfs_busy_extent *busyp =
2840 rb_entry(rbp, struct xfs_busy_extent, rb_node);
2841 xfs_agblock_t fend = fbno + flen;
2842 xfs_agblock_t bbno = busyp->bno;
2843 xfs_agblock_t bend = bbno + busyp->length;
2844
2845 if (fend <= bbno) {
2846 rbp = rbp->rb_left;
2847 continue;
2848 } else if (fbno >= bend) {
2849 rbp = rbp->rb_right;
2850 continue;
2851 }
2852
2853 /*
2854 * If this is a metadata allocation, try to reuse the busy
2855 * extent instead of trimming the allocation.
2856 */
2857 if (!args->userdata &&
2858 !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
2859 if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
2860 busyp, fbno, flen,
2861 false))
2862 goto restart;
2863 continue;
2864 }
2865
2866 if (bbno <= fbno) {
2867 /* start overlap */
2868
2869 /*
2870 * Case 1:
2871 * bbno bend
2872 * +BBBBBBBBBBBBBBBBB+
2873 * +---------+
2874 * fbno fend
2875 *
2876 * Case 2:
2877 * bbno bend
2878 * +BBBBBBBBBBBBBBBBB+
2879 * +-------------+
2880 * fbno fend
2881 *
2882 * Case 3:
2883 * bbno bend
2884 * +BBBBBBBBBBBBBBBBB+
2885 * +-------------+
2886 * fbno fend
2887 *
2888 * Case 4:
2889 * bbno bend
2890 * +BBBBBBBBBBBBBBBBB+
2891 * +-----------------+
2892 * fbno fend
2893 *
2894 * No unbusy region in extent, return failure.
2895 */
2896 if (fend <= bend)
2897 goto fail;
2898
2899 /*
2900 * Case 5:
2901 * bbno bend
2902 * +BBBBBBBBBBBBBBBBB+
2903 * +----------------------+
2904 * fbno fend
2905 *
2906 * Case 6:
2907 * bbno bend
2908 * +BBBBBBBBBBBBBBBBB+
2909 * +--------------------------+
2910 * fbno fend
2911 *
2912 * Needs to be trimmed to:
2913 * +-------+
2914 * fbno fend
2915 */
2916 fbno = bend;
2917 } else if (bend >= fend) {
2918 /* end overlap */
2919
2920 /*
2921 * Case 7:
2922 * bbno bend
2923 * +BBBBBBBBBBBBBBBBB+
2924 * +------------------+
2925 * fbno fend
2926 *
2927 * Case 8:
2928 * bbno bend
2929 * +BBBBBBBBBBBBBBBBB+
2930 * +--------------------------+
2931 * fbno fend
2932 *
2933 * Needs to be trimmed to:
2934 * +-------+
2935 * fbno fend
2936 */
2937 fend = bbno;
2938 } else {
2939 /* middle overlap */
2940
2941 /*
2942 * Case 9:
2943 * bbno bend
2944 * +BBBBBBBBBBBBBBBBB+
2945 * +-----------------------------------+
2946 * fbno fend
2947 *
2948 * Can be trimmed to:
2949 * +-------+ OR +-------+
2950 * fbno fend fbno fend
2951 *
2952 * Backward allocation leads to significant
2953 * fragmentation of directories, which degrades
2954 * directory performance, therefore we always want to
2955 * choose the option that produces forward allocation
2956 * patterns.
2957 * Preferring the lower bno extent will make the next
2958 * request use "fend" as the start of the next
2959 * allocation; if the segment is no longer busy at
2960 * that point, we'll get a contiguous allocation, but
2961 * even if it is still busy, we will get a forward
2962 * allocation.
2963 * We try to avoid choosing the segment at "bend",
2964 * because that can lead to the next allocation
2965 * taking the segment at "fbno", which would be a
2966 * backward allocation. We only use the segment at
2967 * "fbno" if it is much larger than the current
2968 * requested size, because in that case there's a
2969 * good chance subsequent allocations will be
2970 * contiguous.
2971 */
2972 if (bbno - fbno >= args->maxlen) {
2973 /* left candidate fits perfect */
2974 fend = bbno;
2975 } else if (fend - bend >= args->maxlen * 4) {
2976 /* right candidate has enough free space */
2977 fbno = bend;
2978 } else if (bbno - fbno >= args->minlen) {
2979 /* left candidate fits minimum requirement */
2980 fend = bbno;
2981 } else {
2982 goto fail;
2983 }
2984 }
2985
2986 flen = fend - fbno;
2987 }
2988 spin_unlock(&args->pag->pagb_lock);
2989
2990 if (fbno != bno || flen != len) {
2991 trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
2992 fbno, flen);
2993 }
2994 *rbno = fbno;
2995 *rlen = flen;
2996 return;
2997fail:
2998 /*
2999 * Return a zero extent length as failure indications. All callers
3000 * re-check if the trimmed extent satisfies the minlen requirement.
3001 */
3002 spin_unlock(&args->pag->pagb_lock);
3003 trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
3004 *rbno = fbno;
3005 *rlen = 0;
3006}
3007
3008static void
3009xfs_alloc_busy_clear_one(
3010 struct xfs_mount *mp,
3011 struct xfs_perag *pag,
3012 struct xfs_busy_extent *busyp)
3013{
3014 if (busyp->length) {
3015 trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
3016 busyp->length);
3017 rb_erase(&busyp->rb_node, &pag->pagb_tree);
3018 }
3019
3020 list_del_init(&busyp->list);
3021 kmem_free(busyp);
3022}
3023
3024/*
3025 * Remove all extents on the passed in list from the busy extents tree.
3026 * If do_discard is set skip extents that need to be discarded, and mark
3027 * these as undergoing a discard operation instead.
3028 */
3029void
3030xfs_alloc_busy_clear(
3031 struct xfs_mount *mp,
3032 struct list_head *list,
3033 bool do_discard)
3034{
3035 struct xfs_busy_extent *busyp, *n;
3036 struct xfs_perag *pag = NULL;
3037 xfs_agnumber_t agno = NULLAGNUMBER;
3038
3039 list_for_each_entry_safe(busyp, n, list, list) {
3040 if (busyp->agno != agno) {
3041 if (pag) {
3042 spin_unlock(&pag->pagb_lock);
3043 xfs_perag_put(pag);
3044 }
3045 pag = xfs_perag_get(mp, busyp->agno);
3046 spin_lock(&pag->pagb_lock);
3047 agno = busyp->agno;
3048 }
3049
3050 if (do_discard && busyp->length &&
3051 !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
3052 busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
3053 else
3054 xfs_alloc_busy_clear_one(mp, pag, busyp);
3055 }
3056
3057 if (pag) {
3058 spin_unlock(&pag->pagb_lock);
3059 xfs_perag_put(pag);
3060 }
3061}
3062
3063/*
3064 * Callback for list_sort to sort busy extents by the AG they reside in.
3065 */
3066int
3067xfs_busy_extent_ag_cmp(
3068 void *priv,
3069 struct list_head *a,
3070 struct list_head *b)
3071{
3072 return container_of(a, struct xfs_busy_extent, list)->agno -
3073 container_of(b, struct xfs_busy_extent, list)->agno;
3074}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 3a7e7d8f8ded..93be4a667ca1 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -23,7 +23,6 @@ struct xfs_btree_cur;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_perag; 24struct xfs_perag;
25struct xfs_trans; 25struct xfs_trans;
26struct xfs_busy_extent;
27 26
28extern struct workqueue_struct *xfs_alloc_wq; 27extern struct workqueue_struct *xfs_alloc_wq;
29 28
@@ -139,33 +138,6 @@ xfs_extlen_t
139xfs_alloc_longest_free_extent(struct xfs_mount *mp, 138xfs_alloc_longest_free_extent(struct xfs_mount *mp,
140 struct xfs_perag *pag); 139 struct xfs_perag *pag);
141 140
142#ifdef __KERNEL__
143void
144xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
145 xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
146
147void
148xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
149 bool do_discard);
150
151int
152xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
153 xfs_agblock_t bno, xfs_extlen_t len);
154
155void
156xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
157 xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
158
159int
160xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
161
162static inline void xfs_alloc_busy_sort(struct list_head *list)
163{
164 list_sort(NULL, list, xfs_busy_extent_ag_cmp);
165}
166
167#endif /* __KERNEL__ */
168
169/* 141/*
170 * Compute and fill in value of m_ag_maxlevels. 142 * Compute and fill in value of m_ag_maxlevels.
171 */ 143 */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index ffb3386e45c1..f1647caace8f 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -18,9 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_sb.h" 23#include "xfs_sb.h"
26#include "xfs_ag.h" 24#include "xfs_ag.h"
@@ -32,6 +30,7 @@
32#include "xfs_inode.h" 30#include "xfs_inode.h"
33#include "xfs_btree.h" 31#include "xfs_btree.h"
34#include "xfs_alloc.h" 32#include "xfs_alloc.h"
33#include "xfs_extent_busy.h"
35#include "xfs_error.h" 34#include "xfs_error.h"
36#include "xfs_trace.h" 35#include "xfs_trace.h"
37 36
@@ -94,7 +93,7 @@ xfs_allocbt_alloc_block(
94 return 0; 93 return 0;
95 } 94 }
96 95
97 xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); 96 xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
98 97
99 xfs_trans_agbtree_delta(cur->bc_tp, 1); 98 xfs_trans_agbtree_delta(cur->bc_tp, 1);
100 new->s = cpu_to_be32(bno); 99 new->s = cpu_to_be32(bno);
@@ -119,8 +118,8 @@ xfs_allocbt_free_block(
119 if (error) 118 if (error)
120 return error; 119 return error;
121 120
122 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, 121 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
123 XFS_ALLOC_BUSY_SKIP_DISCARD); 122 XFS_EXTENT_BUSY_SKIP_DISCARD);
124 xfs_trans_agbtree_delta(cur->bc_tp, -1); 123 xfs_trans_agbtree_delta(cur->bc_tp, -1);
125 return 0; 124 return 0;
126} 125}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 0dbb9e70fe21..ae31c313a79e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -16,9 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_bit.h"
20#include "xfs_log.h" 19#include "xfs_log.h"
21#include "xfs_inum.h"
22#include "xfs_sb.h" 20#include "xfs_sb.h"
23#include "xfs_ag.h" 21#include "xfs_ag.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
@@ -29,7 +27,6 @@
29#include "xfs_inode_item.h" 27#include "xfs_inode_item.h"
30#include "xfs_alloc.h" 28#include "xfs_alloc.h"
31#include "xfs_error.h" 29#include "xfs_error.h"
32#include "xfs_rw.h"
33#include "xfs_iomap.h" 30#include "xfs_iomap.h"
34#include "xfs_vnodeops.h" 31#include "xfs_vnodeops.h"
35#include "xfs_trace.h" 32#include "xfs_trace.h"
@@ -623,7 +620,7 @@ xfs_map_at_offset(
623 * or delayed allocate extent. 620 * or delayed allocate extent.
624 */ 621 */
625STATIC int 622STATIC int
626xfs_is_delayed_page( 623xfs_check_page_type(
627 struct page *page, 624 struct page *page,
628 unsigned int type) 625 unsigned int type)
629{ 626{
@@ -637,11 +634,11 @@ xfs_is_delayed_page(
637 bh = head = page_buffers(page); 634 bh = head = page_buffers(page);
638 do { 635 do {
639 if (buffer_unwritten(bh)) 636 if (buffer_unwritten(bh))
640 acceptable = (type == IO_UNWRITTEN); 637 acceptable += (type == IO_UNWRITTEN);
641 else if (buffer_delay(bh)) 638 else if (buffer_delay(bh))
642 acceptable = (type == IO_DELALLOC); 639 acceptable += (type == IO_DELALLOC);
643 else if (buffer_dirty(bh) && buffer_mapped(bh)) 640 else if (buffer_dirty(bh) && buffer_mapped(bh))
644 acceptable = (type == IO_OVERWRITE); 641 acceptable += (type == IO_OVERWRITE);
645 else 642 else
646 break; 643 break;
647 } while ((bh = bh->b_this_page) != head); 644 } while ((bh = bh->b_this_page) != head);
@@ -684,7 +681,7 @@ xfs_convert_page(
684 goto fail_unlock_page; 681 goto fail_unlock_page;
685 if (page->mapping != inode->i_mapping) 682 if (page->mapping != inode->i_mapping)
686 goto fail_unlock_page; 683 goto fail_unlock_page;
687 if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) 684 if (!xfs_check_page_type(page, (*ioendp)->io_type))
688 goto fail_unlock_page; 685 goto fail_unlock_page;
689 686
690 /* 687 /*
@@ -834,7 +831,7 @@ xfs_aops_discard_page(
834 struct buffer_head *bh, *head; 831 struct buffer_head *bh, *head;
835 loff_t offset = page_offset(page); 832 loff_t offset = page_offset(page);
836 833
837 if (!xfs_is_delayed_page(page, IO_DELALLOC)) 834 if (!xfs_check_page_type(page, IO_DELALLOC))
838 goto out_invalidate; 835 goto out_invalidate;
839 836
840 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 837 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1146,7 +1143,14 @@ __xfs_get_blocks(
1146 if (!create && direct && offset >= i_size_read(inode)) 1143 if (!create && direct && offset >= i_size_read(inode))
1147 return 0; 1144 return 0;
1148 1145
1149 if (create) { 1146 /*
1147 * Direct I/O is usually done on preallocated files, so try getting
1148 * a block mapping without an exclusive lock first. For buffered
1149 * writes we already have the exclusive iolock anyway, so avoiding
1150 * a lock roundtrip here by taking the ilock exclusive from the
1151 * beginning is a useful micro optimization.
1152 */
1153 if (create && !direct) {
1150 lockmode = XFS_ILOCK_EXCL; 1154 lockmode = XFS_ILOCK_EXCL;
1151 xfs_ilock(ip, lockmode); 1155 xfs_ilock(ip, lockmode);
1152 } else { 1156 } else {
@@ -1168,23 +1172,45 @@ __xfs_get_blocks(
1168 (!nimaps || 1172 (!nimaps ||
1169 (imap.br_startblock == HOLESTARTBLOCK || 1173 (imap.br_startblock == HOLESTARTBLOCK ||
1170 imap.br_startblock == DELAYSTARTBLOCK))) { 1174 imap.br_startblock == DELAYSTARTBLOCK))) {
1171 if (direct) { 1175 if (direct || xfs_get_extsz_hint(ip)) {
1176 /*
1177 * Drop the ilock in preparation for starting the block
1178 * allocation transaction. It will be retaken
1179 * exclusively inside xfs_iomap_write_direct for the
1180 * actual allocation.
1181 */
1182 xfs_iunlock(ip, lockmode);
1172 error = xfs_iomap_write_direct(ip, offset, size, 1183 error = xfs_iomap_write_direct(ip, offset, size,
1173 &imap, nimaps); 1184 &imap, nimaps);
1185 if (error)
1186 return -error;
1187 new = 1;
1174 } else { 1188 } else {
1189 /*
1190 * Delalloc reservations do not require a transaction,
1191 * we can go on without dropping the lock here. If we
1192 * are allocating a new delalloc block, make sure that
1193 * we set the new flag so that we mark the buffer new so
1194 * that we know that it is newly allocated if the write
1195 * fails.
1196 */
1197 if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
1198 new = 1;
1175 error = xfs_iomap_write_delay(ip, offset, size, &imap); 1199 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1200 if (error)
1201 goto out_unlock;
1202
1203 xfs_iunlock(ip, lockmode);
1176 } 1204 }
1177 if (error)
1178 goto out_unlock;
1179 1205
1180 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); 1206 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1181 } else if (nimaps) { 1207 } else if (nimaps) {
1182 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); 1208 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1209 xfs_iunlock(ip, lockmode);
1183 } else { 1210 } else {
1184 trace_xfs_get_blocks_notfound(ip, offset, size); 1211 trace_xfs_get_blocks_notfound(ip, offset, size);
1185 goto out_unlock; 1212 goto out_unlock;
1186 } 1213 }
1187 xfs_iunlock(ip, lockmode);
1188 1214
1189 if (imap.br_startblock != HOLESTARTBLOCK && 1215 if (imap.br_startblock != HOLESTARTBLOCK &&
1190 imap.br_startblock != DELAYSTARTBLOCK) { 1216 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1386,52 +1412,91 @@ out_destroy_ioend:
1386 return ret; 1412 return ret;
1387} 1413}
1388 1414
1415/*
1416 * Punch out the delalloc blocks we have already allocated.
1417 *
1418 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
1419 * as the page is still locked at this point.
1420 */
1421STATIC void
1422xfs_vm_kill_delalloc_range(
1423 struct inode *inode,
1424 loff_t start,
1425 loff_t end)
1426{
1427 struct xfs_inode *ip = XFS_I(inode);
1428 xfs_fileoff_t start_fsb;
1429 xfs_fileoff_t end_fsb;
1430 int error;
1431
1432 start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
1433 end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
1434 if (end_fsb <= start_fsb)
1435 return;
1436
1437 xfs_ilock(ip, XFS_ILOCK_EXCL);
1438 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1439 end_fsb - start_fsb);
1440 if (error) {
1441 /* something screwed, just bail */
1442 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1443 xfs_alert(ip->i_mount,
1444 "xfs_vm_write_failed: unable to clean up ino %lld",
1445 ip->i_ino);
1446 }
1447 }
1448 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1449}
1450
1389STATIC void 1451STATIC void
1390xfs_vm_write_failed( 1452xfs_vm_write_failed(
1391 struct address_space *mapping, 1453 struct inode *inode,
1392 loff_t to) 1454 struct page *page,
1455 loff_t pos,
1456 unsigned len)
1393{ 1457{
1394 struct inode *inode = mapping->host; 1458 loff_t block_offset = pos & PAGE_MASK;
1459 loff_t block_start;
1460 loff_t block_end;
1461 loff_t from = pos & (PAGE_CACHE_SIZE - 1);
1462 loff_t to = from + len;
1463 struct buffer_head *bh, *head;
1395 1464
1396 if (to > inode->i_size) { 1465 ASSERT(block_offset + from == pos);
1397 /*
1398 * Punch out the delalloc blocks we have already allocated.
1399 *
1400 * Don't bother with xfs_setattr given that nothing can have
1401 * made it to disk yet as the page is still locked at this
1402 * point.
1403 */
1404 struct xfs_inode *ip = XFS_I(inode);
1405 xfs_fileoff_t start_fsb;
1406 xfs_fileoff_t end_fsb;
1407 int error;
1408 1466
1409 truncate_pagecache(inode, to, inode->i_size); 1467 head = page_buffers(page);
1468 block_start = 0;
1469 for (bh = head; bh != head || !block_start;
1470 bh = bh->b_this_page, block_start = block_end,
1471 block_offset += bh->b_size) {
1472 block_end = block_start + bh->b_size;
1410 1473
1411 /* 1474 /* skip buffers before the write */
1412 * Check if there are any blocks that are outside of i_size 1475 if (block_end <= from)
1413 * that need to be trimmed back. 1476 continue;
1414 */ 1477
1415 start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; 1478 /* if the buffer is after the write, we're done */
1416 end_fsb = XFS_B_TO_FSB(ip->i_mount, to); 1479 if (block_start >= to)
1417 if (end_fsb <= start_fsb) 1480 break;
1418 return; 1481
1419 1482 if (!buffer_delay(bh))
1420 xfs_ilock(ip, XFS_ILOCK_EXCL); 1483 continue;
1421 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1484
1422 end_fsb - start_fsb); 1485 if (!buffer_new(bh) && block_offset < i_size_read(inode))
1423 if (error) { 1486 continue;
1424 /* something screwed, just bail */ 1487
1425 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1488 xfs_vm_kill_delalloc_range(inode, block_offset,
1426 xfs_alert(ip->i_mount, 1489 block_offset + bh->b_size);
1427 "xfs_vm_write_failed: unable to clean up ino %lld",
1428 ip->i_ino);
1429 }
1430 }
1431 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1432 } 1490 }
1491
1433} 1492}
1434 1493
1494/*
1495 * This used to call block_write_begin(), but it unlocks and releases the page
1496 * on error, and we need that page to be able to punch stale delalloc blocks out
1497 * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
1498 * the appropriate point.
1499 */
1435STATIC int 1500STATIC int
1436xfs_vm_write_begin( 1501xfs_vm_write_begin(
1437 struct file *file, 1502 struct file *file,
@@ -1442,15 +1507,40 @@ xfs_vm_write_begin(
1442 struct page **pagep, 1507 struct page **pagep,
1443 void **fsdata) 1508 void **fsdata)
1444{ 1509{
1445 int ret; 1510 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1511 struct page *page;
1512 int status;
1446 1513
1447 ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS, 1514 ASSERT(len <= PAGE_CACHE_SIZE);
1448 pagep, xfs_get_blocks); 1515
1449 if (unlikely(ret)) 1516 page = grab_cache_page_write_begin(mapping, index,
1450 xfs_vm_write_failed(mapping, pos + len); 1517 flags | AOP_FLAG_NOFS);
1451 return ret; 1518 if (!page)
1519 return -ENOMEM;
1520
1521 status = __block_write_begin(page, pos, len, xfs_get_blocks);
1522 if (unlikely(status)) {
1523 struct inode *inode = mapping->host;
1524
1525 xfs_vm_write_failed(inode, page, pos, len);
1526 unlock_page(page);
1527
1528 if (pos + len > i_size_read(inode))
1529 truncate_pagecache(inode, pos + len, i_size_read(inode));
1530
1531 page_cache_release(page);
1532 page = NULL;
1533 }
1534
1535 *pagep = page;
1536 return status;
1452} 1537}
1453 1538
1539/*
1540 * On failure, we only need to kill delalloc blocks beyond EOF because they
1541 * will never be written. For blocks within EOF, generic_write_end() zeros them
1542 * so they are safe to leave alone and be written with all the other valid data.
1543 */
1454STATIC int 1544STATIC int
1455xfs_vm_write_end( 1545xfs_vm_write_end(
1456 struct file *file, 1546 struct file *file,
@@ -1463,9 +1553,19 @@ xfs_vm_write_end(
1463{ 1553{
1464 int ret; 1554 int ret;
1465 1555
1556 ASSERT(len <= PAGE_CACHE_SIZE);
1557
1466 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 1558 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1467 if (unlikely(ret < len)) 1559 if (unlikely(ret < len)) {
1468 xfs_vm_write_failed(mapping, pos + len); 1560 struct inode *inode = mapping->host;
1561 size_t isize = i_size_read(inode);
1562 loff_t to = pos + len;
1563
1564 if (to > isize) {
1565 truncate_pagecache(inode, to, isize);
1566 xfs_vm_kill_delalloc_range(inode, isize, to);
1567 }
1568 }
1469 return ret; 1569 return ret;
1470} 1570}
1471 1571
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 65d61b948ead..a17ff01b5adf 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -21,7 +21,6 @@
21#include "xfs_types.h" 21#include "xfs_types.h"
22#include "xfs_bit.h" 22#include "xfs_bit.h"
23#include "xfs_log.h" 23#include "xfs_log.h"
24#include "xfs_inum.h"
25#include "xfs_trans.h" 24#include "xfs_trans.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h" 26#include "xfs_ag.h"
@@ -39,7 +38,6 @@
39#include "xfs_error.h" 38#include "xfs_error.h"
40#include "xfs_quota.h" 39#include "xfs_quota.h"
41#include "xfs_trans_space.h" 40#include "xfs_trans_space.h"
42#include "xfs_rw.h"
43#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
44#include "xfs_trace.h" 42#include "xfs_trace.h"
45 43
@@ -1987,14 +1985,12 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1987 (map[i].br_startblock != HOLESTARTBLOCK)); 1985 (map[i].br_startblock != HOLESTARTBLOCK));
1988 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); 1986 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
1989 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 1987 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
1990 error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, 1988 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1991 blkcnt, XBF_LOCK | XBF_DONT_BLOCK, 1989 dblkno, blkcnt, 0, &bp);
1992 &bp);
1993 if (error) 1990 if (error)
1994 return(error); 1991 return(error);
1995 1992
1996 tmp = (valuelen < XFS_BUF_SIZE(bp)) 1993 tmp = min_t(int, valuelen, BBTOB(bp->b_length));
1997 ? valuelen : XFS_BUF_SIZE(bp);
1998 xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); 1994 xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
1999 xfs_buf_relse(bp); 1995 xfs_buf_relse(bp);
2000 dst += tmp; 1996 dst += tmp;
@@ -2097,6 +2093,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2097 lblkno = args->rmtblkno; 2093 lblkno = args->rmtblkno;
2098 valuelen = args->valuelen; 2094 valuelen = args->valuelen;
2099 while (valuelen > 0) { 2095 while (valuelen > 0) {
2096 int buflen;
2097
2100 /* 2098 /*
2101 * Try to remember where we decided to put the value. 2099 * Try to remember where we decided to put the value.
2102 */ 2100 */
@@ -2114,15 +2112,16 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2114 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), 2112 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
2115 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); 2113 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
2116 2114
2117 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 2115 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0);
2118 XBF_LOCK | XBF_DONT_BLOCK);
2119 if (!bp) 2116 if (!bp)
2120 return ENOMEM; 2117 return ENOMEM;
2121 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2118
2122 XFS_BUF_SIZE(bp); 2119 buflen = BBTOB(bp->b_length);
2120 tmp = min_t(int, valuelen, buflen);
2123 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); 2121 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
2124 if (tmp < XFS_BUF_SIZE(bp)) 2122 if (tmp < buflen)
2125 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2123 xfs_buf_zero(bp, tmp, buflen - tmp);
2124
2126 error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ 2125 error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
2127 xfs_buf_relse(bp); 2126 xfs_buf_relse(bp);
2128 if (error) 2127 if (error)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 76d93dc953e1..7d89d800f517 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -20,7 +20,6 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -2983,7 +2982,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2983 map.br_blockcount); 2982 map.br_blockcount);
2984 bp = xfs_trans_get_buf(*trans, 2983 bp = xfs_trans_get_buf(*trans,
2985 dp->i_mount->m_ddev_targp, 2984 dp->i_mount->m_ddev_targp,
2986 dblkno, dblkcnt, XBF_LOCK); 2985 dblkno, dblkcnt, 0);
2987 if (!bp) 2986 if (!bp)
2988 return ENOMEM; 2987 return ENOMEM;
2989 xfs_trans_binval(*trans, bp); 2988 xfs_trans_binval(*trans, bp);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 85e7e327bcd8..58b815ec8c91 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -41,7 +41,6 @@
41#include "xfs_rtalloc.h" 41#include "xfs_rtalloc.h"
42#include "xfs_error.h" 42#include "xfs_error.h"
43#include "xfs_attr_leaf.h" 43#include "xfs_attr_leaf.h"
44#include "xfs_rw.h"
45#include "xfs_quota.h" 44#include "xfs_quota.h"
46#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
@@ -4527,7 +4526,7 @@ out_unreserve_blocks:
4527 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); 4526 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
4528out_unreserve_quota: 4527out_unreserve_quota:
4529 if (XFS_IS_QUOTA_ON(mp)) 4528 if (XFS_IS_QUOTA_ON(mp))
4530 xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? 4529 xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
4531 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); 4530 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4532 return error; 4531 return error;
4533} 4532}
@@ -5621,8 +5620,20 @@ xfs_getbmap(
5621 XFS_FSB_TO_BB(mp, map[i].br_blockcount); 5620 XFS_FSB_TO_BB(mp, map[i].br_blockcount);
5622 out[cur_ext].bmv_unused1 = 0; 5621 out[cur_ext].bmv_unused1 = 0;
5623 out[cur_ext].bmv_unused2 = 0; 5622 out[cur_ext].bmv_unused2 = 0;
5624 ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || 5623
5625 (map[i].br_startblock != DELAYSTARTBLOCK)); 5624 /*
5625 * delayed allocation extents that start beyond EOF can
5626 * occur due to speculative EOF allocation when the
5627 * delalloc extent is larger than the largest freespace
5628 * extent at conversion time. These extents cannot be
5629 * converted by data writeback, so can exist here even
5630 * if we are not supposed to be finding delalloc
5631 * extents.
5632 */
5633 if (map[i].br_startblock == DELAYSTARTBLOCK &&
5634 map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
5635 ASSERT((iflags & BMV_IF_DELALLOC) != 0);
5636
5626 if (map[i].br_startblock == HOLESTARTBLOCK && 5637 if (map[i].br_startblock == HOLESTARTBLOCK &&
5627 whichfork == XFS_ATTR_FORK) { 5638 whichfork == XFS_ATTR_FORK) {
5628 /* came to the end of attribute fork */ 5639 /* came to the end of attribute fork */
@@ -6157,3 +6168,16 @@ next_block:
6157 6168
6158 return error; 6169 return error;
6159} 6170}
6171
6172/*
6173 * Convert the given file system block to a disk block. We have to treat it
6174 * differently based on whether the file is a real time file or not, because the
6175 * bmap code does.
6176 */
6177xfs_daddr_t
6178xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
6179{
6180 return (XFS_IS_REALTIME_INODE(ip) ? \
6181 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
6182 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
6183}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 89ee672d378a..803b56d7ce16 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -211,6 +211,9 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
211 int whichfork, int *count); 211 int whichfork, int *count);
212int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, 212int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
213 xfs_fileoff_t start_fsb, xfs_fileoff_t length); 213 xfs_fileoff_t start_fsb, xfs_fileoff_t length);
214
215xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
216
214#endif /* __KERNEL__ */ 217#endif /* __KERNEL__ */
215 218
216#endif /* __XFS_BMAP_H__ */ 219#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index e2f5d59cbeaf..862084a47a7e 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -20,7 +20,6 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 1f19f03af9d3..e53e317b1582 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -20,7 +20,6 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 6819b5163e33..172d3cc8f8cb 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -35,14 +35,12 @@
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36 36
37#include "xfs_sb.h" 37#include "xfs_sb.h"
38#include "xfs_inum.h"
39#include "xfs_log.h" 38#include "xfs_log.h"
40#include "xfs_ag.h" 39#include "xfs_ag.h"
41#include "xfs_mount.h" 40#include "xfs_mount.h"
42#include "xfs_trace.h" 41#include "xfs_trace.h"
43 42
44static kmem_zone_t *xfs_buf_zone; 43static kmem_zone_t *xfs_buf_zone;
45STATIC int xfsbufd(void *);
46 44
47static struct workqueue_struct *xfslogd_workqueue; 45static struct workqueue_struct *xfslogd_workqueue;
48 46
@@ -57,11 +55,7 @@ static struct workqueue_struct *xfslogd_workqueue;
57#endif 55#endif
58 56
59#define xb_to_gfp(flags) \ 57#define xb_to_gfp(flags) \
60 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ 58 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
61 ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
62
63#define xb_to_km(flags) \
64 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
65 59
66 60
67static inline int 61static inline int
@@ -71,11 +65,11 @@ xfs_buf_is_vmapped(
71 /* 65 /*
72 * Return true if the buffer is vmapped. 66 * Return true if the buffer is vmapped.
73 * 67 *
74 * The XBF_MAPPED flag is set if the buffer should be mapped, but the 68 * b_addr is null if the buffer is not mapped, but the code is clever
75 * code is clever enough to know it doesn't have to map a single page, 69 * enough to know it doesn't have to map a single page, so the check has
76 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1. 70 * to be both for b_addr and bp->b_page_count > 1.
77 */ 71 */
78 return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1; 72 return bp->b_addr && bp->b_page_count > 1;
79} 73}
80 74
81static inline int 75static inline int
@@ -144,8 +138,17 @@ void
144xfs_buf_stale( 138xfs_buf_stale(
145 struct xfs_buf *bp) 139 struct xfs_buf *bp)
146{ 140{
141 ASSERT(xfs_buf_islocked(bp));
142
147 bp->b_flags |= XBF_STALE; 143 bp->b_flags |= XBF_STALE;
148 xfs_buf_delwri_dequeue(bp); 144
145 /*
146 * Clear the delwri status so that a delwri queue walker will not
147 * flush this buffer to disk now that it is stale. The delwri queue has
148 * a reference to the buffer, so this is safe to do.
149 */
150 bp->b_flags &= ~_XBF_DELWRI_Q;
151
149 atomic_set(&(bp)->b_lru_ref, 0); 152 atomic_set(&(bp)->b_lru_ref, 0);
150 if (!list_empty(&bp->b_lru)) { 153 if (!list_empty(&bp->b_lru)) {
151 struct xfs_buftarg *btp = bp->b_target; 154 struct xfs_buftarg *btp = bp->b_target;
@@ -164,22 +167,22 @@ xfs_buf_stale(
164struct xfs_buf * 167struct xfs_buf *
165xfs_buf_alloc( 168xfs_buf_alloc(
166 struct xfs_buftarg *target, 169 struct xfs_buftarg *target,
167 xfs_off_t range_base, 170 xfs_daddr_t blkno,
168 size_t range_length, 171 size_t numblks,
169 xfs_buf_flags_t flags) 172 xfs_buf_flags_t flags)
170{ 173{
171 struct xfs_buf *bp; 174 struct xfs_buf *bp;
172 175
173 bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); 176 bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
174 if (unlikely(!bp)) 177 if (unlikely(!bp))
175 return NULL; 178 return NULL;
176 179
177 /* 180 /*
178 * We don't want certain flags to appear in b_flags. 181 * We don't want certain flags to appear in b_flags unless they are
182 * specifically set by later operations on the buffer.
179 */ 183 */
180 flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); 184 flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
181 185
182 memset(bp, 0, sizeof(xfs_buf_t));
183 atomic_set(&bp->b_hold, 1); 186 atomic_set(&bp->b_hold, 1);
184 atomic_set(&bp->b_lru_ref, 1); 187 atomic_set(&bp->b_lru_ref, 1);
185 init_completion(&bp->b_iowait); 188 init_completion(&bp->b_iowait);
@@ -189,14 +192,22 @@ xfs_buf_alloc(
189 sema_init(&bp->b_sema, 0); /* held, no waiters */ 192 sema_init(&bp->b_sema, 0); /* held, no waiters */
190 XB_SET_OWNER(bp); 193 XB_SET_OWNER(bp);
191 bp->b_target = target; 194 bp->b_target = target;
192 bp->b_file_offset = range_base; 195
193 /* 196 /*
194 * Set buffer_length and count_desired to the same value initially. 197 * Set length and io_length to the same value initially.
195 * I/O routines should use count_desired, which will be the same in 198 * I/O routines should use io_length, which will be the same in
196 * most cases but may be reset (e.g. XFS recovery). 199 * most cases but may be reset (e.g. XFS recovery).
197 */ 200 */
198 bp->b_buffer_length = bp->b_count_desired = range_length; 201 bp->b_length = numblks;
202 bp->b_io_length = numblks;
199 bp->b_flags = flags; 203 bp->b_flags = flags;
204
205 /*
206 * We do not set the block number here in the buffer because we have not
207 * finished initialising the buffer. We insert the buffer into the cache
208 * in this state, so this ensures that we are unable to do IO on a
209 * buffer that hasn't been fully initialised.
210 */
200 bp->b_bn = XFS_BUF_DADDR_NULL; 211 bp->b_bn = XFS_BUF_DADDR_NULL;
201 atomic_set(&bp->b_pin_count, 0); 212 atomic_set(&bp->b_pin_count, 0);
202 init_waitqueue_head(&bp->b_waiters); 213 init_waitqueue_head(&bp->b_waiters);
@@ -219,13 +230,12 @@ _xfs_buf_get_pages(
219{ 230{
220 /* Make sure that we have a page list */ 231 /* Make sure that we have a page list */
221 if (bp->b_pages == NULL) { 232 if (bp->b_pages == NULL) {
222 bp->b_offset = xfs_buf_poff(bp->b_file_offset);
223 bp->b_page_count = page_count; 233 bp->b_page_count = page_count;
224 if (page_count <= XB_PAGES) { 234 if (page_count <= XB_PAGES) {
225 bp->b_pages = bp->b_page_array; 235 bp->b_pages = bp->b_page_array;
226 } else { 236 } else {
227 bp->b_pages = kmem_alloc(sizeof(struct page *) * 237 bp->b_pages = kmem_alloc(sizeof(struct page *) *
228 page_count, xb_to_km(flags)); 238 page_count, KM_NOFS);
229 if (bp->b_pages == NULL) 239 if (bp->b_pages == NULL)
230 return -ENOMEM; 240 return -ENOMEM;
231 } 241 }
@@ -288,11 +298,11 @@ xfs_buf_allocate_memory(
288 xfs_buf_t *bp, 298 xfs_buf_t *bp,
289 uint flags) 299 uint flags)
290{ 300{
291 size_t size = bp->b_count_desired; 301 size_t size;
292 size_t nbytes, offset; 302 size_t nbytes, offset;
293 gfp_t gfp_mask = xb_to_gfp(flags); 303 gfp_t gfp_mask = xb_to_gfp(flags);
294 unsigned short page_count, i; 304 unsigned short page_count, i;
295 xfs_off_t end; 305 xfs_off_t start, end;
296 int error; 306 int error;
297 307
298 /* 308 /*
@@ -300,15 +310,15 @@ xfs_buf_allocate_memory(
300 * the memory from the heap - there's no need for the complexity of 310 * the memory from the heap - there's no need for the complexity of
301 * page arrays to keep allocation down to order 0. 311 * page arrays to keep allocation down to order 0.
302 */ 312 */
303 if (bp->b_buffer_length < PAGE_SIZE) { 313 size = BBTOB(bp->b_length);
304 bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); 314 if (size < PAGE_SIZE) {
315 bp->b_addr = kmem_alloc(size, KM_NOFS);
305 if (!bp->b_addr) { 316 if (!bp->b_addr) {
306 /* low memory - use alloc_page loop instead */ 317 /* low memory - use alloc_page loop instead */
307 goto use_alloc_page; 318 goto use_alloc_page;
308 } 319 }
309 320
310 if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & 321 if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
311 PAGE_MASK) !=
312 ((unsigned long)bp->b_addr & PAGE_MASK)) { 322 ((unsigned long)bp->b_addr & PAGE_MASK)) {
313 /* b_addr spans two pages - use alloc_page instead */ 323 /* b_addr spans two pages - use alloc_page instead */
314 kmem_free(bp->b_addr); 324 kmem_free(bp->b_addr);
@@ -319,13 +329,14 @@ xfs_buf_allocate_memory(
319 bp->b_pages = bp->b_page_array; 329 bp->b_pages = bp->b_page_array;
320 bp->b_pages[0] = virt_to_page(bp->b_addr); 330 bp->b_pages[0] = virt_to_page(bp->b_addr);
321 bp->b_page_count = 1; 331 bp->b_page_count = 1;
322 bp->b_flags |= XBF_MAPPED | _XBF_KMEM; 332 bp->b_flags |= _XBF_KMEM;
323 return 0; 333 return 0;
324 } 334 }
325 335
326use_alloc_page: 336use_alloc_page:
327 end = bp->b_file_offset + bp->b_buffer_length; 337 start = BBTOB(bp->b_bn) >> PAGE_SHIFT;
328 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 338 end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT;
339 page_count = end - start;
329 error = _xfs_buf_get_pages(bp, page_count, flags); 340 error = _xfs_buf_get_pages(bp, page_count, flags);
330 if (unlikely(error)) 341 if (unlikely(error))
331 return error; 342 return error;
@@ -388,8 +399,9 @@ _xfs_buf_map_pages(
388 if (bp->b_page_count == 1) { 399 if (bp->b_page_count == 1) {
389 /* A single page buffer is always mappable */ 400 /* A single page buffer is always mappable */
390 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 401 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
391 bp->b_flags |= XBF_MAPPED; 402 } else if (flags & XBF_UNMAPPED) {
392 } else if (flags & XBF_MAPPED) { 403 bp->b_addr = NULL;
404 } else {
393 int retried = 0; 405 int retried = 0;
394 406
395 do { 407 do {
@@ -403,7 +415,6 @@ _xfs_buf_map_pages(
403 if (!bp->b_addr) 415 if (!bp->b_addr)
404 return -ENOMEM; 416 return -ENOMEM;
405 bp->b_addr += bp->b_offset; 417 bp->b_addr += bp->b_offset;
406 bp->b_flags |= XBF_MAPPED;
407 } 418 }
408 419
409 return 0; 420 return 0;
@@ -420,29 +431,27 @@ _xfs_buf_map_pages(
420 */ 431 */
421xfs_buf_t * 432xfs_buf_t *
422_xfs_buf_find( 433_xfs_buf_find(
423 xfs_buftarg_t *btp, /* block device target */ 434 struct xfs_buftarg *btp,
424 xfs_off_t ioff, /* starting offset of range */ 435 xfs_daddr_t blkno,
425 size_t isize, /* length of range */ 436 size_t numblks,
426 xfs_buf_flags_t flags, 437 xfs_buf_flags_t flags,
427 xfs_buf_t *new_bp) 438 xfs_buf_t *new_bp)
428{ 439{
429 xfs_off_t range_base; 440 size_t numbytes;
430 size_t range_length;
431 struct xfs_perag *pag; 441 struct xfs_perag *pag;
432 struct rb_node **rbp; 442 struct rb_node **rbp;
433 struct rb_node *parent; 443 struct rb_node *parent;
434 xfs_buf_t *bp; 444 xfs_buf_t *bp;
435 445
436 range_base = (ioff << BBSHIFT); 446 numbytes = BBTOB(numblks);
437 range_length = (isize << BBSHIFT);
438 447
439 /* Check for IOs smaller than the sector size / not sector aligned */ 448 /* Check for IOs smaller than the sector size / not sector aligned */
440 ASSERT(!(range_length < (1 << btp->bt_sshift))); 449 ASSERT(!(numbytes < (1 << btp->bt_sshift)));
441 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); 450 ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
442 451
443 /* get tree root */ 452 /* get tree root */
444 pag = xfs_perag_get(btp->bt_mount, 453 pag = xfs_perag_get(btp->bt_mount,
445 xfs_daddr_to_agno(btp->bt_mount, ioff)); 454 xfs_daddr_to_agno(btp->bt_mount, blkno));
446 455
447 /* walk tree */ 456 /* walk tree */
448 spin_lock(&pag->pag_buf_lock); 457 spin_lock(&pag->pag_buf_lock);
@@ -453,20 +462,20 @@ _xfs_buf_find(
453 parent = *rbp; 462 parent = *rbp;
454 bp = rb_entry(parent, struct xfs_buf, b_rbnode); 463 bp = rb_entry(parent, struct xfs_buf, b_rbnode);
455 464
456 if (range_base < bp->b_file_offset) 465 if (blkno < bp->b_bn)
457 rbp = &(*rbp)->rb_left; 466 rbp = &(*rbp)->rb_left;
458 else if (range_base > bp->b_file_offset) 467 else if (blkno > bp->b_bn)
459 rbp = &(*rbp)->rb_right; 468 rbp = &(*rbp)->rb_right;
460 else { 469 else {
461 /* 470 /*
462 * found a block offset match. If the range doesn't 471 * found a block number match. If the range doesn't
463 * match, the only way this is allowed is if the buffer 472 * match, the only way this is allowed is if the buffer
464 * in the cache is stale and the transaction that made 473 * in the cache is stale and the transaction that made
465 * it stale has not yet committed. i.e. we are 474 * it stale has not yet committed. i.e. we are
466 * reallocating a busy extent. Skip this buffer and 475 * reallocating a busy extent. Skip this buffer and
467 * continue searching to the right for an exact match. 476 * continue searching to the right for an exact match.
468 */ 477 */
469 if (bp->b_buffer_length != range_length) { 478 if (bp->b_length != numblks) {
470 ASSERT(bp->b_flags & XBF_STALE); 479 ASSERT(bp->b_flags & XBF_STALE);
471 rbp = &(*rbp)->rb_right; 480 rbp = &(*rbp)->rb_right;
472 continue; 481 continue;
@@ -511,7 +520,7 @@ found:
511 */ 520 */
512 if (bp->b_flags & XBF_STALE) { 521 if (bp->b_flags & XBF_STALE) {
513 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 522 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
514 bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; 523 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
515 } 524 }
516 525
517 trace_xfs_buf_find(bp, flags, _RET_IP_); 526 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -526,63 +535,59 @@ found:
526 */ 535 */
527struct xfs_buf * 536struct xfs_buf *
528xfs_buf_get( 537xfs_buf_get(
529 xfs_buftarg_t *target,/* target for buffer */ 538 xfs_buftarg_t *target,
530 xfs_off_t ioff, /* starting offset of range */ 539 xfs_daddr_t blkno,
531 size_t isize, /* length of range */ 540 size_t numblks,
532 xfs_buf_flags_t flags) 541 xfs_buf_flags_t flags)
533{ 542{
534 struct xfs_buf *bp; 543 struct xfs_buf *bp;
535 struct xfs_buf *new_bp; 544 struct xfs_buf *new_bp;
536 int error = 0; 545 int error = 0;
537 546
538 bp = _xfs_buf_find(target, ioff, isize, flags, NULL); 547 bp = _xfs_buf_find(target, blkno, numblks, flags, NULL);
539 if (likely(bp)) 548 if (likely(bp))
540 goto found; 549 goto found;
541 550
542 new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, 551 new_bp = xfs_buf_alloc(target, blkno, numblks, flags);
543 flags);
544 if (unlikely(!new_bp)) 552 if (unlikely(!new_bp))
545 return NULL; 553 return NULL;
546 554
547 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 555 error = xfs_buf_allocate_memory(new_bp, flags);
548 if (!bp) { 556 if (error) {
549 kmem_zone_free(xfs_buf_zone, new_bp); 557 kmem_zone_free(xfs_buf_zone, new_bp);
550 return NULL; 558 return NULL;
551 } 559 }
552 560
553 if (bp == new_bp) { 561 bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp);
554 error = xfs_buf_allocate_memory(bp, flags); 562 if (!bp) {
555 if (error) 563 xfs_buf_free(new_bp);
556 goto no_buffer; 564 return NULL;
557 } else 565 }
558 kmem_zone_free(xfs_buf_zone, new_bp); 566
567 if (bp != new_bp)
568 xfs_buf_free(new_bp);
559 569
560 /* 570 /*
561 * Now we have a workable buffer, fill in the block number so 571 * Now we have a workable buffer, fill in the block number so
562 * that we can do IO on it. 572 * that we can do IO on it.
563 */ 573 */
564 bp->b_bn = ioff; 574 bp->b_bn = blkno;
565 bp->b_count_desired = bp->b_buffer_length; 575 bp->b_io_length = bp->b_length;
566 576
567found: 577found:
568 if (!(bp->b_flags & XBF_MAPPED)) { 578 if (!bp->b_addr) {
569 error = _xfs_buf_map_pages(bp, flags); 579 error = _xfs_buf_map_pages(bp, flags);
570 if (unlikely(error)) { 580 if (unlikely(error)) {
571 xfs_warn(target->bt_mount, 581 xfs_warn(target->bt_mount,
572 "%s: failed to map pages\n", __func__); 582 "%s: failed to map pages\n", __func__);
573 goto no_buffer; 583 xfs_buf_relse(bp);
584 return NULL;
574 } 585 }
575 } 586 }
576 587
577 XFS_STATS_INC(xb_get); 588 XFS_STATS_INC(xb_get);
578 trace_xfs_buf_get(bp, flags, _RET_IP_); 589 trace_xfs_buf_get(bp, flags, _RET_IP_);
579 return bp; 590 return bp;
580
581no_buffer:
582 if (flags & (XBF_LOCK | XBF_TRYLOCK))
583 xfs_buf_unlock(bp);
584 xfs_buf_rele(bp);
585 return NULL;
586} 591}
587 592
588STATIC int 593STATIC int
@@ -590,32 +595,30 @@ _xfs_buf_read(
590 xfs_buf_t *bp, 595 xfs_buf_t *bp,
591 xfs_buf_flags_t flags) 596 xfs_buf_flags_t flags)
592{ 597{
593 int status; 598 ASSERT(!(flags & XBF_WRITE));
594
595 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
596 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 599 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
597 600
598 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD); 601 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
599 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 602 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
600 603
601 status = xfs_buf_iorequest(bp); 604 xfs_buf_iorequest(bp);
602 if (status || bp->b_error || (flags & XBF_ASYNC)) 605 if (flags & XBF_ASYNC)
603 return status; 606 return 0;
604 return xfs_buf_iowait(bp); 607 return xfs_buf_iowait(bp);
605} 608}
606 609
607xfs_buf_t * 610xfs_buf_t *
608xfs_buf_read( 611xfs_buf_read(
609 xfs_buftarg_t *target, 612 xfs_buftarg_t *target,
610 xfs_off_t ioff, 613 xfs_daddr_t blkno,
611 size_t isize, 614 size_t numblks,
612 xfs_buf_flags_t flags) 615 xfs_buf_flags_t flags)
613{ 616{
614 xfs_buf_t *bp; 617 xfs_buf_t *bp;
615 618
616 flags |= XBF_READ; 619 flags |= XBF_READ;
617 620
618 bp = xfs_buf_get(target, ioff, isize, flags); 621 bp = xfs_buf_get(target, blkno, numblks, flags);
619 if (bp) { 622 if (bp) {
620 trace_xfs_buf_read(bp, flags, _RET_IP_); 623 trace_xfs_buf_read(bp, flags, _RET_IP_);
621 624
@@ -627,7 +630,8 @@ xfs_buf_read(
627 * Read ahead call which is already satisfied, 630 * Read ahead call which is already satisfied,
628 * drop the buffer 631 * drop the buffer
629 */ 632 */
630 goto no_buffer; 633 xfs_buf_relse(bp);
634 return NULL;
631 } else { 635 } else {
632 /* We do not want read in the flags */ 636 /* We do not want read in the flags */
633 bp->b_flags &= ~XBF_READ; 637 bp->b_flags &= ~XBF_READ;
@@ -635,12 +639,6 @@ xfs_buf_read(
635 } 639 }
636 640
637 return bp; 641 return bp;
638
639 no_buffer:
640 if (flags & (XBF_LOCK | XBF_TRYLOCK))
641 xfs_buf_unlock(bp);
642 xfs_buf_rele(bp);
643 return NULL;
644} 642}
645 643
646/* 644/*
@@ -650,14 +648,14 @@ xfs_buf_read(
650void 648void
651xfs_buf_readahead( 649xfs_buf_readahead(
652 xfs_buftarg_t *target, 650 xfs_buftarg_t *target,
653 xfs_off_t ioff, 651 xfs_daddr_t blkno,
654 size_t isize) 652 size_t numblks)
655{ 653{
656 if (bdi_read_congested(target->bt_bdi)) 654 if (bdi_read_congested(target->bt_bdi))
657 return; 655 return;
658 656
659 xfs_buf_read(target, ioff, isize, 657 xfs_buf_read(target, blkno, numblks,
660 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); 658 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
661} 659}
662 660
663/* 661/*
@@ -666,16 +664,15 @@ xfs_buf_readahead(
666 */ 664 */
667struct xfs_buf * 665struct xfs_buf *
668xfs_buf_read_uncached( 666xfs_buf_read_uncached(
669 struct xfs_mount *mp,
670 struct xfs_buftarg *target, 667 struct xfs_buftarg *target,
671 xfs_daddr_t daddr, 668 xfs_daddr_t daddr,
672 size_t length, 669 size_t numblks,
673 int flags) 670 int flags)
674{ 671{
675 xfs_buf_t *bp; 672 xfs_buf_t *bp;
676 int error; 673 int error;
677 674
678 bp = xfs_buf_get_uncached(target, length, flags); 675 bp = xfs_buf_get_uncached(target, numblks, flags);
679 if (!bp) 676 if (!bp)
680 return NULL; 677 return NULL;
681 678
@@ -683,9 +680,9 @@ xfs_buf_read_uncached(
683 XFS_BUF_SET_ADDR(bp, daddr); 680 XFS_BUF_SET_ADDR(bp, daddr);
684 XFS_BUF_READ(bp); 681 XFS_BUF_READ(bp);
685 682
686 xfsbdstrat(mp, bp); 683 xfsbdstrat(target->bt_mount, bp);
687 error = xfs_buf_iowait(bp); 684 error = xfs_buf_iowait(bp);
688 if (error || bp->b_error) { 685 if (error) {
689 xfs_buf_relse(bp); 686 xfs_buf_relse(bp);
690 return NULL; 687 return NULL;
691 } 688 }
@@ -699,7 +696,7 @@ xfs_buf_read_uncached(
699void 696void
700xfs_buf_set_empty( 697xfs_buf_set_empty(
701 struct xfs_buf *bp, 698 struct xfs_buf *bp,
702 size_t len) 699 size_t numblks)
703{ 700{
704 if (bp->b_pages) 701 if (bp->b_pages)
705 _xfs_buf_free_pages(bp); 702 _xfs_buf_free_pages(bp);
@@ -707,10 +704,9 @@ xfs_buf_set_empty(
707 bp->b_pages = NULL; 704 bp->b_pages = NULL;
708 bp->b_page_count = 0; 705 bp->b_page_count = 0;
709 bp->b_addr = NULL; 706 bp->b_addr = NULL;
710 bp->b_file_offset = 0; 707 bp->b_length = numblks;
711 bp->b_buffer_length = bp->b_count_desired = len; 708 bp->b_io_length = numblks;
712 bp->b_bn = XFS_BUF_DADDR_NULL; 709 bp->b_bn = XFS_BUF_DADDR_NULL;
713 bp->b_flags &= ~XBF_MAPPED;
714} 710}
715 711
716static inline struct page * 712static inline struct page *
@@ -749,7 +745,7 @@ xfs_buf_associate_memory(
749 bp->b_pages = NULL; 745 bp->b_pages = NULL;
750 bp->b_addr = mem; 746 bp->b_addr = mem;
751 747
752 rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); 748 rval = _xfs_buf_get_pages(bp, page_count, 0);
753 if (rval) 749 if (rval)
754 return rval; 750 return rval;
755 751
@@ -760,9 +756,8 @@ xfs_buf_associate_memory(
760 pageaddr += PAGE_SIZE; 756 pageaddr += PAGE_SIZE;
761 } 757 }
762 758
763 bp->b_count_desired = len; 759 bp->b_io_length = BTOBB(len);
764 bp->b_buffer_length = buflen; 760 bp->b_length = BTOBB(buflen);
765 bp->b_flags |= XBF_MAPPED;
766 761
767 return 0; 762 return 0;
768} 763}
@@ -770,17 +765,18 @@ xfs_buf_associate_memory(
770xfs_buf_t * 765xfs_buf_t *
771xfs_buf_get_uncached( 766xfs_buf_get_uncached(
772 struct xfs_buftarg *target, 767 struct xfs_buftarg *target,
773 size_t len, 768 size_t numblks,
774 int flags) 769 int flags)
775{ 770{
776 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 771 unsigned long page_count;
777 int error, i; 772 int error, i;
778 xfs_buf_t *bp; 773 xfs_buf_t *bp;
779 774
780 bp = xfs_buf_alloc(target, 0, len, 0); 775 bp = xfs_buf_alloc(target, 0, numblks, 0);
781 if (unlikely(bp == NULL)) 776 if (unlikely(bp == NULL))
782 goto fail; 777 goto fail;
783 778
779 page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
784 error = _xfs_buf_get_pages(bp, page_count, 0); 780 error = _xfs_buf_get_pages(bp, page_count, 0);
785 if (error) 781 if (error)
786 goto fail_free_buf; 782 goto fail_free_buf;
@@ -792,7 +788,7 @@ xfs_buf_get_uncached(
792 } 788 }
793 bp->b_flags |= _XBF_PAGES; 789 bp->b_flags |= _XBF_PAGES;
794 790
795 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 791 error = _xfs_buf_map_pages(bp, 0);
796 if (unlikely(error)) { 792 if (unlikely(error)) {
797 xfs_warn(target->bt_mount, 793 xfs_warn(target->bt_mount,
798 "%s: failed to map pages\n", __func__); 794 "%s: failed to map pages\n", __func__);
@@ -855,7 +851,7 @@ xfs_buf_rele(
855 spin_unlock(&pag->pag_buf_lock); 851 spin_unlock(&pag->pag_buf_lock);
856 } else { 852 } else {
857 xfs_buf_lru_del(bp); 853 xfs_buf_lru_del(bp);
858 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 854 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
859 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 855 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
860 spin_unlock(&pag->pag_buf_lock); 856 spin_unlock(&pag->pag_buf_lock);
861 xfs_perag_put(pag); 857 xfs_perag_put(pag);
@@ -915,13 +911,6 @@ xfs_buf_lock(
915 trace_xfs_buf_lock_done(bp, _RET_IP_); 911 trace_xfs_buf_lock_done(bp, _RET_IP_);
916} 912}
917 913
918/*
919 * Releases the lock on the buffer object.
920 * If the buffer is marked delwri but is not queued, do so before we
921 * unlock the buffer as we need to set flags correctly. We also need to
922 * take a reference for the delwri queue because the unlocker is going to
923 * drop theirs and they don't know we just queued it.
924 */
925void 914void
926xfs_buf_unlock( 915xfs_buf_unlock(
927 struct xfs_buf *bp) 916 struct xfs_buf *bp)
@@ -1008,9 +997,8 @@ xfs_buf_ioerror_alert(
1008 const char *func) 997 const char *func)
1009{ 998{
1010 xfs_alert(bp->b_target->bt_mount, 999 xfs_alert(bp->b_target->bt_mount,
1011"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", 1000"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
1012 (__uint64_t)XFS_BUF_ADDR(bp), func, 1001 (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
1013 bp->b_error, XFS_BUF_COUNT(bp));
1014} 1002}
1015 1003
1016int 1004int
@@ -1019,10 +1007,11 @@ xfs_bwrite(
1019{ 1007{
1020 int error; 1008 int error;
1021 1009
1010 ASSERT(xfs_buf_islocked(bp));
1011
1022 bp->b_flags |= XBF_WRITE; 1012 bp->b_flags |= XBF_WRITE;
1023 bp->b_flags &= ~(XBF_ASYNC | XBF_READ); 1013 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
1024 1014
1025 xfs_buf_delwri_dequeue(bp);
1026 xfs_bdstrat_cb(bp); 1015 xfs_bdstrat_cb(bp);
1027 1016
1028 error = xfs_buf_iowait(bp); 1017 error = xfs_buf_iowait(bp);
@@ -1181,7 +1170,7 @@ _xfs_buf_ioapply(
1181 int rw, map_i, total_nr_pages, nr_pages; 1170 int rw, map_i, total_nr_pages, nr_pages;
1182 struct bio *bio; 1171 struct bio *bio;
1183 int offset = bp->b_offset; 1172 int offset = bp->b_offset;
1184 int size = bp->b_count_desired; 1173 int size = BBTOB(bp->b_io_length);
1185 sector_t sector = bp->b_bn; 1174 sector_t sector = bp->b_bn;
1186 1175
1187 total_nr_pages = bp->b_page_count; 1176 total_nr_pages = bp->b_page_count;
@@ -1229,7 +1218,7 @@ next_chunk:
1229 break; 1218 break;
1230 1219
1231 offset = 0; 1220 offset = 0;
1232 sector += nbytes >> BBSHIFT; 1221 sector += BTOBB(nbytes);
1233 size -= nbytes; 1222 size -= nbytes;
1234 total_nr_pages--; 1223 total_nr_pages--;
1235 } 1224 }
@@ -1248,13 +1237,13 @@ next_chunk:
1248 } 1237 }
1249} 1238}
1250 1239
1251int 1240void
1252xfs_buf_iorequest( 1241xfs_buf_iorequest(
1253 xfs_buf_t *bp) 1242 xfs_buf_t *bp)
1254{ 1243{
1255 trace_xfs_buf_iorequest(bp, _RET_IP_); 1244 trace_xfs_buf_iorequest(bp, _RET_IP_);
1256 1245
1257 ASSERT(!(bp->b_flags & XBF_DELWRI)); 1246 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1258 1247
1259 if (bp->b_flags & XBF_WRITE) 1248 if (bp->b_flags & XBF_WRITE)
1260 xfs_buf_wait_unpin(bp); 1249 xfs_buf_wait_unpin(bp);
@@ -1269,13 +1258,12 @@ xfs_buf_iorequest(
1269 _xfs_buf_ioend(bp, 0); 1258 _xfs_buf_ioend(bp, 0);
1270 1259
1271 xfs_buf_rele(bp); 1260 xfs_buf_rele(bp);
1272 return 0;
1273} 1261}
1274 1262
1275/* 1263/*
1276 * Waits for I/O to complete on the buffer supplied. 1264 * Waits for I/O to complete on the buffer supplied. It returns immediately if
1277 * It returns immediately if no I/O is pending. 1265 * no I/O is pending or there is already a pending error on the buffer. It
1278 * It returns the I/O error code, if any, or 0 if there was no error. 1266 * returns the I/O error code, if any, or 0 if there was no error.
1279 */ 1267 */
1280int 1268int
1281xfs_buf_iowait( 1269xfs_buf_iowait(
@@ -1283,7 +1271,8 @@ xfs_buf_iowait(
1283{ 1271{
1284 trace_xfs_buf_iowait(bp, _RET_IP_); 1272 trace_xfs_buf_iowait(bp, _RET_IP_);
1285 1273
1286 wait_for_completion(&bp->b_iowait); 1274 if (!bp->b_error)
1275 wait_for_completion(&bp->b_iowait);
1287 1276
1288 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1277 trace_xfs_buf_iowait_done(bp, _RET_IP_);
1289 return bp->b_error; 1278 return bp->b_error;
@@ -1296,7 +1285,7 @@ xfs_buf_offset(
1296{ 1285{
1297 struct page *page; 1286 struct page *page;
1298 1287
1299 if (bp->b_flags & XBF_MAPPED) 1288 if (bp->b_addr)
1300 return bp->b_addr + offset; 1289 return bp->b_addr + offset;
1301 1290
1302 offset += bp->b_offset; 1291 offset += bp->b_offset;
@@ -1315,27 +1304,30 @@ xfs_buf_iomove(
1315 void *data, /* data address */ 1304 void *data, /* data address */
1316 xfs_buf_rw_t mode) /* read/write/zero flag */ 1305 xfs_buf_rw_t mode) /* read/write/zero flag */
1317{ 1306{
1318 size_t bend, cpoff, csize; 1307 size_t bend;
1319 struct page *page;
1320 1308
1321 bend = boff + bsize; 1309 bend = boff + bsize;
1322 while (boff < bend) { 1310 while (boff < bend) {
1323 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1311 struct page *page;
1324 cpoff = xfs_buf_poff(boff + bp->b_offset); 1312 int page_index, page_offset, csize;
1325 csize = min_t(size_t, 1313
1326 PAGE_SIZE-cpoff, bp->b_count_desired-boff); 1314 page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
1315 page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
1316 page = bp->b_pages[page_index];
1317 csize = min_t(size_t, PAGE_SIZE - page_offset,
1318 BBTOB(bp->b_io_length) - boff);
1327 1319
1328 ASSERT(((csize + cpoff) <= PAGE_SIZE)); 1320 ASSERT((csize + page_offset) <= PAGE_SIZE);
1329 1321
1330 switch (mode) { 1322 switch (mode) {
1331 case XBRW_ZERO: 1323 case XBRW_ZERO:
1332 memset(page_address(page) + cpoff, 0, csize); 1324 memset(page_address(page) + page_offset, 0, csize);
1333 break; 1325 break;
1334 case XBRW_READ: 1326 case XBRW_READ:
1335 memcpy(data, page_address(page) + cpoff, csize); 1327 memcpy(data, page_address(page) + page_offset, csize);
1336 break; 1328 break;
1337 case XBRW_WRITE: 1329 case XBRW_WRITE:
1338 memcpy(page_address(page) + cpoff, data, csize); 1330 memcpy(page_address(page) + page_offset, data, csize);
1339 } 1331 }
1340 1332
1341 boff += csize; 1333 boff += csize;
@@ -1435,11 +1427,9 @@ xfs_free_buftarg(
1435{ 1427{
1436 unregister_shrinker(&btp->bt_shrinker); 1428 unregister_shrinker(&btp->bt_shrinker);
1437 1429
1438 xfs_flush_buftarg(btp, 1);
1439 if (mp->m_flags & XFS_MOUNT_BARRIER) 1430 if (mp->m_flags & XFS_MOUNT_BARRIER)
1440 xfs_blkdev_issue_flush(btp); 1431 xfs_blkdev_issue_flush(btp);
1441 1432
1442 kthread_stop(btp->bt_task);
1443 kmem_free(btp); 1433 kmem_free(btp);
1444} 1434}
1445 1435
@@ -1491,20 +1481,6 @@ xfs_setsize_buftarg(
1491 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); 1481 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1492} 1482}
1493 1483
1494STATIC int
1495xfs_alloc_delwri_queue(
1496 xfs_buftarg_t *btp,
1497 const char *fsname)
1498{
1499 INIT_LIST_HEAD(&btp->bt_delwri_queue);
1500 spin_lock_init(&btp->bt_delwri_lock);
1501 btp->bt_flags = 0;
1502 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1503 if (IS_ERR(btp->bt_task))
1504 return PTR_ERR(btp->bt_task);
1505 return 0;
1506}
1507
1508xfs_buftarg_t * 1484xfs_buftarg_t *
1509xfs_alloc_buftarg( 1485xfs_alloc_buftarg(
1510 struct xfs_mount *mp, 1486 struct xfs_mount *mp,
@@ -1527,8 +1503,6 @@ xfs_alloc_buftarg(
1527 spin_lock_init(&btp->bt_lru_lock); 1503 spin_lock_init(&btp->bt_lru_lock);
1528 if (xfs_setsize_buftarg_early(btp, bdev)) 1504 if (xfs_setsize_buftarg_early(btp, bdev))
1529 goto error; 1505 goto error;
1530 if (xfs_alloc_delwri_queue(btp, fsname))
1531 goto error;
1532 btp->bt_shrinker.shrink = xfs_buftarg_shrink; 1506 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1533 btp->bt_shrinker.seeks = DEFAULT_SEEKS; 1507 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1534 register_shrinker(&btp->bt_shrinker); 1508 register_shrinker(&btp->bt_shrinker);
@@ -1539,125 +1513,52 @@ error:
1539 return NULL; 1513 return NULL;
1540} 1514}
1541 1515
1542
1543/* 1516/*
1544 * Delayed write buffer handling 1517 * Add a buffer to the delayed write list.
1518 *
1519 * This queues a buffer for writeout if it hasn't already been. Note that
1520 * neither this routine nor the buffer list submission functions perform
1521 * any internal synchronization. It is expected that the lists are thread-local
1522 * to the callers.
1523 *
1524 * Returns true if we queued up the buffer, or false if it already had
1525 * been on the buffer list.
1545 */ 1526 */
1546void 1527bool
1547xfs_buf_delwri_queue( 1528xfs_buf_delwri_queue(
1548 xfs_buf_t *bp) 1529 struct xfs_buf *bp,
1530 struct list_head *list)
1549{ 1531{
1550 struct xfs_buftarg *btp = bp->b_target; 1532 ASSERT(xfs_buf_islocked(bp));
1551
1552 trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1553
1554 ASSERT(!(bp->b_flags & XBF_READ)); 1533 ASSERT(!(bp->b_flags & XBF_READ));
1555 1534
1556 spin_lock(&btp->bt_delwri_lock); 1535 /*
1557 if (!list_empty(&bp->b_list)) { 1536 * If the buffer is already marked delwri it already is queued up
1558 /* if already in the queue, move it to the tail */ 1537 * by someone else for imediate writeout. Just ignore it in that
1559 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1538 * case.
1560 list_move_tail(&bp->b_list, &btp->bt_delwri_queue); 1539 */
1561 } else { 1540 if (bp->b_flags & _XBF_DELWRI_Q) {
1562 /* start xfsbufd as it is about to have something to do */ 1541 trace_xfs_buf_delwri_queued(bp, _RET_IP_);
1563 if (list_empty(&btp->bt_delwri_queue)) 1542 return false;
1564 wake_up_process(bp->b_target->bt_task);
1565
1566 atomic_inc(&bp->b_hold);
1567 bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
1568 list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
1569 }
1570 bp->b_queuetime = jiffies;
1571 spin_unlock(&btp->bt_delwri_lock);
1572}
1573
1574void
1575xfs_buf_delwri_dequeue(
1576 xfs_buf_t *bp)
1577{
1578 int dequeued = 0;
1579
1580 spin_lock(&bp->b_target->bt_delwri_lock);
1581 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
1582 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1583 list_del_init(&bp->b_list);
1584 dequeued = 1;
1585 } 1543 }
1586 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1587 spin_unlock(&bp->b_target->bt_delwri_lock);
1588
1589 if (dequeued)
1590 xfs_buf_rele(bp);
1591
1592 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1593}
1594
1595/*
1596 * If a delwri buffer needs to be pushed before it has aged out, then promote
1597 * it to the head of the delwri queue so that it will be flushed on the next
1598 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1599 * than the age currently needed to flush the buffer. Hence the next time the
1600 * xfsbufd sees it is guaranteed to be considered old enough to flush.
1601 */
1602void
1603xfs_buf_delwri_promote(
1604 struct xfs_buf *bp)
1605{
1606 struct xfs_buftarg *btp = bp->b_target;
1607 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1608 1544
1609 ASSERT(bp->b_flags & XBF_DELWRI); 1545 trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1610 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1611 1546
1612 /* 1547 /*
1613 * Check the buffer age before locking the delayed write queue as we 1548 * If a buffer gets written out synchronously or marked stale while it
1614 * don't need to promote buffers that are already past the flush age. 1549 * is on a delwri list we lazily remove it. To do this, the other party
1550 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
1551 * It remains referenced and on the list. In a rare corner case it
1552 * might get readded to a delwri list after the synchronous writeout, in
1553 * which case we need just need to re-add the flag here.
1615 */ 1554 */
1616 if (bp->b_queuetime < jiffies - age) 1555 bp->b_flags |= _XBF_DELWRI_Q;
1617 return; 1556 if (list_empty(&bp->b_list)) {
1618 bp->b_queuetime = jiffies - age; 1557 atomic_inc(&bp->b_hold);
1619 spin_lock(&btp->bt_delwri_lock); 1558 list_add_tail(&bp->b_list, list);
1620 list_move(&bp->b_list, &btp->bt_delwri_queue);
1621 spin_unlock(&btp->bt_delwri_lock);
1622}
1623
1624/*
1625 * Move as many buffers as specified to the supplied list
1626 * indicating if we skipped any buffers to prevent deadlocks.
1627 */
1628STATIC int
1629xfs_buf_delwri_split(
1630 xfs_buftarg_t *target,
1631 struct list_head *list,
1632 unsigned long age)
1633{
1634 xfs_buf_t *bp, *n;
1635 int skipped = 0;
1636 int force;
1637
1638 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1639 INIT_LIST_HEAD(list);
1640 spin_lock(&target->bt_delwri_lock);
1641 list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
1642 ASSERT(bp->b_flags & XBF_DELWRI);
1643
1644 if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
1645 if (!force &&
1646 time_before(jiffies, bp->b_queuetime + age)) {
1647 xfs_buf_unlock(bp);
1648 break;
1649 }
1650
1651 bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
1652 bp->b_flags |= XBF_WRITE;
1653 list_move_tail(&bp->b_list, list);
1654 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1655 } else
1656 skipped++;
1657 } 1559 }
1658 1560
1659 spin_unlock(&target->bt_delwri_lock); 1561 return true;
1660 return skipped;
1661} 1562}
1662 1563
1663/* 1564/*
@@ -1683,99 +1584,109 @@ xfs_buf_cmp(
1683 return 0; 1584 return 0;
1684} 1585}
1685 1586
1686STATIC int 1587static int
1687xfsbufd( 1588__xfs_buf_delwri_submit(
1688 void *data) 1589 struct list_head *buffer_list,
1590 struct list_head *io_list,
1591 bool wait)
1689{ 1592{
1690 xfs_buftarg_t *target = (xfs_buftarg_t *)data; 1593 struct blk_plug plug;
1691 1594 struct xfs_buf *bp, *n;
1692 current->flags |= PF_MEMALLOC; 1595 int pinned = 0;
1693 1596
1694 set_freezable(); 1597 list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1598 if (!wait) {
1599 if (xfs_buf_ispinned(bp)) {
1600 pinned++;
1601 continue;
1602 }
1603 if (!xfs_buf_trylock(bp))
1604 continue;
1605 } else {
1606 xfs_buf_lock(bp);
1607 }
1695 1608
1696 do { 1609 /*
1697 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); 1610 * Someone else might have written the buffer synchronously or
1698 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); 1611 * marked it stale in the meantime. In that case only the
1699 struct list_head tmp; 1612 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
1700 struct blk_plug plug; 1613 * reference and remove it from the list here.
1614 */
1615 if (!(bp->b_flags & _XBF_DELWRI_Q)) {
1616 list_del_init(&bp->b_list);
1617 xfs_buf_relse(bp);
1618 continue;
1619 }
1701 1620
1702 if (unlikely(freezing(current))) 1621 list_move_tail(&bp->b_list, io_list);
1703 try_to_freeze(); 1622 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1623 }
1704 1624
1705 /* sleep for a long time if there is nothing to do. */ 1625 list_sort(NULL, io_list, xfs_buf_cmp);
1706 if (list_empty(&target->bt_delwri_queue))
1707 tout = MAX_SCHEDULE_TIMEOUT;
1708 schedule_timeout_interruptible(tout);
1709 1626
1710 xfs_buf_delwri_split(target, &tmp, age); 1627 blk_start_plug(&plug);
1711 list_sort(NULL, &tmp, xfs_buf_cmp); 1628 list_for_each_entry_safe(bp, n, io_list, b_list) {
1629 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
1630 bp->b_flags |= XBF_WRITE;
1712 1631
1713 blk_start_plug(&plug); 1632 if (!wait) {
1714 while (!list_empty(&tmp)) { 1633 bp->b_flags |= XBF_ASYNC;
1715 struct xfs_buf *bp;
1716 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1717 list_del_init(&bp->b_list); 1634 list_del_init(&bp->b_list);
1718 xfs_bdstrat_cb(bp);
1719 } 1635 }
1720 blk_finish_plug(&plug); 1636 xfs_bdstrat_cb(bp);
1721 } while (!kthread_should_stop()); 1637 }
1638 blk_finish_plug(&plug);
1722 1639
1723 return 0; 1640 return pinned;
1724} 1641}
1725 1642
1726/* 1643/*
1727 * Go through all incore buffers, and release buffers if they belong to 1644 * Write out a buffer list asynchronously.
1728 * the given device. This is used in filesystem error handling to 1645 *
1729 * preserve the consistency of its metadata. 1646 * This will take the @buffer_list, write all non-locked and non-pinned buffers
1647 * out and not wait for I/O completion on any of the buffers. This interface
1648 * is only safely usable for callers that can track I/O completion by higher
1649 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
1650 * function.
1730 */ 1651 */
1731int 1652int
1732xfs_flush_buftarg( 1653xfs_buf_delwri_submit_nowait(
1733 xfs_buftarg_t *target, 1654 struct list_head *buffer_list)
1734 int wait)
1735{ 1655{
1736 xfs_buf_t *bp; 1656 LIST_HEAD (io_list);
1737 int pincount = 0; 1657 return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
1738 LIST_HEAD(tmp_list); 1658}
1739 LIST_HEAD(wait_list);
1740 struct blk_plug plug;
1741 1659
1742 flush_workqueue(xfslogd_workqueue); 1660/*
1661 * Write out a buffer list synchronously.
1662 *
1663 * This will take the @buffer_list, write all buffers out and wait for I/O
1664 * completion on all of the buffers. @buffer_list is consumed by the function,
1665 * so callers must have some other way of tracking buffers if they require such
1666 * functionality.
1667 */
1668int
1669xfs_buf_delwri_submit(
1670 struct list_head *buffer_list)
1671{
1672 LIST_HEAD (io_list);
1673 int error = 0, error2;
1674 struct xfs_buf *bp;
1743 1675
1744 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1676 __xfs_buf_delwri_submit(buffer_list, &io_list, true);
1745 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
1746 1677
1747 /* 1678 /* Wait for IO to complete. */
1748 * Dropped the delayed write list lock, now walk the temporary list. 1679 while (!list_empty(&io_list)) {
1749 * All I/O is issued async and then if we need to wait for completion 1680 bp = list_first_entry(&io_list, struct xfs_buf, b_list);
1750 * we do that after issuing all the IO.
1751 */
1752 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1753 1681
1754 blk_start_plug(&plug);
1755 while (!list_empty(&tmp_list)) {
1756 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1757 ASSERT(target == bp->b_target);
1758 list_del_init(&bp->b_list); 1682 list_del_init(&bp->b_list);
1759 if (wait) { 1683 error2 = xfs_buf_iowait(bp);
1760 bp->b_flags &= ~XBF_ASYNC; 1684 xfs_buf_relse(bp);
1761 list_add(&bp->b_list, &wait_list); 1685 if (!error)
1762 } 1686 error = error2;
1763 xfs_bdstrat_cb(bp);
1764 }
1765 blk_finish_plug(&plug);
1766
1767 if (wait) {
1768 /* Wait for IO to complete. */
1769 while (!list_empty(&wait_list)) {
1770 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1771
1772 list_del_init(&bp->b_list);
1773 xfs_buf_iowait(bp);
1774 xfs_buf_relse(bp);
1775 }
1776 } 1687 }
1777 1688
1778 return pincount; 1689 return error;
1779} 1690}
1780 1691
1781int __init 1692int __init
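The delayed-write rework above replaces the per-target xfsbufd queue with caller-owned lists: xfs_buf_delwri_queue() now adds a locked buffer to a list the caller supplies, and xfs_buf_delwri_submit()/xfs_buf_delwri_submit_nowait() consume that list. As a rough sketch only (the helper name and the source of the buffers are invented for illustration and are not part of this commit), a caller might drive the new interface like this:

	/*
	 * Sketch: queue a handful of buffers on a thread-local list and
	 * write them out synchronously.  example_flush() is an invented
	 * name; the xfs_buf_* calls follow the prototypes in this patch.
	 */
	static int example_flush(struct xfs_buf **bps, int nbuf)
	{
		LIST_HEAD(buffer_list);		/* caller-owned delwri list */
		int i;

		for (i = 0; i < nbuf; i++) {
			xfs_buf_lock(bps[i]);	/* queueing requires the lock */
			xfs_buf_delwri_queue(bps[i], &buffer_list);
			xfs_buf_unlock(bps[i]);	/* submit re-locks as needed */
		}

		/* writes every queued buffer and waits for I/O completion */
		return xfs_buf_delwri_submit(&buffer_list);
	}

xfs_buf_delwri_submit() consumes the list, so the caller does not walk it afterwards; the _nowait variant instead skips pinned or locked buffers and returns how many it skipped.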
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 5bf3be45f543..7f1d1392ce37 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -32,11 +32,6 @@
32 32
33#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) 33#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
34 34
35#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
36#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
37#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
38#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
39
40typedef enum { 35typedef enum {
41 XBRW_READ = 1, /* transfer into target memory */ 36 XBRW_READ = 1, /* transfer into target memory */
42 XBRW_WRITE = 2, /* transfer from target memory */ 37 XBRW_WRITE = 2, /* transfer from target memory */
@@ -46,11 +41,9 @@ typedef enum {
46#define XBF_READ (1 << 0) /* buffer intended for reading from device */ 41#define XBF_READ (1 << 0) /* buffer intended for reading from device */
47#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ 42#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
48#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ 43#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
49#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */
50#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ 44#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 45#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ 46#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54 47
55/* I/O hints for the BIO layer */ 48/* I/O hints for the BIO layer */
56#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ 49#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
@@ -58,14 +51,13 @@ typedef enum {
58#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ 51#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
59 52
60/* flags used only as arguments to access routines */ 53/* flags used only as arguments to access routines */
61#define XBF_LOCK (1 << 15)/* lock requested */
62#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ 54#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
63#define XBF_DONT_BLOCK (1 << 17)/* do not block in current thread */ 55#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
64 56
65/* flags used only internally */ 57/* flags used only internally */
66#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ 58#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
67#define _XBF_KMEM (1 << 21)/* backed by heap memory */ 59#define _XBF_KMEM (1 << 21)/* backed by heap memory */
68#define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */ 60#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
69 61
70typedef unsigned int xfs_buf_flags_t; 62typedef unsigned int xfs_buf_flags_t;
71 63
@@ -73,25 +65,18 @@ typedef unsigned int xfs_buf_flags_t;
73 { XBF_READ, "READ" }, \ 65 { XBF_READ, "READ" }, \
74 { XBF_WRITE, "WRITE" }, \ 66 { XBF_WRITE, "WRITE" }, \
75 { XBF_READ_AHEAD, "READ_AHEAD" }, \ 67 { XBF_READ_AHEAD, "READ_AHEAD" }, \
76 { XBF_MAPPED, "MAPPED" }, \
77 { XBF_ASYNC, "ASYNC" }, \ 68 { XBF_ASYNC, "ASYNC" }, \
78 { XBF_DONE, "DONE" }, \ 69 { XBF_DONE, "DONE" }, \
79 { XBF_DELWRI, "DELWRI" }, \
80 { XBF_STALE, "STALE" }, \ 70 { XBF_STALE, "STALE" }, \
81 { XBF_SYNCIO, "SYNCIO" }, \ 71 { XBF_SYNCIO, "SYNCIO" }, \
82 { XBF_FUA, "FUA" }, \ 72 { XBF_FUA, "FUA" }, \
83 { XBF_FLUSH, "FLUSH" }, \ 73 { XBF_FLUSH, "FLUSH" }, \
84 { XBF_LOCK, "LOCK" }, /* should never be set */\ 74 { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
85 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ 75 { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
86 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
87 { _XBF_PAGES, "PAGES" }, \ 76 { _XBF_PAGES, "PAGES" }, \
88 { _XBF_KMEM, "KMEM" }, \ 77 { _XBF_KMEM, "KMEM" }, \
89 { _XBF_DELWRI_Q, "DELWRI_Q" } 78 { _XBF_DELWRI_Q, "DELWRI_Q" }
90 79
91typedef enum {
92 XBT_FORCE_FLUSH = 0,
93} xfs_buftarg_flags_t;
94
95typedef struct xfs_buftarg { 80typedef struct xfs_buftarg {
96 dev_t bt_dev; 81 dev_t bt_dev;
97 struct block_device *bt_bdev; 82 struct block_device *bt_bdev;
@@ -101,12 +86,6 @@ typedef struct xfs_buftarg {
101 unsigned int bt_sshift; 86 unsigned int bt_sshift;
102 size_t bt_smask; 87 size_t bt_smask;
103 88
104 /* per device delwri queue */
105 struct task_struct *bt_task;
106 struct list_head bt_delwri_queue;
107 spinlock_t bt_delwri_lock;
108 unsigned long bt_flags;
109
110 /* LRU control structures */ 89 /* LRU control structures */
111 struct shrinker bt_shrinker; 90 struct shrinker bt_shrinker;
112 struct list_head bt_lru; 91 struct list_head bt_lru;
@@ -128,8 +107,8 @@ typedef struct xfs_buf {
128 * fast-path on locking. 107 * fast-path on locking.
129 */ 108 */
130 struct rb_node b_rbnode; /* rbtree node */ 109 struct rb_node b_rbnode; /* rbtree node */
131 xfs_off_t b_file_offset; /* offset in file */ 110 xfs_daddr_t b_bn; /* block number for I/O */
132 size_t b_buffer_length;/* size of buffer in bytes */ 111 int b_length; /* size of buffer in BBs */
133 atomic_t b_hold; /* reference count */ 112 atomic_t b_hold; /* reference count */
134 atomic_t b_lru_ref; /* lru reclaim ref count */ 113 atomic_t b_lru_ref; /* lru reclaim ref count */
135 xfs_buf_flags_t b_flags; /* status flags */ 114 xfs_buf_flags_t b_flags; /* status flags */
@@ -140,8 +119,6 @@ typedef struct xfs_buf {
140 struct list_head b_list; 119 struct list_head b_list;
141 struct xfs_perag *b_pag; /* contains rbtree root */ 120 struct xfs_perag *b_pag; /* contains rbtree root */
142 xfs_buftarg_t *b_target; /* buffer target (device) */ 121 xfs_buftarg_t *b_target; /* buffer target (device) */
143 xfs_daddr_t b_bn; /* block number for I/O */
144 size_t b_count_desired;/* desired transfer size */
145 void *b_addr; /* virtual address of buffer */ 122 void *b_addr; /* virtual address of buffer */
146 struct work_struct b_iodone_work; 123 struct work_struct b_iodone_work;
147 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 124 xfs_buf_iodone_t b_iodone; /* I/O completion function */
@@ -150,7 +127,7 @@ typedef struct xfs_buf {
150 struct xfs_trans *b_transp; 127 struct xfs_trans *b_transp;
151 struct page **b_pages; /* array of page pointers */ 128 struct page **b_pages; /* array of page pointers */
152 struct page *b_page_array[XB_PAGES]; /* inline pages */ 129 struct page *b_page_array[XB_PAGES]; /* inline pages */
153 unsigned long b_queuetime; /* time buffer was queued */ 130 int b_io_length; /* IO size in BBs */
154 atomic_t b_pin_count; /* pin count */ 131 atomic_t b_pin_count; /* pin count */
155 atomic_t b_io_remaining; /* #outstanding I/O requests */ 132 atomic_t b_io_remaining; /* #outstanding I/O requests */
156 unsigned int b_page_count; /* size of page array */ 133 unsigned int b_page_count; /* size of page array */
@@ -163,26 +140,30 @@ typedef struct xfs_buf {
163 140
164 141
165/* Finding and Reading Buffers */ 142/* Finding and Reading Buffers */
166extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, 143struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno,
167 xfs_buf_flags_t, xfs_buf_t *); 144 size_t numblks, xfs_buf_flags_t flags,
145 struct xfs_buf *new_bp);
168#define xfs_incore(buftarg,blkno,len,lockit) \ 146#define xfs_incore(buftarg,blkno,len,lockit) \
169 _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) 147 _xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
170 148
171extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, 149struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno,
172 xfs_buf_flags_t); 150 size_t numblks, xfs_buf_flags_t flags);
173extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, 151struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno,
174 xfs_buf_flags_t); 152 size_t numblks, xfs_buf_flags_t flags);
175 153void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno,
176struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, 154 size_t numblks);
177 xfs_buf_flags_t); 155
178extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); 156struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
179extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); 157struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno,
180extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 158 size_t numblks, xfs_buf_flags_t flags);
181extern void xfs_buf_hold(xfs_buf_t *); 159void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
182extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t); 160int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
183struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, 161
184 struct xfs_buftarg *target, 162struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
185 xfs_daddr_t daddr, size_t length, int flags); 163 int flags);
164struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
165 xfs_daddr_t daddr, size_t numblks, int flags);
166void xfs_buf_hold(struct xfs_buf *bp);
186 167
187/* Releasing Buffers */ 168/* Releasing Buffers */
188extern void xfs_buf_free(xfs_buf_t *); 169extern void xfs_buf_free(xfs_buf_t *);
@@ -204,7 +185,7 @@ extern int xfs_bdstrat_cb(struct xfs_buf *);
204extern void xfs_buf_ioend(xfs_buf_t *, int); 185extern void xfs_buf_ioend(xfs_buf_t *, int);
205extern void xfs_buf_ioerror(xfs_buf_t *, int); 186extern void xfs_buf_ioerror(xfs_buf_t *, int);
206extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); 187extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
207extern int xfs_buf_iorequest(xfs_buf_t *); 188extern void xfs_buf_iorequest(xfs_buf_t *);
208extern int xfs_buf_iowait(xfs_buf_t *); 189extern int xfs_buf_iowait(xfs_buf_t *);
209extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 190extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
210 xfs_buf_rw_t); 191 xfs_buf_rw_t);
@@ -220,24 +201,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
220extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 201extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
221 202
222/* Delayed Write Buffer Routines */ 203/* Delayed Write Buffer Routines */
223extern void xfs_buf_delwri_queue(struct xfs_buf *); 204extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
224extern void xfs_buf_delwri_dequeue(struct xfs_buf *); 205extern int xfs_buf_delwri_submit(struct list_head *);
225extern void xfs_buf_delwri_promote(struct xfs_buf *); 206extern int xfs_buf_delwri_submit_nowait(struct list_head *);
226 207
227/* Buffer Daemon Setup Routines */ 208/* Buffer Daemon Setup Routines */
228extern int xfs_buf_init(void); 209extern int xfs_buf_init(void);
229extern void xfs_buf_terminate(void); 210extern void xfs_buf_terminate(void);
230 211
231#define XFS_BUF_ZEROFLAGS(bp) \ 212#define XFS_BUF_ZEROFLAGS(bp) \
232 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ 213 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
233 XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) 214 XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
234 215
235void xfs_buf_stale(struct xfs_buf *bp); 216void xfs_buf_stale(struct xfs_buf *bp);
236#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 217#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
237#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 218#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
238 219
239#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
240
241#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) 220#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
242#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) 221#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
243#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) 222#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
@@ -256,12 +235,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
256 235
257#define XFS_BUF_ADDR(bp) ((bp)->b_bn) 236#define XFS_BUF_ADDR(bp) ((bp)->b_bn)
258#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) 237#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
259#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset)
260#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off))
261#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired)
262#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt))
263#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
264#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
265 238
266static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 239static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
267{ 240{
@@ -287,7 +260,6 @@ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
287extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 260extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
288extern void xfs_wait_buftarg(xfs_buftarg_t *); 261extern void xfs_wait_buftarg(xfs_buftarg_t *);
289extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 262extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
290extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
291 263
292#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 264#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
293#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 265#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
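With these header changes, buffer lookups are expressed entirely in 512-byte basic blocks: xfs_buf_get()/xfs_buf_read() take an xfs_daddr_t block number plus a length in basic blocks, and b_length/b_io_length are kept in the same units. A hedged sketch of a read under the new calling convention (the helper name, the xfs_mount pointer and the error handling are assumptions for illustration, not taken from the commit):

	/*
	 * Sketch: read a single 4k metadata block.  BTOBB() converts the
	 * byte count to basic blocks; blkno is already a daddr.
	 */
	static int example_read_block(struct xfs_mount *mp, xfs_daddr_t blkno)
	{
		struct xfs_buf	*bp;

		bp = xfs_buf_read(mp->m_ddev_targp, blkno, BTOBB(4096), 0);
		if (!bp)
			return ENOMEM;
		if (bp->b_error) {		/* I/O errors are reported on the buffer */
			int error = bp->b_error;
			xfs_buf_relse(bp);
			return error;
		}
		/* ... inspect bp->b_addr; bp->b_length is now in BBs ... */
		xfs_buf_relse(bp);
		return 0;
	}

Note that mapping is now the default, so no XBF_MAPPED flag is needed, and XBF_UNMAPPED is passed only when the caller explicitly does not want a virtual mapping.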
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index eac97ef81e2a..45df2b857d48 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -20,7 +20,6 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -123,11 +122,11 @@ xfs_buf_item_log_check(
123 ASSERT(bip->bli_logged != NULL); 122 ASSERT(bip->bli_logged != NULL);
124 123
125 bp = bip->bli_buf; 124 bp = bip->bli_buf;
126 ASSERT(XFS_BUF_COUNT(bp) > 0); 125 ASSERT(bp->b_length > 0);
127 ASSERT(bp->b_addr != NULL); 126 ASSERT(bp->b_addr != NULL);
128 orig = bip->bli_orig; 127 orig = bip->bli_orig;
129 buffer = bp->b_addr; 128 buffer = bp->b_addr;
130 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 129 for (x = 0; x < BBTOB(bp->b_length); x++) {
131 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { 130 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
132 xfs_emerg(bp->b_mount, 131 xfs_emerg(bp->b_mount,
133 "%s: bip %x buffer %x orig %x index %d", 132 "%s: bip %x buffer %x orig %x index %d",
@@ -418,7 +417,6 @@ xfs_buf_item_unpin(
418 if (freed && stale) { 417 if (freed && stale) {
419 ASSERT(bip->bli_flags & XFS_BLI_STALE); 418 ASSERT(bip->bli_flags & XFS_BLI_STALE);
420 ASSERT(xfs_buf_islocked(bp)); 419 ASSERT(xfs_buf_islocked(bp));
421 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
422 ASSERT(XFS_BUF_ISSTALE(bp)); 420 ASSERT(XFS_BUF_ISSTALE(bp));
423 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); 421 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
424 422
@@ -455,42 +453,42 @@ xfs_buf_item_unpin(
455 bp->b_iodone = NULL; 453 bp->b_iodone = NULL;
456 } else { 454 } else {
457 spin_lock(&ailp->xa_lock); 455 spin_lock(&ailp->xa_lock);
458 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); 456 xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
459 xfs_buf_item_relse(bp); 457 xfs_buf_item_relse(bp);
460 ASSERT(bp->b_fspriv == NULL); 458 ASSERT(bp->b_fspriv == NULL);
461 } 459 }
462 xfs_buf_relse(bp); 460 xfs_buf_relse(bp);
461 } else if (freed && remove) {
462 xfs_buf_lock(bp);
463 xfs_buf_ioerror(bp, EIO);
464 XFS_BUF_UNDONE(bp);
465 xfs_buf_stale(bp);
466 xfs_buf_ioend(bp, 0);
463 } 467 }
464} 468}
465 469
466/*
467 * This is called to attempt to lock the buffer associated with this
468 * buf log item. Don't sleep on the buffer lock. If we can't get
469 * the lock right away, return 0. If we can get the lock, take a
470 * reference to the buffer. If this is a delayed write buffer that
471 * needs AIL help to be written back, invoke the pushbuf routine
472 * rather than the normal success path.
473 */
474STATIC uint 470STATIC uint
475xfs_buf_item_trylock( 471xfs_buf_item_push(
476 struct xfs_log_item *lip) 472 struct xfs_log_item *lip,
473 struct list_head *buffer_list)
477{ 474{
478 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 475 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
479 struct xfs_buf *bp = bip->bli_buf; 476 struct xfs_buf *bp = bip->bli_buf;
477 uint rval = XFS_ITEM_SUCCESS;
480 478
481 if (xfs_buf_ispinned(bp)) 479 if (xfs_buf_ispinned(bp))
482 return XFS_ITEM_PINNED; 480 return XFS_ITEM_PINNED;
483 if (!xfs_buf_trylock(bp)) 481 if (!xfs_buf_trylock(bp))
484 return XFS_ITEM_LOCKED; 482 return XFS_ITEM_LOCKED;
485 483
486 /* take a reference to the buffer. */
487 xfs_buf_hold(bp);
488
489 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 484 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
490 trace_xfs_buf_item_trylock(bip); 485
491 if (XFS_BUF_ISDELAYWRITE(bp)) 486 trace_xfs_buf_item_push(bip);
492 return XFS_ITEM_PUSHBUF; 487
493 return XFS_ITEM_SUCCESS; 488 if (!xfs_buf_delwri_queue(bp, buffer_list))
489 rval = XFS_ITEM_FLUSHING;
490 xfs_buf_unlock(bp);
491 return rval;
494} 492}
495 493
496/* 494/*
@@ -603,49 +601,6 @@ xfs_buf_item_committed(
603 return lsn; 601 return lsn;
604} 602}
605 603
606/*
607 * The buffer is locked, but is not a delayed write buffer. This happens
608 * if we race with IO completion and hence we don't want to try to write it
609 * again. Just release the buffer.
610 */
611STATIC void
612xfs_buf_item_push(
613 struct xfs_log_item *lip)
614{
615 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
616 struct xfs_buf *bp = bip->bli_buf;
617
618 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
619 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
620
621 trace_xfs_buf_item_push(bip);
622
623 xfs_buf_relse(bp);
624}
625
626/*
627 * The buffer is locked and is a delayed write buffer. Promote the buffer
628 * in the delayed write queue as the caller knows that they must invoke
629 * the xfsbufd to get this buffer written. We have to unlock the buffer
630 * to allow the xfsbufd to write it, too.
631 */
632STATIC bool
633xfs_buf_item_pushbuf(
634 struct xfs_log_item *lip)
635{
636 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
637 struct xfs_buf *bp = bip->bli_buf;
638
639 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
640 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
641
642 trace_xfs_buf_item_pushbuf(bip);
643
644 xfs_buf_delwri_promote(bp);
645 xfs_buf_relse(bp);
646 return true;
647}
648
649STATIC void 604STATIC void
650xfs_buf_item_committing( 605xfs_buf_item_committing(
651 struct xfs_log_item *lip, 606 struct xfs_log_item *lip,
@@ -661,11 +616,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
661 .iop_format = xfs_buf_item_format, 616 .iop_format = xfs_buf_item_format,
662 .iop_pin = xfs_buf_item_pin, 617 .iop_pin = xfs_buf_item_pin,
663 .iop_unpin = xfs_buf_item_unpin, 618 .iop_unpin = xfs_buf_item_unpin,
664 .iop_trylock = xfs_buf_item_trylock,
665 .iop_unlock = xfs_buf_item_unlock, 619 .iop_unlock = xfs_buf_item_unlock,
666 .iop_committed = xfs_buf_item_committed, 620 .iop_committed = xfs_buf_item_committed,
667 .iop_push = xfs_buf_item_push, 621 .iop_push = xfs_buf_item_push,
668 .iop_pushbuf = xfs_buf_item_pushbuf,
669 .iop_committing = xfs_buf_item_committing 622 .iop_committing = xfs_buf_item_committing
670}; 623};
671 624
@@ -703,7 +656,8 @@ xfs_buf_item_init(
703 * truncate any pieces. map_size is the size of the 656 * truncate any pieces. map_size is the size of the
704 * bitmap needed to describe the chunks of the buffer. 657 * bitmap needed to describe the chunks of the buffer.
705 */ 658 */
706 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); 659 chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >>
660 XFS_BLF_SHIFT);
707 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 661 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
708 662
709 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 663 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
@@ -713,7 +667,7 @@ xfs_buf_item_init(
713 xfs_buf_hold(bp); 667 xfs_buf_hold(bp);
714 bip->bli_format.blf_type = XFS_LI_BUF; 668 bip->bli_format.blf_type = XFS_LI_BUF;
715 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 669 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
716 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); 670 bip->bli_format.blf_len = (ushort)bp->b_length;
717 bip->bli_format.blf_map_size = map_size; 671 bip->bli_format.blf_map_size = map_size;
718 672
719#ifdef XFS_TRANS_DEBUG 673#ifdef XFS_TRANS_DEBUG
@@ -725,9 +679,9 @@ xfs_buf_item_init(
725 * the buffer to indicate which bytes the callers have asked 679 * the buffer to indicate which bytes the callers have asked
726 * to have logged. 680 * to have logged.
727 */ 681 */
728 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); 682 bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP);
729 memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp)); 683 memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length));
730 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); 684 bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP);
731#endif 685#endif
732 686
733 /* 687 /*
@@ -984,20 +938,27 @@ xfs_buf_iodone_callbacks(
984 * If the write was asynchronous then no one will be looking for the 938 * If the write was asynchronous then no one will be looking for the
985 * error. Clear the error state and write the buffer out again. 939 * error. Clear the error state and write the buffer out again.
986 * 940 *
987 * During sync or umount we'll write all pending buffers again 941 * XXX: This helps against transient write errors, but we need to find
988 * synchronous, which will catch these errors if they keep hanging 942 * a way to shut the filesystem down if the writes keep failing.
989 * around. 943 *
944 * In practice we'll shut the filesystem down soon, as non-transient
945 * errors tend to affect the whole device and a failing log write
946 * will make us give up. But we really ought to do better here.
990 */ 947 */
991 if (XFS_BUF_ISASYNC(bp)) { 948 if (XFS_BUF_ISASYNC(bp)) {
949 ASSERT(bp->b_iodone != NULL);
950
951 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
952
992 xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ 953 xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
993 954
994 if (!XFS_BUF_ISSTALE(bp)) { 955 if (!XFS_BUF_ISSTALE(bp)) {
995 xfs_buf_delwri_queue(bp); 956 bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
996 XFS_BUF_DONE(bp); 957 xfs_bdstrat_cb(bp);
958 } else {
959 xfs_buf_relse(bp);
997 } 960 }
998 ASSERT(bp->b_iodone != NULL); 961
999 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1000 xfs_buf_relse(bp);
1001 return; 962 return;
1002 } 963 }
1003 964
@@ -1045,6 +1006,6 @@ xfs_buf_iodone(
1045 * Either way, AIL is useless if we're forcing a shutdown. 1006 * Either way, AIL is useless if we're forcing a shutdown.
1046 */ 1007 */
1047 spin_lock(&ailp->xa_lock); 1008 spin_lock(&ailp->xa_lock);
1048 xfs_trans_ail_delete(ailp, lip); 1009 xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
1049 xfs_buf_item_free(BUF_ITEM(lip)); 1010 xfs_buf_item_free(BUF_ITEM(lip));
1050} 1011}
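
The hunk above also shows the switch from byte-based buffer sizing to the basic-block based bp->b_length. A minimal sketch of the conversion, assuming BBTOB()/BTOBB() are the usual 512-byte basic-block/byte converters; buf_byte_count() is a hypothetical helper for illustration, not part of this patch:

	static inline size_t
	buf_byte_count(struct xfs_buf *bp)
	{
		/* old code asked for bytes directly via XFS_BUF_COUNT(bp) */
		/* new code keeps the length in basic blocks and converts on demand */
		return BBTOB(bp->b_length);
	}

Fields that are themselves kept in basic blocks, such as blf_len above, can take bp->b_length without any conversion.
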
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 7f1a6f5b05a6..015b946c5808 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -20,7 +20,6 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -2277,20 +2276,20 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps)
2277 if (nbuf == 1) { 2276 if (nbuf == 1) {
2278 dabuf->nbuf = 1; 2277 dabuf->nbuf = 1;
2279 bp = bps[0]; 2278 bp = bps[0];
2280 dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); 2279 dabuf->bbcount = bp->b_length;
2281 dabuf->data = bp->b_addr; 2280 dabuf->data = bp->b_addr;
2282 dabuf->bps[0] = bp; 2281 dabuf->bps[0] = bp;
2283 } else { 2282 } else {
2284 dabuf->nbuf = nbuf; 2283 dabuf->nbuf = nbuf;
2285 for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { 2284 for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
2286 dabuf->bps[i] = bp = bps[i]; 2285 dabuf->bps[i] = bp = bps[i];
2287 dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp)); 2286 dabuf->bbcount += bp->b_length;
2288 } 2287 }
2289 dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); 2288 dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
2290 for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { 2289 for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) {
2291 bp = bps[i]; 2290 bp = bps[i];
2292 memcpy((char *)dabuf->data + off, bp->b_addr, 2291 memcpy((char *)dabuf->data + off, bp->b_addr,
2293 XFS_BUF_COUNT(bp)); 2292 BBTOB(bp->b_length));
2294 } 2293 }
2295 } 2294 }
2296 return dabuf; 2295 return dabuf;
@@ -2310,10 +2309,10 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf)
2310 ASSERT(dabuf->nbuf > 1); 2309 ASSERT(dabuf->nbuf > 1);
2311 dabuf->dirty = 0; 2310 dabuf->dirty = 0;
2312 for (i = off = 0; i < dabuf->nbuf; 2311 for (i = off = 0; i < dabuf->nbuf;
2313 i++, off += XFS_BUF_COUNT(bp)) { 2312 i++, off += BBTOB(bp->b_length)) {
2314 bp = dabuf->bps[i]; 2313 bp = dabuf->bps[i];
2315 memcpy(bp->b_addr, dabuf->data + off, 2314 memcpy(bp->b_addr, dabuf->data + off,
2316 XFS_BUF_COUNT(bp)); 2315 BBTOB(bp->b_length));
2317 } 2316 }
2318 } 2317 }
2319} 2318}
@@ -2356,10 +2355,10 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
2356 } 2355 }
2357 dabuf->dirty = 1; 2356 dabuf->dirty = 1;
2358 ASSERT(first <= last); 2357 ASSERT(first <= last);
2359 for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { 2358 for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) {
2360 bp = dabuf->bps[i]; 2359 bp = dabuf->bps[i];
2361 f = off; 2360 f = off;
2362 l = f + XFS_BUF_COUNT(bp) - 1; 2361 l = f + BBTOB(bp->b_length) - 1;
2363 if (f < first) 2362 if (f < first)
2364 f = first; 2363 f = first;
2365 if (l > last) 2364 if (l > last)
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 1137bbc5eccb..e00de08dc8ac 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -18,9 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_sb.h" 23#include "xfs_sb.h"
26#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a2e27010c7fb..67a250c36d41 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -18,7 +18,6 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h" 22#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index d3b63aefd01d..586732f2d80d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 5bbe2a8a023f..2046988e9eb2 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 66e108f561a3..397ffbcbab1d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -20,7 +20,6 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 0179a41d9e5a..b0f26780449d 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 79d05e84e296..19bf0c5e38f4 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 1ad3a4b8ca40..f9c3fe304a17 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -17,7 +17,6 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_sb.h" 19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h" 20#include "xfs_log.h"
22#include "xfs_ag.h" 21#include "xfs_ag.h"
23#include "xfs_mount.h" 22#include "xfs_mount.h"
@@ -30,6 +29,7 @@
30#include "xfs_inode.h" 29#include "xfs_inode.h"
31#include "xfs_alloc.h" 30#include "xfs_alloc.h"
32#include "xfs_error.h" 31#include "xfs_error.h"
32#include "xfs_extent_busy.h"
33#include "xfs_discard.h" 33#include "xfs_discard.h"
34#include "xfs_trace.h" 34#include "xfs_trace.h"
35 35
@@ -118,7 +118,7 @@ xfs_trim_extents(
118 * If any blocks in the range are still busy, skip the 118 * If any blocks in the range are still busy, skip the
119 * discard and try again the next time. 119 * discard and try again the next time.
120 */ 120 */
121 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { 121 if (xfs_extent_busy_search(mp, agno, fbno, flen)) {
122 trace_xfs_discard_busy(mp, agno, fbno, flen); 122 trace_xfs_discard_busy(mp, agno, fbno, flen);
123 goto next_extent; 123 goto next_extent;
124 } 124 }
@@ -212,7 +212,7 @@ xfs_discard_extents(
212 struct xfs_mount *mp, 212 struct xfs_mount *mp,
213 struct list_head *list) 213 struct list_head *list)
214{ 214{
215 struct xfs_busy_extent *busyp; 215 struct xfs_extent_busy *busyp;
216 int error = 0; 216 int error = 0;
217 217
218 list_for_each_entry(busyp, list, list) { 218 list_for_each_entry(busyp, list, list) {
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1155208fa830..bf27fcca4843 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_bit.h" 20#include "xfs_bit.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h" 24#include "xfs_ag.h"
@@ -857,7 +856,7 @@ xfs_qm_dqflush_done(
857 /* xfs_trans_ail_delete() drops the AIL lock. */ 856 /* xfs_trans_ail_delete() drops the AIL lock. */
858 spin_lock(&ailp->xa_lock); 857 spin_lock(&ailp->xa_lock);
859 if (lip->li_lsn == qip->qli_flush_lsn) 858 if (lip->li_lsn == qip->qli_flush_lsn)
860 xfs_trans_ail_delete(ailp, lip); 859 xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
861 else 860 else
862 spin_unlock(&ailp->xa_lock); 861 spin_unlock(&ailp->xa_lock);
863 } 862 }
@@ -878,8 +877,8 @@ xfs_qm_dqflush_done(
878 */ 877 */
879int 878int
880xfs_qm_dqflush( 879xfs_qm_dqflush(
881 xfs_dquot_t *dqp, 880 struct xfs_dquot *dqp,
882 uint flags) 881 struct xfs_buf **bpp)
883{ 882{
884 struct xfs_mount *mp = dqp->q_mount; 883 struct xfs_mount *mp = dqp->q_mount;
885 struct xfs_buf *bp; 884 struct xfs_buf *bp;
@@ -891,25 +890,30 @@ xfs_qm_dqflush(
891 890
892 trace_xfs_dqflush(dqp); 891 trace_xfs_dqflush(dqp);
893 892
894 /* 893 *bpp = NULL;
895 * If not dirty, or it's pinned and we are not supposed to block, nada. 894
896 */
897 if (!XFS_DQ_IS_DIRTY(dqp) ||
898 ((flags & SYNC_TRYLOCK) && atomic_read(&dqp->q_pincount) > 0)) {
899 xfs_dqfunlock(dqp);
900 return 0;
901 }
902 xfs_qm_dqunpin_wait(dqp); 895 xfs_qm_dqunpin_wait(dqp);
903 896
904 /* 897 /*
905 * This may have been unpinned because the filesystem is shutting 898 * This may have been unpinned because the filesystem is shutting
906 * down forcibly. If that's the case we must not write this dquot 899 * down forcibly. If that's the case we must not write this dquot
907 * to disk, because the log record didn't make it to disk! 900 * to disk, because the log record didn't make it to disk.
901 *
902 * We also have to remove the log item from the AIL in this case,
 903 * as we wait for an empty AIL as part of the unmount process.
908 */ 904 */
909 if (XFS_FORCED_SHUTDOWN(mp)) { 905 if (XFS_FORCED_SHUTDOWN(mp)) {
906 struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
910 dqp->dq_flags &= ~XFS_DQ_DIRTY; 907 dqp->dq_flags &= ~XFS_DQ_DIRTY;
911 xfs_dqfunlock(dqp); 908
912 return XFS_ERROR(EIO); 909 spin_lock(&mp->m_ail->xa_lock);
910 if (lip->li_flags & XFS_LI_IN_AIL)
911 xfs_trans_ail_delete(mp->m_ail, lip,
912 SHUTDOWN_CORRUPT_INCORE);
913 else
914 spin_unlock(&mp->m_ail->xa_lock);
915 error = XFS_ERROR(EIO);
916 goto out_unlock;
913 } 917 }
914 918
915 /* 919 /*
@@ -917,11 +921,8 @@ xfs_qm_dqflush(
917 */ 921 */
918 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, 922 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
919 mp->m_quotainfo->qi_dqchunklen, 0, &bp); 923 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
920 if (error) { 924 if (error)
921 ASSERT(error != ENOENT); 925 goto out_unlock;
922 xfs_dqfunlock(dqp);
923 return error;
924 }
925 926
926 /* 927 /*
927 * Calculate the location of the dquot inside the buffer. 928 * Calculate the location of the dquot inside the buffer.
@@ -967,20 +968,13 @@ xfs_qm_dqflush(
967 xfs_log_force(mp, 0); 968 xfs_log_force(mp, 0);
968 } 969 }
969 970
970 if (flags & SYNC_WAIT)
971 error = xfs_bwrite(bp);
972 else
973 xfs_buf_delwri_queue(bp);
974
975 xfs_buf_relse(bp);
976
977 trace_xfs_dqflush_done(dqp); 971 trace_xfs_dqflush_done(dqp);
972 *bpp = bp;
973 return 0;
978 974
979 /* 975out_unlock:
980 * dqp is still locked, but caller is free to unlock it now. 976 xfs_dqfunlock(dqp);
981 */ 977 return XFS_ERROR(EIO);
982 return error;
983
984} 978}
985 979
986/* 980/*
@@ -1011,39 +1005,6 @@ xfs_dqlock2(
1011 } 1005 }
1012} 1006}
1013 1007
1014/*
1015 * Give the buffer a little push if it is incore and
1016 * wait on the flush lock.
1017 */
1018void
1019xfs_dqflock_pushbuf_wait(
1020 xfs_dquot_t *dqp)
1021{
1022 xfs_mount_t *mp = dqp->q_mount;
1023 xfs_buf_t *bp;
1024
1025 /*
1026 * Check to see if the dquot has been flushed delayed
1027 * write. If so, grab its buffer and send it
1028 * out immediately. We'll be able to acquire
1029 * the flush lock when the I/O completes.
1030 */
1031 bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
1032 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
1033 if (!bp)
1034 goto out_lock;
1035
1036 if (XFS_BUF_ISDELAYWRITE(bp)) {
1037 if (xfs_buf_ispinned(bp))
1038 xfs_log_force(mp, 0);
1039 xfs_buf_delwri_promote(bp);
1040 wake_up_process(bp->b_target->bt_task);
1041 }
1042 xfs_buf_relse(bp);
1043out_lock:
1044 xfs_dqflock(dqp);
1045}
1046
1047int __init 1008int __init
1048xfs_qm_init(void) 1009xfs_qm_init(void)
1049{ 1010{
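
With this change xfs_qm_dqflush() no longer writes the buffer itself; it hands the locked buffer back to the caller. A hedged sketch of the new caller pattern, mirroring the dquot push code later in this patch (bp, rval, and buffer_list are assumed to come from the surrounding push handler):

	struct xfs_buf	*bp = NULL;
	int		error;

	error = xfs_qm_dqflush(dqp, &bp);	/* flush lock is dropped on error */
	if (!error) {
		/* queue the buffer for delayed write and drop our reference */
		if (!xfs_buf_delwri_queue(bp, buffer_list))
			rval = XFS_ITEM_FLUSHING;	/* write already in progress */
		xfs_buf_relse(bp);
	}
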
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index ef9190bd8b30..7d20af27346d 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -141,7 +141,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
141extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, 141extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
142 uint, struct xfs_dquot **); 142 uint, struct xfs_dquot **);
143extern void xfs_qm_dqdestroy(xfs_dquot_t *); 143extern void xfs_qm_dqdestroy(xfs_dquot_t *);
144extern int xfs_qm_dqflush(xfs_dquot_t *, uint); 144extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
145extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); 145extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
146extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, 146extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
147 xfs_disk_dquot_t *); 147 xfs_disk_dquot_t *);
@@ -152,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
152extern void xfs_qm_dqput(xfs_dquot_t *); 152extern void xfs_qm_dqput(xfs_dquot_t *);
153 153
154extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); 154extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
155extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp);
156 155
157static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) 156static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
158{ 157{
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 34baeae45265..57aa4b03720c 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -17,9 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h" 20#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 21#include "xfs_trans.h"
24#include "xfs_sb.h" 22#include "xfs_sb.h"
25#include "xfs_ag.h" 23#include "xfs_ag.h"
@@ -108,38 +106,6 @@ xfs_qm_dquot_logitem_unpin(
108 wake_up(&dqp->q_pinwait); 106 wake_up(&dqp->q_pinwait);
109} 107}
110 108
111/*
112 * Given the logitem, this writes the corresponding dquot entry to disk
113 * asynchronously. This is called with the dquot entry securely locked;
114 * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
115 * at the end.
116 */
117STATIC void
118xfs_qm_dquot_logitem_push(
119 struct xfs_log_item *lip)
120{
121 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
122 int error;
123
124 ASSERT(XFS_DQ_IS_LOCKED(dqp));
125 ASSERT(!completion_done(&dqp->q_flush));
126
127 /*
128 * Since we were able to lock the dquot's flush lock and
129 * we found it on the AIL, the dquot must be dirty. This
130 * is because the dquot is removed from the AIL while still
131 * holding the flush lock in xfs_dqflush_done(). Thus, if
132 * we found it in the AIL and were able to obtain the flush
133 * lock without sleeping, then there must not have been
134 * anyone in the process of flushing the dquot.
135 */
136 error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK);
137 if (error)
138 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
139 __func__, error, dqp);
140 xfs_dqunlock(dqp);
141}
142
143STATIC xfs_lsn_t 109STATIC xfs_lsn_t
144xfs_qm_dquot_logitem_committed( 110xfs_qm_dquot_logitem_committed(
145 struct xfs_log_item *lip, 111 struct xfs_log_item *lip,
@@ -171,67 +137,15 @@ xfs_qm_dqunpin_wait(
171 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); 137 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
172} 138}
173 139
174/*
175 * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
176 * the dquot is locked by us, but the flush lock isn't. So, here we are
177 * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
178 * If so, we want to push it out to help us take this item off the AIL as soon
179 * as possible.
180 *
181 * We must not be holding the AIL lock at this point. Calling incore() to
182 * search the buffer cache can be a time consuming thing, and AIL lock is a
183 * spinlock.
184 */
185STATIC bool
186xfs_qm_dquot_logitem_pushbuf(
187 struct xfs_log_item *lip)
188{
189 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
190 struct xfs_dquot *dqp = qlip->qli_dquot;
191 struct xfs_buf *bp;
192 bool ret = true;
193
194 ASSERT(XFS_DQ_IS_LOCKED(dqp));
195
196 /*
197 * If flushlock isn't locked anymore, chances are that the
198 * inode flush completed and the inode was taken off the AIL.
199 * So, just get out.
200 */
201 if (completion_done(&dqp->q_flush) ||
202 !(lip->li_flags & XFS_LI_IN_AIL)) {
203 xfs_dqunlock(dqp);
204 return true;
205 }
206
207 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
208 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
209 xfs_dqunlock(dqp);
210 if (!bp)
211 return true;
212 if (XFS_BUF_ISDELAYWRITE(bp))
213 xfs_buf_delwri_promote(bp);
214 if (xfs_buf_ispinned(bp))
215 ret = false;
216 xfs_buf_relse(bp);
217 return ret;
218}
219
220/*
221 * This is called to attempt to lock the dquot associated with this
222 * dquot log item. Don't sleep on the dquot lock or the flush lock.
223 * If the flush lock is already held, indicating that the dquot has
224 * been or is in the process of being flushed, then see if we can
225 * find the dquot's buffer in the buffer cache without sleeping. If
226 * we can and it is marked delayed write, then we want to send it out.
227 * We delay doing so until the push routine, though, to avoid sleeping
228 * in any device strategy routines.
229 */
230STATIC uint 140STATIC uint
231xfs_qm_dquot_logitem_trylock( 141xfs_qm_dquot_logitem_push(
232 struct xfs_log_item *lip) 142 struct xfs_log_item *lip,
143 struct list_head *buffer_list)
233{ 144{
234 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; 145 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
146 struct xfs_buf *bp = NULL;
147 uint rval = XFS_ITEM_SUCCESS;
148 int error;
235 149
236 if (atomic_read(&dqp->q_pincount) > 0) 150 if (atomic_read(&dqp->q_pincount) > 0)
237 return XFS_ITEM_PINNED; 151 return XFS_ITEM_PINNED;
@@ -239,16 +153,41 @@ xfs_qm_dquot_logitem_trylock(
239 if (!xfs_dqlock_nowait(dqp)) 153 if (!xfs_dqlock_nowait(dqp))
240 return XFS_ITEM_LOCKED; 154 return XFS_ITEM_LOCKED;
241 155
156 /*
157 * Re-check the pincount now that we stabilized the value by
158 * taking the quota lock.
159 */
160 if (atomic_read(&dqp->q_pincount) > 0) {
161 rval = XFS_ITEM_PINNED;
162 goto out_unlock;
163 }
164
165 /*
166 * Someone else is already flushing the dquot. Nothing we can do
167 * here but wait for the flush to finish and remove the item from
168 * the AIL.
169 */
242 if (!xfs_dqflock_nowait(dqp)) { 170 if (!xfs_dqflock_nowait(dqp)) {
243 /* 171 rval = XFS_ITEM_FLUSHING;
244 * dquot has already been flushed to the backing buffer, 172 goto out_unlock;
245 * leave it locked, pushbuf routine will unlock it.
246 */
247 return XFS_ITEM_PUSHBUF;
248 } 173 }
249 174
250 ASSERT(lip->li_flags & XFS_LI_IN_AIL); 175 spin_unlock(&lip->li_ailp->xa_lock);
251 return XFS_ITEM_SUCCESS; 176
177 error = xfs_qm_dqflush(dqp, &bp);
178 if (error) {
179 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
180 __func__, error, dqp);
181 } else {
182 if (!xfs_buf_delwri_queue(bp, buffer_list))
183 rval = XFS_ITEM_FLUSHING;
184 xfs_buf_relse(bp);
185 }
186
187 spin_lock(&lip->li_ailp->xa_lock);
188out_unlock:
189 xfs_dqunlock(dqp);
190 return rval;
252} 191}
253 192
254/* 193/*
@@ -299,11 +238,9 @@ static const struct xfs_item_ops xfs_dquot_item_ops = {
299 .iop_format = xfs_qm_dquot_logitem_format, 238 .iop_format = xfs_qm_dquot_logitem_format,
300 .iop_pin = xfs_qm_dquot_logitem_pin, 239 .iop_pin = xfs_qm_dquot_logitem_pin,
301 .iop_unpin = xfs_qm_dquot_logitem_unpin, 240 .iop_unpin = xfs_qm_dquot_logitem_unpin,
302 .iop_trylock = xfs_qm_dquot_logitem_trylock,
303 .iop_unlock = xfs_qm_dquot_logitem_unlock, 241 .iop_unlock = xfs_qm_dquot_logitem_unlock,
304 .iop_committed = xfs_qm_dquot_logitem_committed, 242 .iop_committed = xfs_qm_dquot_logitem_committed,
305 .iop_push = xfs_qm_dquot_logitem_push, 243 .iop_push = xfs_qm_dquot_logitem_push,
306 .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf,
307 .iop_committing = xfs_qm_dquot_logitem_committing 244 .iop_committing = xfs_qm_dquot_logitem_committing
308}; 245};
309 246
@@ -398,11 +335,13 @@ xfs_qm_qoff_logitem_unpin(
398} 335}
399 336
400/* 337/*
401 * Quotaoff items have no locking, so just return success. 338 * There isn't much you can do to push a quotaoff item. It is simply
339 * stuck waiting for the log to be flushed to disk.
402 */ 340 */
403STATIC uint 341STATIC uint
404xfs_qm_qoff_logitem_trylock( 342xfs_qm_qoff_logitem_push(
405 struct xfs_log_item *lip) 343 struct xfs_log_item *lip,
344 struct list_head *buffer_list)
406{ 345{
407 return XFS_ITEM_LOCKED; 346 return XFS_ITEM_LOCKED;
408} 347}
@@ -429,17 +368,6 @@ xfs_qm_qoff_logitem_committed(
429 return lsn; 368 return lsn;
430} 369}
431 370
432/*
433 * There isn't much you can do to push on an quotaoff item. It is simply
434 * stuck waiting for the log to be flushed to disk.
435 */
436STATIC void
437xfs_qm_qoff_logitem_push(
438 struct xfs_log_item *lip)
439{
440}
441
442
443STATIC xfs_lsn_t 371STATIC xfs_lsn_t
444xfs_qm_qoffend_logitem_committed( 372xfs_qm_qoffend_logitem_committed(
445 struct xfs_log_item *lip, 373 struct xfs_log_item *lip,
@@ -454,7 +382,7 @@ xfs_qm_qoffend_logitem_committed(
454 * xfs_trans_ail_delete() drops the AIL lock. 382 * xfs_trans_ail_delete() drops the AIL lock.
455 */ 383 */
456 spin_lock(&ailp->xa_lock); 384 spin_lock(&ailp->xa_lock);
457 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); 385 xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
458 386
459 kmem_free(qfs); 387 kmem_free(qfs);
460 kmem_free(qfe); 388 kmem_free(qfe);
@@ -487,7 +415,6 @@ static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
487 .iop_format = xfs_qm_qoff_logitem_format, 415 .iop_format = xfs_qm_qoff_logitem_format,
488 .iop_pin = xfs_qm_qoff_logitem_pin, 416 .iop_pin = xfs_qm_qoff_logitem_pin,
489 .iop_unpin = xfs_qm_qoff_logitem_unpin, 417 .iop_unpin = xfs_qm_qoff_logitem_unpin,
490 .iop_trylock = xfs_qm_qoff_logitem_trylock,
491 .iop_unlock = xfs_qm_qoff_logitem_unlock, 418 .iop_unlock = xfs_qm_qoff_logitem_unlock,
492 .iop_committed = xfs_qm_qoffend_logitem_committed, 419 .iop_committed = xfs_qm_qoffend_logitem_committed,
493 .iop_push = xfs_qm_qoff_logitem_push, 420 .iop_push = xfs_qm_qoff_logitem_push,
@@ -502,7 +429,6 @@ static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
502 .iop_format = xfs_qm_qoff_logitem_format, 429 .iop_format = xfs_qm_qoff_logitem_format,
503 .iop_pin = xfs_qm_qoff_logitem_pin, 430 .iop_pin = xfs_qm_qoff_logitem_pin,
504 .iop_unpin = xfs_qm_qoff_logitem_unpin, 431 .iop_unpin = xfs_qm_qoff_logitem_unpin,
505 .iop_trylock = xfs_qm_qoff_logitem_trylock,
506 .iop_unlock = xfs_qm_qoff_logitem_unlock, 432 .iop_unlock = xfs_qm_qoff_logitem_unlock,
507 .iop_committed = xfs_qm_qoff_logitem_committed, 433 .iop_committed = xfs_qm_qoff_logitem_committed,
508 .iop_push = xfs_qm_qoff_logitem_push, 434 .iop_push = xfs_qm_qoff_logitem_push,
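
The dquot and quotaoff callbacks above illustrate the consolidated ->iop_push() signature that replaces the old iop_trylock/iop_push/iop_pushbuf trio. A rough, illustrative skeleton of the contract; xfs_example_item_push and the item_* helpers are placeholders, not code from this patch:

	STATIC uint
	xfs_example_item_push(
		struct xfs_log_item	*lip,
		struct list_head	*buffer_list)
	{
		/* item_* helpers stand in for the item-specific checks */
		if (item_is_pinned(lip))	/* caller must force the log first */
			return XFS_ITEM_PINNED;
		if (!item_trylock(lip))		/* object lock unavailable */
			return XFS_ITEM_LOCKED;
		if (item_is_under_io(lip))	/* wait for the outstanding flush */
			return XFS_ITEM_FLUSHING;

		/* flush the object and queue its backing buffer on buffer_list */
		return XFS_ITEM_SUCCESS;
	}
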
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 39f06336b99d..610456054dc2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 558910f5e3c0..2d25d19c4ea1 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -17,7 +17,6 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_types.h" 19#include "xfs_types.h"
20#include "xfs_inum.h"
21#include "xfs_log.h" 20#include "xfs_log.h"
22#include "xfs_trans.h" 21#include "xfs_trans.h"
23#include "xfs_sb.h" 22#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
new file mode 100644
index 000000000000..85e9f87a1a7c
--- /dev/null
+++ b/fs/xfs/xfs_extent_busy.c
@@ -0,0 +1,603 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2010 David Chinner.
4 * Copyright (c) 2011 Christoph Hellwig.
5 * All Rights Reserved.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it would be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20#include "xfs.h"
21#include "xfs_fs.h"
22#include "xfs_types.h"
23#include "xfs_log.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_alloc.h"
30#include "xfs_inode.h"
31#include "xfs_extent_busy.h"
32#include "xfs_trace.h"
33
34void
35xfs_extent_busy_insert(
36 struct xfs_trans *tp,
37 xfs_agnumber_t agno,
38 xfs_agblock_t bno,
39 xfs_extlen_t len,
40 unsigned int flags)
41{
42 struct xfs_extent_busy *new;
43 struct xfs_extent_busy *busyp;
44 struct xfs_perag *pag;
45 struct rb_node **rbp;
46 struct rb_node *parent = NULL;
47
48 new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
49 if (!new) {
50 /*
51 * No Memory! Since it is now not possible to track the free
 52 * block, make this a synchronous transaction to ensure that
53 * the block is not reused before this transaction commits.
54 */
55 trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
56 xfs_trans_set_sync(tp);
57 return;
58 }
59
60 new->agno = agno;
61 new->bno = bno;
62 new->length = len;
63 INIT_LIST_HEAD(&new->list);
64 new->flags = flags;
65
66 /* trace before insert to be able to see failed inserts */
67 trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);
68
69 pag = xfs_perag_get(tp->t_mountp, new->agno);
70 spin_lock(&pag->pagb_lock);
71 rbp = &pag->pagb_tree.rb_node;
72 while (*rbp) {
73 parent = *rbp;
74 busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);
75
76 if (new->bno < busyp->bno) {
77 rbp = &(*rbp)->rb_left;
78 ASSERT(new->bno + new->length <= busyp->bno);
79 } else if (new->bno > busyp->bno) {
80 rbp = &(*rbp)->rb_right;
81 ASSERT(bno >= busyp->bno + busyp->length);
82 } else {
83 ASSERT(0);
84 }
85 }
86
87 rb_link_node(&new->rb_node, parent, rbp);
88 rb_insert_color(&new->rb_node, &pag->pagb_tree);
89
90 list_add(&new->list, &tp->t_busy);
91 spin_unlock(&pag->pagb_lock);
92 xfs_perag_put(pag);
93}
94
95/*
96 * Search for a busy extent within the range of the extent we are about to
97 * allocate. You need to be holding the busy extent tree lock when calling
98 * xfs_extent_busy_search(). This function returns 0 for no overlapping busy
99 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
100 * match. This is done so that a non-zero return indicates an overlap that
101 * will require a synchronous transaction, but it can still be
 102 * used to distinguish between a partial and an exact match.
103 */
104int
105xfs_extent_busy_search(
106 struct xfs_mount *mp,
107 xfs_agnumber_t agno,
108 xfs_agblock_t bno,
109 xfs_extlen_t len)
110{
111 struct xfs_perag *pag;
112 struct rb_node *rbp;
113 struct xfs_extent_busy *busyp;
114 int match = 0;
115
116 pag = xfs_perag_get(mp, agno);
117 spin_lock(&pag->pagb_lock);
118
119 rbp = pag->pagb_tree.rb_node;
120
121 /* find closest start bno overlap */
122 while (rbp) {
123 busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
124 if (bno < busyp->bno) {
125 /* may overlap, but exact start block is lower */
126 if (bno + len > busyp->bno)
127 match = -1;
128 rbp = rbp->rb_left;
129 } else if (bno > busyp->bno) {
130 /* may overlap, but exact start block is higher */
131 if (bno < busyp->bno + busyp->length)
132 match = -1;
133 rbp = rbp->rb_right;
134 } else {
135 /* bno matches busyp, length determines exact match */
136 match = (busyp->length == len) ? 1 : -1;
137 break;
138 }
139 }
140 spin_unlock(&pag->pagb_lock);
141 xfs_perag_put(pag);
142 return match;
143}
144
145/*
146 * The found free extent [fbno, fend] overlaps part or all of the given busy
147 * extent. If the overlap covers the beginning, the end, or all of the busy
148 * extent, the overlapping portion can be made unbusy and used for the
149 * allocation. We can't split a busy extent because we can't modify a
 150 * transaction/CIL context busy list, but we can update an entry's block
151 * number or length.
152 *
153 * Returns true if the extent can safely be reused, or false if the search
154 * needs to be restarted.
155 */
156STATIC bool
157xfs_extent_busy_update_extent(
158 struct xfs_mount *mp,
159 struct xfs_perag *pag,
160 struct xfs_extent_busy *busyp,
161 xfs_agblock_t fbno,
162 xfs_extlen_t flen,
163 bool userdata)
164{
165 xfs_agblock_t fend = fbno + flen;
166 xfs_agblock_t bbno = busyp->bno;
167 xfs_agblock_t bend = bbno + busyp->length;
168
169 /*
170 * This extent is currently being discarded. Give the thread
171 * performing the discard a chance to mark the extent unbusy
172 * and retry.
173 */
174 if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
175 spin_unlock(&pag->pagb_lock);
176 delay(1);
177 spin_lock(&pag->pagb_lock);
178 return false;
179 }
180
181 /*
182 * If there is a busy extent overlapping a user allocation, we have
183 * no choice but to force the log and retry the search.
184 *
185 * Fortunately this does not happen during normal operation, but
186 * only if the filesystem is very low on space and has to dip into
187 * the AGFL for normal allocations.
188 */
189 if (userdata)
190 goto out_force_log;
191
192 if (bbno < fbno && bend > fend) {
193 /*
194 * Case 1:
195 * bbno bend
196 * +BBBBBBBBBBBBBBBBB+
197 * +---------+
198 * fbno fend
199 */
200
201 /*
202 * We would have to split the busy extent to be able to track
 203 * it correctly, which we cannot do because we would have to
204 * modify the list of busy extents attached to the transaction
205 * or CIL context, which is immutable.
206 *
207 * Force out the log to clear the busy extent and retry the
208 * search.
209 */
210 goto out_force_log;
211 } else if (bbno >= fbno && bend <= fend) {
212 /*
213 * Case 2:
214 * bbno bend
215 * +BBBBBBBBBBBBBBBBB+
216 * +-----------------+
217 * fbno fend
218 *
219 * Case 3:
220 * bbno bend
221 * +BBBBBBBBBBBBBBBBB+
222 * +--------------------------+
223 * fbno fend
224 *
225 * Case 4:
226 * bbno bend
227 * +BBBBBBBBBBBBBBBBB+
228 * +--------------------------+
229 * fbno fend
230 *
231 * Case 5:
232 * bbno bend
233 * +BBBBBBBBBBBBBBBBB+
234 * +-----------------------------------+
235 * fbno fend
236 *
237 */
238
239 /*
240 * The busy extent is fully covered by the extent we are
241 * allocating, and can simply be removed from the rbtree.
242 * However we cannot remove it from the immutable list
243 * tracking busy extents in the transaction or CIL context,
244 * so set the length to zero to mark it invalid.
245 *
246 * We also need to restart the busy extent search from the
247 * tree root, because erasing the node can rearrange the
248 * tree topology.
249 */
250 rb_erase(&busyp->rb_node, &pag->pagb_tree);
251 busyp->length = 0;
252 return false;
253 } else if (fend < bend) {
254 /*
255 * Case 6:
256 * bbno bend
257 * +BBBBBBBBBBBBBBBBB+
258 * +---------+
259 * fbno fend
260 *
261 * Case 7:
262 * bbno bend
263 * +BBBBBBBBBBBBBBBBB+
264 * +------------------+
265 * fbno fend
266 *
267 */
268 busyp->bno = fend;
269 } else if (bbno < fbno) {
270 /*
271 * Case 8:
272 * bbno bend
273 * +BBBBBBBBBBBBBBBBB+
274 * +-------------+
275 * fbno fend
276 *
277 * Case 9:
278 * bbno bend
279 * +BBBBBBBBBBBBBBBBB+
280 * +----------------------+
281 * fbno fend
282 */
283 busyp->length = fbno - busyp->bno;
284 } else {
285 ASSERT(0);
286 }
287
288 trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
289 return true;
290
291out_force_log:
292 spin_unlock(&pag->pagb_lock);
293 xfs_log_force(mp, XFS_LOG_SYNC);
294 trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
295 spin_lock(&pag->pagb_lock);
296 return false;
297}
298
299
300/*
301 * For a given extent [fbno, flen], make sure we can reuse it safely.
302 */
303void
304xfs_extent_busy_reuse(
305 struct xfs_mount *mp,
306 xfs_agnumber_t agno,
307 xfs_agblock_t fbno,
308 xfs_extlen_t flen,
309 bool userdata)
310{
311 struct xfs_perag *pag;
312 struct rb_node *rbp;
313
314 ASSERT(flen > 0);
315
316 pag = xfs_perag_get(mp, agno);
317 spin_lock(&pag->pagb_lock);
318restart:
319 rbp = pag->pagb_tree.rb_node;
320 while (rbp) {
321 struct xfs_extent_busy *busyp =
322 rb_entry(rbp, struct xfs_extent_busy, rb_node);
323 xfs_agblock_t bbno = busyp->bno;
324 xfs_agblock_t bend = bbno + busyp->length;
325
326 if (fbno + flen <= bbno) {
327 rbp = rbp->rb_left;
328 continue;
329 } else if (fbno >= bend) {
330 rbp = rbp->rb_right;
331 continue;
332 }
333
334 if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
335 userdata))
336 goto restart;
337 }
338 spin_unlock(&pag->pagb_lock);
339 xfs_perag_put(pag);
340}
341
342/*
343 * For a given extent [fbno, flen], search the busy extent list to find a
344 * subset of the extent that is not busy. If *rlen is smaller than
345 * args->minlen no suitable extent could be found, and the higher level
346 * code needs to force out the log and retry the allocation.
347 */
348void
349xfs_extent_busy_trim(
350 struct xfs_alloc_arg *args,
351 xfs_agblock_t bno,
352 xfs_extlen_t len,
353 xfs_agblock_t *rbno,
354 xfs_extlen_t *rlen)
355{
356 xfs_agblock_t fbno;
357 xfs_extlen_t flen;
358 struct rb_node *rbp;
359
360 ASSERT(len > 0);
361
362 spin_lock(&args->pag->pagb_lock);
363restart:
364 fbno = bno;
365 flen = len;
366 rbp = args->pag->pagb_tree.rb_node;
367 while (rbp && flen >= args->minlen) {
368 struct xfs_extent_busy *busyp =
369 rb_entry(rbp, struct xfs_extent_busy, rb_node);
370 xfs_agblock_t fend = fbno + flen;
371 xfs_agblock_t bbno = busyp->bno;
372 xfs_agblock_t bend = bbno + busyp->length;
373
374 if (fend <= bbno) {
375 rbp = rbp->rb_left;
376 continue;
377 } else if (fbno >= bend) {
378 rbp = rbp->rb_right;
379 continue;
380 }
381
382 /*
383 * If this is a metadata allocation, try to reuse the busy
384 * extent instead of trimming the allocation.
385 */
386 if (!args->userdata &&
387 !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
388 if (!xfs_extent_busy_update_extent(args->mp, args->pag,
389 busyp, fbno, flen,
390 false))
391 goto restart;
392 continue;
393 }
394
395 if (bbno <= fbno) {
396 /* start overlap */
397
398 /*
399 * Case 1:
400 * bbno bend
401 * +BBBBBBBBBBBBBBBBB+
402 * +---------+
403 * fbno fend
404 *
405 * Case 2:
406 * bbno bend
407 * +BBBBBBBBBBBBBBBBB+
408 * +-------------+
409 * fbno fend
410 *
411 * Case 3:
412 * bbno bend
413 * +BBBBBBBBBBBBBBBBB+
414 * +-------------+
415 * fbno fend
416 *
417 * Case 4:
418 * bbno bend
419 * +BBBBBBBBBBBBBBBBB+
420 * +-----------------+
421 * fbno fend
422 *
423 * No unbusy region in extent, return failure.
424 */
425 if (fend <= bend)
426 goto fail;
427
428 /*
429 * Case 5:
430 * bbno bend
431 * +BBBBBBBBBBBBBBBBB+
432 * +----------------------+
433 * fbno fend
434 *
435 * Case 6:
436 * bbno bend
437 * +BBBBBBBBBBBBBBBBB+
438 * +--------------------------+
439 * fbno fend
440 *
441 * Needs to be trimmed to:
442 * +-------+
443 * fbno fend
444 */
445 fbno = bend;
446 } else if (bend >= fend) {
447 /* end overlap */
448
449 /*
450 * Case 7:
451 * bbno bend
452 * +BBBBBBBBBBBBBBBBB+
453 * +------------------+
454 * fbno fend
455 *
456 * Case 8:
457 * bbno bend
458 * +BBBBBBBBBBBBBBBBB+
459 * +--------------------------+
460 * fbno fend
461 *
462 * Needs to be trimmed to:
463 * +-------+
464 * fbno fend
465 */
466 fend = bbno;
467 } else {
468 /* middle overlap */
469
470 /*
471 * Case 9:
472 * bbno bend
473 * +BBBBBBBBBBBBBBBBB+
474 * +-----------------------------------+
475 * fbno fend
476 *
477 * Can be trimmed to:
478 * +-------+ OR +-------+
479 * fbno fend fbno fend
480 *
481 * Backward allocation leads to significant
482 * fragmentation of directories, which degrades
483 * directory performance, therefore we always want to
484 * choose the option that produces forward allocation
485 * patterns.
486 * Preferring the lower bno extent will make the next
487 * request use "fend" as the start of the next
488 * allocation; if the segment is no longer busy at
489 * that point, we'll get a contiguous allocation, but
490 * even if it is still busy, we will get a forward
491 * allocation.
492 * We try to avoid choosing the segment at "bend",
493 * because that can lead to the next allocation
494 * taking the segment at "fbno", which would be a
495 * backward allocation. We only use the segment at
496 * "fbno" if it is much larger than the current
497 * requested size, because in that case there's a
498 * good chance subsequent allocations will be
499 * contiguous.
500 */
501 if (bbno - fbno >= args->maxlen) {
502 /* left candidate fits perfect */
503 fend = bbno;
504 } else if (fend - bend >= args->maxlen * 4) {
505 /* right candidate has enough free space */
506 fbno = bend;
507 } else if (bbno - fbno >= args->minlen) {
508 /* left candidate fits minimum requirement */
509 fend = bbno;
510 } else {
511 goto fail;
512 }
513 }
514
515 flen = fend - fbno;
516 }
517 spin_unlock(&args->pag->pagb_lock);
518
519 if (fbno != bno || flen != len) {
520 trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
521 fbno, flen);
522 }
523 *rbno = fbno;
524 *rlen = flen;
525 return;
526fail:
527 /*
 528 * Return a zero extent length as the failure indication. All callers
529 * re-check if the trimmed extent satisfies the minlen requirement.
530 */
531 spin_unlock(&args->pag->pagb_lock);
532 trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
533 *rbno = fbno;
534 *rlen = 0;
535}
536
537STATIC void
538xfs_extent_busy_clear_one(
539 struct xfs_mount *mp,
540 struct xfs_perag *pag,
541 struct xfs_extent_busy *busyp)
542{
543 if (busyp->length) {
544 trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
545 busyp->length);
546 rb_erase(&busyp->rb_node, &pag->pagb_tree);
547 }
548
549 list_del_init(&busyp->list);
550 kmem_free(busyp);
551}
552
553/*
554 * Remove all extents on the passed in list from the busy extents tree.
555 * If do_discard is set skip extents that need to be discarded, and mark
556 * these as undergoing a discard operation instead.
557 */
558void
559xfs_extent_busy_clear(
560 struct xfs_mount *mp,
561 struct list_head *list,
562 bool do_discard)
563{
564 struct xfs_extent_busy *busyp, *n;
565 struct xfs_perag *pag = NULL;
566 xfs_agnumber_t agno = NULLAGNUMBER;
567
568 list_for_each_entry_safe(busyp, n, list, list) {
569 if (busyp->agno != agno) {
570 if (pag) {
571 spin_unlock(&pag->pagb_lock);
572 xfs_perag_put(pag);
573 }
574 pag = xfs_perag_get(mp, busyp->agno);
575 spin_lock(&pag->pagb_lock);
576 agno = busyp->agno;
577 }
578
579 if (do_discard && busyp->length &&
580 !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
581 busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
582 else
583 xfs_extent_busy_clear_one(mp, pag, busyp);
584 }
585
586 if (pag) {
587 spin_unlock(&pag->pagb_lock);
588 xfs_perag_put(pag);
589 }
590}
591
592/*
593 * Callback for list_sort to sort busy extents by the AG they reside in.
594 */
595int
596xfs_extent_busy_ag_cmp(
597 void *priv,
598 struct list_head *a,
599 struct list_head *b)
600{
601 return container_of(a, struct xfs_extent_busy, list)->agno -
602 container_of(b, struct xfs_extent_busy, list)->agno;
603}
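
Taken together, the new file above keeps the existing busy extent semantics under the new xfs_extent_busy_* names. A hedged lifecycle sketch with simplified call sites (tp, args, busy_list and the surrounding locking are assumed context, not code from this patch):

	/* freeing path: record the freed extent against the transaction */
	xfs_extent_busy_insert(tp, agno, bno, len, 0);

	/* allocation path: trim a found free extent around any busy ranges */
	xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);

	/* FITRIM/discard path: skip ranges that are still busy */
	if (xfs_extent_busy_search(mp, agno, fbno, flen))
		goto next_extent;

	/* once the freeing transaction is stable: sort per AG, then drop entries */
	xfs_extent_busy_sort(&busy_list);
	xfs_extent_busy_clear(mp, &busy_list, false);
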
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
new file mode 100644
index 000000000000..985412d65ba5
--- /dev/null
+++ b/fs/xfs/xfs_extent_busy.h
@@ -0,0 +1,69 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2010 David Chinner.
4 * Copyright (c) 2011 Christoph Hellwig.
5 * All Rights Reserved.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it would be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20#ifndef __XFS_EXTENT_BUSY_H__
21#define __XFS_EXTENT_BUSY_H__
22
23/*
24 * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
25 * have been freed but whose transactions aren't committed to disk yet.
26 *
27 * Note that we use the transaction ID to record the transaction, not the
28 * transaction structure itself. See xfs_extent_busy_insert() for details.
29 */
30struct xfs_extent_busy {
31 struct rb_node rb_node; /* ag by-bno indexed search tree */
32 struct list_head list; /* transaction busy extent list */
33 xfs_agnumber_t agno;
34 xfs_agblock_t bno;
35 xfs_extlen_t length;
36 unsigned int flags;
37#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
38#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */
39};
40
41void
42xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
43 xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
44
45void
46xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
47 bool do_discard);
48
49int
50xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agblock_t bno, xfs_extlen_t len);
52
53void
54xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
55 xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
56
57void
58xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno,
59 xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
60
61int
62xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
63
64static inline void xfs_extent_busy_sort(struct list_head *list)
65{
66 list_sort(NULL, list, xfs_extent_busy_ag_cmp);
67}
68
69#endif /* __XFS_EXTENT_BUSY_H__ */
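
The two flag bits interact with the discard path rather than with allocation. A short usage sketch based on the code above; the exact call sites are an assumption:

	/* caller marks blocks at insert time so they are never sent for discard */
	xfs_extent_busy_insert(tp, agno, bno, len, XFS_EXTENT_BUSY_SKIP_DISCARD);

	/*
	 * With do_discard set, xfs_extent_busy_clear() flags the remaining
	 * entries XFS_EXTENT_BUSY_DISCARDED instead of freeing them, and
	 * xfs_extent_busy_update_extent() backs off while that flag is set.
	 */
	xfs_extent_busy_clear(mp, &busy_list, true);
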
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 35c2aff38b20..feb36d7551ae 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_buf_item.h" 23#include "xfs_buf_item.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
@@ -64,7 +63,8 @@ __xfs_efi_release(
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { 63 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock); 64 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */ 65 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item); 66 xfs_trans_ail_delete(ailp, &efip->efi_item,
67 SHUTDOWN_LOG_IO_ERROR);
68 xfs_efi_item_free(efip); 68 xfs_efi_item_free(efip);
69 } 69 }
70} 70}
@@ -147,22 +147,20 @@ xfs_efi_item_unpin(
147} 147}
148 148
149/* 149/*
150 * Efi items have no locking or pushing. However, since EFIs are 150 * Efi items have no locking or pushing. However, since EFIs are pulled from
151 * pulled from the AIL when their corresponding EFDs are committed 151 * the AIL when their corresponding EFDs are committed to disk, their situation
152 * to disk, their situation is very similar to being pinned. Return 152 * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
153 * XFS_ITEM_PINNED so that the caller will eventually flush the log. 153 * will eventually flush the log. This should help in getting the EFI out of
154 * This should help in getting the EFI out of the AIL. 154 * the AIL.
155 */ 155 */
156STATIC uint 156STATIC uint
157xfs_efi_item_trylock( 157xfs_efi_item_push(
158 struct xfs_log_item *lip) 158 struct xfs_log_item *lip,
159 struct list_head *buffer_list)
159{ 160{
160 return XFS_ITEM_PINNED; 161 return XFS_ITEM_PINNED;
161} 162}
162 163
163/*
164 * Efi items have no locking, so just return.
165 */
166STATIC void 164STATIC void
167xfs_efi_item_unlock( 165xfs_efi_item_unlock(
168 struct xfs_log_item *lip) 166 struct xfs_log_item *lip)
@@ -190,17 +188,6 @@ xfs_efi_item_committed(
190} 188}
191 189
192/* 190/*
193 * There isn't much you can do to push on an efi item. It is simply
194 * stuck waiting for all of its corresponding efd items to be
195 * committed to disk.
196 */
197STATIC void
198xfs_efi_item_push(
199 struct xfs_log_item *lip)
200{
201}
202
203/*
204 * The EFI dependency tracking op doesn't do squat. It can't because 191 * The EFI dependency tracking op doesn't do squat. It can't because
205 * it doesn't know where the free extent is coming from. The dependency 192 * it doesn't know where the free extent is coming from. The dependency
206 * tracking has to be handled by the "enclosing" metadata object. For 193 * tracking has to be handled by the "enclosing" metadata object. For
@@ -222,7 +209,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = {
222 .iop_format = xfs_efi_item_format, 209 .iop_format = xfs_efi_item_format,
223 .iop_pin = xfs_efi_item_pin, 210 .iop_pin = xfs_efi_item_pin,
224 .iop_unpin = xfs_efi_item_unpin, 211 .iop_unpin = xfs_efi_item_unpin,
225 .iop_trylock = xfs_efi_item_trylock,
226 .iop_unlock = xfs_efi_item_unlock, 212 .iop_unlock = xfs_efi_item_unlock,
227 .iop_committed = xfs_efi_item_committed, 213 .iop_committed = xfs_efi_item_committed,
228 .iop_push = xfs_efi_item_push, 214 .iop_push = xfs_efi_item_push,
@@ -404,19 +390,17 @@ xfs_efd_item_unpin(
404} 390}
405 391
406/* 392/*
407 * Efd items have no locking, so just return success. 393 * There isn't much you can do to push on an efd item. It is simply stuck
394 * waiting for the log to be flushed to disk.
408 */ 395 */
409STATIC uint 396STATIC uint
410xfs_efd_item_trylock( 397xfs_efd_item_push(
411 struct xfs_log_item *lip) 398 struct xfs_log_item *lip,
399 struct list_head *buffer_list)
412{ 400{
413 return XFS_ITEM_LOCKED; 401 return XFS_ITEM_PINNED;
414} 402}
415 403
416/*
417 * Efd items have no locking or pushing, so return failure
418 * so that the caller doesn't bother with us.
419 */
420STATIC void 404STATIC void
421xfs_efd_item_unlock( 405xfs_efd_item_unlock(
422 struct xfs_log_item *lip) 406 struct xfs_log_item *lip)
@@ -451,16 +435,6 @@ xfs_efd_item_committed(
451} 435}
452 436
453/* 437/*
454 * There isn't much you can do to push on an efd item. It is simply
455 * stuck waiting for the log to be flushed to disk.
456 */
457STATIC void
458xfs_efd_item_push(
459 struct xfs_log_item *lip)
460{
461}
462
463/*
464 * The EFD dependency tracking op doesn't do squat. It can't because 438 * The EFD dependency tracking op doesn't do squat. It can't because
465 * it doesn't know where the free extent is coming from. The dependency 439 * it doesn't know where the free extent is coming from. The dependency
466 * tracking has to be handled by the "enclosing" metadata object. For 440 * tracking has to be handled by the "enclosing" metadata object. For
@@ -482,7 +456,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = {
482 .iop_format = xfs_efd_item_format, 456 .iop_format = xfs_efd_item_format,
483 .iop_pin = xfs_efd_item_pin, 457 .iop_pin = xfs_efd_item_pin,
484 .iop_unpin = xfs_efd_item_unpin, 458 .iop_unpin = xfs_efd_item_unpin,
485 .iop_trylock = xfs_efd_item_trylock,
486 .iop_unlock = xfs_efd_item_unlock, 459 .iop_unlock = xfs_efd_item_unlock,
487 .iop_committed = xfs_efd_item_committed, 460 .iop_committed = xfs_efd_item_committed,
488 .iop_push = xfs_efd_item_push, 461 .iop_push = xfs_efd_item_push,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 54a67dd9ac0a..8d214b87f6bb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -17,9 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h" 20#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_sb.h" 21#include "xfs_sb.h"
24#include "xfs_ag.h" 22#include "xfs_ag.h"
25#include "xfs_trans.h" 23#include "xfs_trans.h"
@@ -396,114 +394,96 @@ xfs_file_splice_write(
396} 394}
397 395
398/* 396/*
399 * This routine is called to handle zeroing any space in the last 397 * This routine is called to handle zeroing any space in the last block of the
400 * block of the file that is beyond the EOF. We do this since the 398 * file that is beyond the EOF. We do this since the size is being increased
401 * size is being increased without writing anything to that block 399 * without writing anything to that block and we don't want to read the
402 * and we don't want anyone to read the garbage on the disk. 400 * garbage on the disk.
403 */ 401 */
404STATIC int /* error (positive) */ 402STATIC int /* error (positive) */
405xfs_zero_last_block( 403xfs_zero_last_block(
406 xfs_inode_t *ip, 404 struct xfs_inode *ip,
407 xfs_fsize_t offset, 405 xfs_fsize_t offset,
408 xfs_fsize_t isize) 406 xfs_fsize_t isize)
409{ 407{
410 xfs_fileoff_t last_fsb; 408 struct xfs_mount *mp = ip->i_mount;
411 xfs_mount_t *mp = ip->i_mount; 409 xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
412 int nimaps; 410 int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
413 int zero_offset; 411 int zero_len;
414 int zero_len; 412 int nimaps = 1;
415 int error = 0; 413 int error = 0;
416 xfs_bmbt_irec_t imap; 414 struct xfs_bmbt_irec imap;
417
418 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
419
420 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
421 if (zero_offset == 0) {
422 /*
423 * There are no extra bytes in the last block on disk to
424 * zero, so return.
425 */
426 return 0;
427 }
428 415
429 last_fsb = XFS_B_TO_FSBT(mp, isize); 416 xfs_ilock(ip, XFS_ILOCK_EXCL);
430 nimaps = 1;
431 error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); 417 error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
418 xfs_iunlock(ip, XFS_ILOCK_EXCL);
432 if (error) 419 if (error)
433 return error; 420 return error;
421
434 ASSERT(nimaps > 0); 422 ASSERT(nimaps > 0);
423
435 /* 424 /*
436 * If the block underlying isize is just a hole, then there 425 * If the block underlying isize is just a hole, then there
437 * is nothing to zero. 426 * is nothing to zero.
438 */ 427 */
439 if (imap.br_startblock == HOLESTARTBLOCK) { 428 if (imap.br_startblock == HOLESTARTBLOCK)
440 return 0; 429 return 0;
441 }
442 /*
443 * Zero the part of the last block beyond the EOF, and write it
444 * out sync. We need to drop the ilock while we do this so we
445 * don't deadlock when the buffer cache calls back to us.
446 */
447 xfs_iunlock(ip, XFS_ILOCK_EXCL);
448 430
449 zero_len = mp->m_sb.sb_blocksize - zero_offset; 431 zero_len = mp->m_sb.sb_blocksize - zero_offset;
450 if (isize + zero_len > offset) 432 if (isize + zero_len > offset)
451 zero_len = offset - isize; 433 zero_len = offset - isize;
452 error = xfs_iozero(ip, isize, zero_len); 434 return xfs_iozero(ip, isize, zero_len);
453
454 xfs_ilock(ip, XFS_ILOCK_EXCL);
455 ASSERT(error >= 0);
456 return error;
457} 435}
458 436
459/* 437/*
460 * Zero any on disk space between the current EOF and the new, 438 * Zero any on disk space between the current EOF and the new, larger EOF.
461 * larger EOF. This handles the normal case of zeroing the remainder 439 *
462 * of the last block in the file and the unusual case of zeroing blocks 440 * This handles the normal case of zeroing the remainder of the last block in
463 * out beyond the size of the file. This second case only happens 441 * the file and the unusual case of zeroing blocks out beyond the size of the
464 * with fixed size extents and when the system crashes before the inode 442 * file. This second case only happens with fixed size extents and when the
465 * size was updated but after blocks were allocated. If fill is set, 443 * system crashes before the inode size was updated but after blocks were
466 * then any holes in the range are filled and zeroed. If not, the holes 444 * allocated.
467 * are left alone as holes. 445 *
446 * Expects the iolock to be held exclusive, and will take the ilock internally.
468 */ 447 */
469
470int /* error (positive) */ 448int /* error (positive) */
471xfs_zero_eof( 449xfs_zero_eof(
472 xfs_inode_t *ip, 450 struct xfs_inode *ip,
473 xfs_off_t offset, /* starting I/O offset */ 451 xfs_off_t offset, /* starting I/O offset */
474 xfs_fsize_t isize) /* current inode size */ 452 xfs_fsize_t isize) /* current inode size */
475{ 453{
476 xfs_mount_t *mp = ip->i_mount; 454 struct xfs_mount *mp = ip->i_mount;
477 xfs_fileoff_t start_zero_fsb; 455 xfs_fileoff_t start_zero_fsb;
478 xfs_fileoff_t end_zero_fsb; 456 xfs_fileoff_t end_zero_fsb;
479 xfs_fileoff_t zero_count_fsb; 457 xfs_fileoff_t zero_count_fsb;
480 xfs_fileoff_t last_fsb; 458 xfs_fileoff_t last_fsb;
481 xfs_fileoff_t zero_off; 459 xfs_fileoff_t zero_off;
482 xfs_fsize_t zero_len; 460 xfs_fsize_t zero_len;
483 int nimaps; 461 int nimaps;
484 int error = 0; 462 int error = 0;
485 xfs_bmbt_irec_t imap; 463 struct xfs_bmbt_irec imap;
486 464
487 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 465 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
488 ASSERT(offset > isize); 466 ASSERT(offset > isize);
489 467
490 /* 468 /*
491 * First handle zeroing the block on which isize resides. 469 * First handle zeroing the block on which isize resides.
470 *
492 * We only zero a part of that block so it is handled specially. 471 * We only zero a part of that block so it is handled specially.
493 */ 472 */
494 error = xfs_zero_last_block(ip, offset, isize); 473 if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
495 if (error) { 474 error = xfs_zero_last_block(ip, offset, isize);
496 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 475 if (error)
497 return error; 476 return error;
498 } 477 }
499 478
500 /* 479 /*
501 * Calculate the range between the new size and the old 480 * Calculate the range between the new size and the old where blocks
502 * where blocks needing to be zeroed may exist. To get the 481 * needing to be zeroed may exist.
503 * block where the last byte in the file currently resides, 482 *
504 * we need to subtract one from the size and truncate back 483 * To get the block where the last byte in the file currently resides,
505 * to a block boundary. We subtract 1 in case the size is 484 * we need to subtract one from the size and truncate back to a block
506 * exactly on a block boundary. 485 * boundary. We subtract 1 in case the size is exactly on a block
486 * boundary.
507 */ 487 */
508 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; 488 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
509 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); 489 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
@@ -521,23 +501,18 @@ xfs_zero_eof(
521 while (start_zero_fsb <= end_zero_fsb) { 501 while (start_zero_fsb <= end_zero_fsb) {
522 nimaps = 1; 502 nimaps = 1;
523 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; 503 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
504
505 xfs_ilock(ip, XFS_ILOCK_EXCL);
524 error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, 506 error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
525 &imap, &nimaps, 0); 507 &imap, &nimaps, 0);
526 if (error) { 508 xfs_iunlock(ip, XFS_ILOCK_EXCL);
527 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 509 if (error)
528 return error; 510 return error;
529 } 511
530 ASSERT(nimaps > 0); 512 ASSERT(nimaps > 0);
531 513
532 if (imap.br_state == XFS_EXT_UNWRITTEN || 514 if (imap.br_state == XFS_EXT_UNWRITTEN ||
533 imap.br_startblock == HOLESTARTBLOCK) { 515 imap.br_startblock == HOLESTARTBLOCK) {
534 /*
535 * This loop handles initializing pages that were
536 * partially initialized by the code below this
537 * loop. It basically zeroes the part of the page
538 * that sits on a hole and sets the page as P_HOLE
539 * and calls remapf if it is a mapped file.
540 */
541 start_zero_fsb = imap.br_startoff + imap.br_blockcount; 516 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
542 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 517 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
543 continue; 518 continue;
@@ -545,11 +520,7 @@ xfs_zero_eof(
545 520
546 /* 521 /*
547 * There are blocks we need to zero. 522 * There are blocks we need to zero.
548 * Drop the inode lock while we're doing the I/O.
549 * We'll still have the iolock to protect us.
550 */ 523 */
551 xfs_iunlock(ip, XFS_ILOCK_EXCL);
552
553 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); 524 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
554 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); 525 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
555 526
@@ -557,22 +528,14 @@ xfs_zero_eof(
557 zero_len = offset - zero_off; 528 zero_len = offset - zero_off;
558 529
559 error = xfs_iozero(ip, zero_off, zero_len); 530 error = xfs_iozero(ip, zero_off, zero_len);
560 if (error) { 531 if (error)
561 goto out_lock; 532 return error;
562 }
563 533
564 start_zero_fsb = imap.br_startoff + imap.br_blockcount; 534 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
565 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 535 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
566
567 xfs_ilock(ip, XFS_ILOCK_EXCL);
568 } 536 }
569 537
570 return 0; 538 return 0;
571
572out_lock:
573 xfs_ilock(ip, XFS_ILOCK_EXCL);
574 ASSERT(error >= 0);
575 return error;
576} 539}
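For reference, the range walked by the loop above is computed from the old size and the new write offset: the partial block containing isize is handled separately by xfs_zero_last_block(), and the loop then covers whole blocks from the first block past EOF up to the block holding the byte just before the write. A minimal userspace sketch of that computation (assuming a 4096-byte block size, and assuming end_zero_fsb is derived from offset - 1, which is not visible in this hunk):

/* Standalone illustration only; the real XFS macros operate on mp->m_sb fields. */
#include <stdio.h>

#define BLKSZ 4096ULL

static unsigned long long b_to_fsbt(unsigned long long b) { return b / BLKSZ; }             /* truncate */
static unsigned long long b_to_fsb(unsigned long long b) { return (b + BLKSZ - 1) / BLKSZ; } /* round up */

int main(void)
{
	unsigned long long isize = 10000;	/* current on-disk EOF */
	unsigned long long offset = 50000;	/* start of the new write, beyond EOF */

	unsigned long long start_zero_fsb = b_to_fsb(isize);		/* first whole block past EOF */
	unsigned long long end_zero_fsb = b_to_fsbt(offset - 1);	/* block holding the last byte before the write */

	if (start_zero_fsb <= end_zero_fsb)
		printf("zero whole blocks %llu..%llu\n", start_zero_fsb, end_zero_fsb);
	return 0;
}

With these made-up numbers the sketch reports blocks 3..12; the partial block 2 (bytes 10000..12287) is the xfs_zero_last_block() case.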
577 540
578/* 541/*
@@ -593,35 +556,29 @@ xfs_file_aio_write_checks(
593 struct xfs_inode *ip = XFS_I(inode); 556 struct xfs_inode *ip = XFS_I(inode);
594 int error = 0; 557 int error = 0;
595 558
596 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
597restart: 559restart:
598 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); 560 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
599 if (error) { 561 if (error)
600 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
601 return error; 562 return error;
602 }
603 563
604 /* 564 /*
605 * If the offset is beyond the size of the file, we need to zero any 565 * If the offset is beyond the size of the file, we need to zero any
606 * blocks that fall between the existing EOF and the start of this 566 * blocks that fall between the existing EOF and the start of this
607 * write. If zeroing is needed and we are currently holding the 567 * write. If zeroing is needed and we are currently holding the
608 * iolock shared, we need to update it to exclusive which involves 568 * iolock shared, we need to update it to exclusive which implies
609 * dropping all locks and relocking to maintain correct locking order. 569 * having to redo all checks before.
610 * If we do this, restart the function to ensure all checks and values
611 * are still valid.
612 */ 570 */
613 if (*pos > i_size_read(inode)) { 571 if (*pos > i_size_read(inode)) {
614 if (*iolock == XFS_IOLOCK_SHARED) { 572 if (*iolock == XFS_IOLOCK_SHARED) {
615 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 573 xfs_rw_iunlock(ip, *iolock);
616 *iolock = XFS_IOLOCK_EXCL; 574 *iolock = XFS_IOLOCK_EXCL;
617 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); 575 xfs_rw_ilock(ip, *iolock);
618 goto restart; 576 goto restart;
619 } 577 }
620 error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); 578 error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
579 if (error)
580 return error;
621 } 581 }
622 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
623 if (error)
624 return error;
625 582
626 /* 583 /*
627 * Updating the timestamps will grab the ilock again from 584 * Updating the timestamps will grab the ilock again from
@@ -638,7 +595,6 @@ restart:
638 * people from modifying setuid and setgid binaries. 595 * people from modifying setuid and setgid binaries.
639 */ 596 */
640 return file_remove_suid(file); 597 return file_remove_suid(file);
641
642} 598}
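The restart above exists because a shared iolock cannot be upgraded in place: the lock is dropped, retaken exclusive, and the checks are re-run since the file may have changed in the window. A generic sketch of that drop, relock, recheck pattern, using a plain pthreads rwlock rather than the XFS locking helpers:

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical example: validate state under a shared lock, switching to an
 * exclusive lock (by dropping and relocking) only when mutation is required. */
static int check_and_maybe_mutate(pthread_rwlock_t *lk, bool (*needs_mutation)(void))
{
	bool exclusive = false;

	pthread_rwlock_rdlock(lk);
restart:
	/* ... checks that are valid under whichever lock mode is currently held ... */
	if (needs_mutation() && !exclusive) {
		pthread_rwlock_unlock(lk);	/* cannot upgrade atomically */
		pthread_rwlock_wrlock(lk);
		exclusive = true;
		goto restart;			/* state may have changed: redo every check */
	}
	/* ... safe to proceed; mutate only if exclusive ... */
	pthread_rwlock_unlock(lk);
	return 0;
}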
643 599
644/* 600/*
@@ -1007,8 +963,149 @@ xfs_vm_page_mkwrite(
1007 return block_page_mkwrite(vma, vmf, xfs_get_blocks); 963 return block_page_mkwrite(vma, vmf, xfs_get_blocks);
1008} 964}
1009 965
966STATIC loff_t
967xfs_seek_data(
968 struct file *file,
969 loff_t start,
970 u32 type)
971{
972 struct inode *inode = file->f_mapping->host;
973 struct xfs_inode *ip = XFS_I(inode);
974 struct xfs_mount *mp = ip->i_mount;
975 struct xfs_bmbt_irec map[2];
976 int nmap = 2;
977 loff_t uninitialized_var(offset);
978 xfs_fsize_t isize;
979 xfs_fileoff_t fsbno;
980 xfs_filblks_t end;
981 uint lock;
982 int error;
983
984 lock = xfs_ilock_map_shared(ip);
985
986 isize = i_size_read(inode);
987 if (start >= isize) {
988 error = ENXIO;
989 goto out_unlock;
990 }
991
992 fsbno = XFS_B_TO_FSBT(mp, start);
993
994 /*
995 * Try to read extents from the first block indicated
996 * by fsbno to the end block of the file.
997 */
998 end = XFS_B_TO_FSB(mp, isize);
999
1000 error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
1001 XFS_BMAPI_ENTIRE);
1002 if (error)
1003 goto out_unlock;
1004
1005 /*
 1006 * Treat an unwritten extent as a data extent since it might
 1007 * contain dirty data in the page cache.
1008 */
1009 if (map[0].br_startblock != HOLESTARTBLOCK) {
1010 offset = max_t(loff_t, start,
1011 XFS_FSB_TO_B(mp, map[0].br_startoff));
1012 } else {
1013 if (nmap == 1) {
1014 error = ENXIO;
1015 goto out_unlock;
1016 }
1017
1018 offset = max_t(loff_t, start,
1019 XFS_FSB_TO_B(mp, map[1].br_startoff));
1020 }
1021
1022 if (offset != file->f_pos)
1023 file->f_pos = offset;
1024
1025out_unlock:
1026 xfs_iunlock_map_shared(ip, lock);
1027
1028 if (error)
1029 return -error;
1030 return offset;
1031}
1032
1033STATIC loff_t
1034xfs_seek_hole(
1035 struct file *file,
1036 loff_t start,
1037 u32 type)
1038{
1039 struct inode *inode = file->f_mapping->host;
1040 struct xfs_inode *ip = XFS_I(inode);
1041 struct xfs_mount *mp = ip->i_mount;
1042 loff_t uninitialized_var(offset);
1043 loff_t holeoff;
1044 xfs_fsize_t isize;
1045 xfs_fileoff_t fsbno;
1046 uint lock;
1047 int error;
1048
1049 if (XFS_FORCED_SHUTDOWN(mp))
1050 return -XFS_ERROR(EIO);
1051
1052 lock = xfs_ilock_map_shared(ip);
1053
1054 isize = i_size_read(inode);
1055 if (start >= isize) {
1056 error = ENXIO;
1057 goto out_unlock;
1058 }
1059
1060 fsbno = XFS_B_TO_FSBT(mp, start);
1061 error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK);
1062 if (error)
1063 goto out_unlock;
1064
1065 holeoff = XFS_FSB_TO_B(mp, fsbno);
1066 if (holeoff <= start)
1067 offset = start;
1068 else {
1069 /*
1070 * xfs_bmap_first_unused() could return a value bigger than
1071 * isize if there are no more holes past the supplied offset.
1072 */
1073 offset = min_t(loff_t, holeoff, isize);
1074 }
1075
1076 if (offset != file->f_pos)
1077 file->f_pos = offset;
1078
1079out_unlock:
1080 xfs_iunlock_map_shared(ip, lock);
1081
1082 if (error)
1083 return -error;
1084 return offset;
1085}
1086
1087STATIC loff_t
1088xfs_file_llseek(
1089 struct file *file,
1090 loff_t offset,
1091 int origin)
1092{
1093 switch (origin) {
1094 case SEEK_END:
1095 case SEEK_CUR:
1096 case SEEK_SET:
1097 return generic_file_llseek(file, offset, origin);
1098 case SEEK_DATA:
1099 return xfs_seek_data(file, offset, origin);
1100 case SEEK_HOLE:
1101 return xfs_seek_hole(file, offset, origin);
1102 default:
1103 return -EINVAL;
1104 }
1105}
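From user space the two new cases are reached through the standard lseek() whence values; for example:

#define _GNU_SOURCE		/* for SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t data, hole;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	data = lseek(fd, 0, SEEK_DATA);		/* first data at or after offset 0 */
	if (data < 0) {				/* ENXIO: no data at or after this offset */
		close(fd);
		return 1;
	}
	hole = lseek(fd, data, SEEK_HOLE);	/* next hole; file size if none before EOF */
	printf("data at %lld, next hole at %lld\n", (long long)data, (long long)hole);
	close(fd);
	return 0;
}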
1106
1010const struct file_operations xfs_file_operations = { 1107const struct file_operations xfs_file_operations = {
1011 .llseek = generic_file_llseek, 1108 .llseek = xfs_file_llseek,
1012 .read = do_sync_read, 1109 .read = do_sync_read,
1013 .write = do_sync_write, 1110 .write = do_sync_write,
1014 .aio_read = xfs_file_aio_read, 1111 .aio_read = xfs_file_aio_read,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 1c6fdeb702ff..c25b094efbf7 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -18,8 +18,6 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_inum.h"
23#include "xfs_log.h" 21#include "xfs_log.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -39,7 +37,6 @@
39#include "xfs_itable.h" 37#include "xfs_itable.h"
40#include "xfs_trans_space.h" 38#include "xfs_trans_space.h"
41#include "xfs_rtalloc.h" 39#include "xfs_rtalloc.h"
42#include "xfs_rw.h"
43#include "xfs_filestream.h" 40#include "xfs_filestream.h"
44#include "xfs_trace.h" 41#include "xfs_trace.h"
45 42
@@ -147,9 +144,9 @@ xfs_growfs_data_private(
147 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) 144 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
148 return error; 145 return error;
149 dpct = pct - mp->m_sb.sb_imax_pct; 146 dpct = pct - mp->m_sb.sb_imax_pct;
150 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 147 bp = xfs_buf_read_uncached(mp->m_ddev_targp,
151 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
152 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); 149 XFS_FSS_TO_BB(mp, 1), 0);
153 if (!bp) 150 if (!bp)
154 return EIO; 151 return EIO;
155 xfs_buf_relse(bp); 152 xfs_buf_relse(bp);
@@ -193,7 +190,7 @@ xfs_growfs_data_private(
193 */ 190 */
194 bp = xfs_buf_get(mp->m_ddev_targp, 191 bp = xfs_buf_get(mp->m_ddev_targp,
195 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 192 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
196 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); 193 XFS_FSS_TO_BB(mp, 1), 0);
197 if (!bp) { 194 if (!bp) {
198 error = ENOMEM; 195 error = ENOMEM;
199 goto error0; 196 goto error0;
@@ -230,7 +227,7 @@ xfs_growfs_data_private(
230 */ 227 */
231 bp = xfs_buf_get(mp->m_ddev_targp, 228 bp = xfs_buf_get(mp->m_ddev_targp,
232 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 229 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
233 XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); 230 XFS_FSS_TO_BB(mp, 1), 0);
234 if (!bp) { 231 if (!bp) {
235 error = ENOMEM; 232 error = ENOMEM;
236 goto error0; 233 goto error0;
@@ -259,8 +256,7 @@ xfs_growfs_data_private(
259 */ 256 */
260 bp = xfs_buf_get(mp->m_ddev_targp, 257 bp = xfs_buf_get(mp->m_ddev_targp,
261 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 258 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
262 BTOBB(mp->m_sb.sb_blocksize), 259 BTOBB(mp->m_sb.sb_blocksize), 0);
263 XBF_LOCK | XBF_MAPPED);
264 if (!bp) { 260 if (!bp) {
265 error = ENOMEM; 261 error = ENOMEM;
266 goto error0; 262 goto error0;
@@ -286,8 +282,7 @@ xfs_growfs_data_private(
286 */ 282 */
287 bp = xfs_buf_get(mp->m_ddev_targp, 283 bp = xfs_buf_get(mp->m_ddev_targp,
288 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 284 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
289 BTOBB(mp->m_sb.sb_blocksize), 285 BTOBB(mp->m_sb.sb_blocksize), 0);
290 XBF_LOCK | XBF_MAPPED);
291 if (!bp) { 286 if (!bp) {
292 error = ENOMEM; 287 error = ENOMEM;
293 goto error0; 288 goto error0;
@@ -314,8 +309,7 @@ xfs_growfs_data_private(
314 */ 309 */
315 bp = xfs_buf_get(mp->m_ddev_targp, 310 bp = xfs_buf_get(mp->m_ddev_targp,
316 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 311 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
317 BTOBB(mp->m_sb.sb_blocksize), 312 BTOBB(mp->m_sb.sb_blocksize), 0);
318 XBF_LOCK | XBF_MAPPED);
319 if (!bp) { 313 if (!bp) {
320 error = ENOMEM; 314 error = ENOMEM;
321 goto error0; 315 goto error0;
@@ -405,7 +399,7 @@ xfs_growfs_data_private(
405 399
406 /* update secondary superblocks. */ 400 /* update secondary superblocks. */
407 for (agno = 1; agno < nagcount; agno++) { 401 for (agno = 1; agno < nagcount; agno++) {
408 error = xfs_read_buf(mp, mp->m_ddev_targp, 402 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
409 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 403 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
410 XFS_FSS_TO_BB(mp, 1), 0, &bp); 404 XFS_FSS_TO_BB(mp, 1), 0, &bp);
411 if (error) { 405 if (error) {
@@ -693,3 +687,63 @@ xfs_fs_goingdown(
693 687
694 return 0; 688 return 0;
695} 689}
690
691/*
692 * Force a shutdown of the filesystem instantly while keeping the filesystem
693 * consistent. We don't do an unmount here; just shutdown the shop, make sure
694 * that absolutely nothing persistent happens to this filesystem after this
695 * point.
696 */
697void
698xfs_do_force_shutdown(
699 xfs_mount_t *mp,
700 int flags,
701 char *fname,
702 int lnnum)
703{
704 int logerror;
705
706 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
707
708 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
709 xfs_notice(mp,
710 "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
711 __func__, flags, lnnum, fname, __return_address);
712 }
713 /*
714 * No need to duplicate efforts.
715 */
716 if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
717 return;
718
719 /*
720 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
721 * queue up anybody new on the log reservations, and wakes up
722 * everybody who's sleeping on log reservations to tell them
723 * the bad news.
724 */
725 if (xfs_log_force_umount(mp, logerror))
726 return;
727
728 if (flags & SHUTDOWN_CORRUPT_INCORE) {
729 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
730 "Corruption of in-memory data detected. Shutting down filesystem");
731 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
732 xfs_stack_trace();
733 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
734 if (logerror) {
735 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
736 "Log I/O Error Detected. Shutting down filesystem");
737 } else if (flags & SHUTDOWN_DEVICE_REQ) {
738 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
739 "All device paths lost. Shutting down filesystem");
740 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
741 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
742 "I/O Error Detected. Shutting down filesystem");
743 }
744 }
745 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
746 xfs_alert(mp,
747 "Please umount the filesystem and rectify the problem(s)");
748 }
749}
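Callers normally reach this through a wrapper macro that captures the call site for the diagnostic printed above; it has roughly the following shape (an approximation of the xfs_mount.h definition, not a verbatim quote), and is used for example as xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE) in the xfs_iflush() hunk later in this diff:

/* Approximate shape of the convenience wrapper (assumed; see xfs_mount.h): */
#define xfs_force_shutdown(m, f) \
	xfs_do_force_shutdown((m), (f), __FILE__, __LINE__)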
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index dad1a31aa4fc..177a21a7ac49 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,8 +200,7 @@ xfs_ialloc_inode_init(
200 */ 200 */
201 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); 201 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
203 mp->m_bsize * blks_per_cluster, 203 mp->m_bsize * blks_per_cluster, 0);
204 XBF_LOCK);
205 if (!fbuf) 204 if (!fbuf)
206 return ENOMEM; 205 return ENOMEM;
207 /* 206 /*
@@ -610,6 +609,13 @@ xfs_ialloc_get_rec(
610/* 609/*
611 * Visible inode allocation functions. 610 * Visible inode allocation functions.
612 */ 611 */
612/*
613 * Find a free (set) bit in the inode bitmask.
614 */
615static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
616{
617 return xfs_lowbit64(*fp);
618}
613 619
614/* 620/*
615 * Allocate an inode on disk. 621 * Allocate an inode on disk.
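The xfs_ialloc_find_free() helper moved above scans a 64-bit per-chunk bitmask in which a set bit marks a free inode, returning the index of the lowest set bit via xfs_lowbit64() (assumed here to yield -1 for an empty mask). A standalone sketch of the same operation using a compiler builtin in place of the XFS helper:

#include <stdint.h>
#include <stdio.h>

/* Index of the lowest set (free) bit, or -1 if no inode in the chunk is free. */
static int lowest_free_bit(uint64_t inofree)
{
	return inofree ? __builtin_ctzll(inofree) : -1;
}

int main(void)
{
	uint64_t inofree = 0xb0;	/* inodes 4, 5 and 7 of this chunk are free */
	printf("first free inode index: %d\n", lowest_free_bit(inofree));	/* prints 4 */
	return 0;
}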
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 666a037398d6..65ac57c8063c 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -47,15 +47,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
47} 47}
48 48
49/* 49/*
50 * Find a free (set) bit in the inode bitmask.
51 */
52static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
53{
54 return xfs_lowbit64(*fp);
55}
56
57
58/*
59 * Allocate an inode on disk. 50 * Allocate an inode on disk.
60 * Mode is used to tell whether the new inode will need space, and whether 51 * Mode is used to tell whether the new inode will need space, and whether
61 * it is a directory. 52 * it is a directory.
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c6a75815aea0..2b8b7a37aa18 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -20,7 +20,6 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index bcc6c249b2c7..1bb4365e8c25 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_acl.h" 21#include "xfs_acl.h"
22#include "xfs_bit.h"
23#include "xfs_log.h" 22#include "xfs_log.h"
24#include "xfs_inum.h" 23#include "xfs_inum.h"
25#include "xfs_trans.h" 24#include "xfs_trans.h"
@@ -123,23 +122,7 @@ xfs_inode_free(
123 xfs_idestroy_fork(ip, XFS_ATTR_FORK); 122 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
124 123
125 if (ip->i_itemp) { 124 if (ip->i_itemp) {
126 /* 125 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
127 * Only if we are shutting down the fs will we see an
128 * inode still in the AIL. If it is there, we should remove
129 * it to prevent a use-after-free from occurring.
130 */
131 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
132 struct xfs_ail *ailp = lip->li_ailp;
133
134 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
135 XFS_FORCED_SHUTDOWN(ip->i_mount));
136 if (lip->li_flags & XFS_LI_IN_AIL) {
137 spin_lock(&ailp->xa_lock);
138 if (lip->li_flags & XFS_LI_IN_AIL)
139 xfs_trans_ail_delete(ailp, lip);
140 else
141 spin_unlock(&ailp->xa_lock);
142 }
143 xfs_inode_item_destroy(ip); 126 xfs_inode_item_destroy(ip);
144 ip->i_itemp = NULL; 127 ip->i_itemp = NULL;
145 } 128 }
@@ -334,9 +317,10 @@ xfs_iget_cache_miss(
334 /* 317 /*
335 * Preload the radix tree so we can insert safely under the 318 * Preload the radix tree so we can insert safely under the
336 * write spinlock. Note that we cannot sleep inside the preload 319 * write spinlock. Note that we cannot sleep inside the preload
337 * region. 320 * region. Since we can be called from transaction context, don't
321 * recurse into the file system.
338 */ 322 */
339 if (radix_tree_preload(GFP_KERNEL)) { 323 if (radix_tree_preload(GFP_NOFS)) {
340 error = EAGAIN; 324 error = EAGAIN;
341 goto out_destroy; 325 goto out_destroy;
342 } 326 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bc46c0a133d3..a59eea09930a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -20,7 +20,6 @@
20#include "xfs.h" 20#include "xfs.h"
21#include "xfs_fs.h" 21#include "xfs_fs.h"
22#include "xfs_types.h" 22#include "xfs_types.h"
23#include "xfs_bit.h"
24#include "xfs_log.h" 23#include "xfs_log.h"
25#include "xfs_inum.h" 24#include "xfs_inum.h"
26#include "xfs_trans.h" 25#include "xfs_trans.h"
@@ -61,6 +60,20 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
61STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); 60STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
62STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); 61STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
63 62
63/*
64 * helper function to extract extent size hint from inode
65 */
66xfs_extlen_t
67xfs_get_extsz_hint(
68 struct xfs_inode *ip)
69{
70 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
71 return ip->i_d.di_extsize;
72 if (XFS_IS_REALTIME_INODE(ip))
73 return ip->i_mount->m_sb.sb_rextsize;
74 return 0;
75}
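The helper added here returns the extent size hint used to align allocations: the per-inode di_extsize when XFS_DIFLAG_EXTSIZE is set, the realtime extent size for realtime inodes, and 0 (no hint) otherwise. A toy sketch of how such a hint would round a block count, with made-up values:

#include <stdint.h>
#include <stdio.h>

/* Round an allocation request up to a multiple of the extent size hint;
 * a hint of 0 means no alignment was requested. */
static uint64_t round_to_extsz(uint64_t count_fsb, uint32_t extsz)
{
	if (!extsz)
		return count_fsb;
	return ((count_fsb + extsz - 1) / extsz) * extsz;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)round_to_extsz(3, 16));	/* prints 16 */
	return 0;
}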
76
64#ifdef DEBUG 77#ifdef DEBUG
65/* 78/*
66 * Make sure that the extents in the given memory buffer 79 * Make sure that the extents in the given memory buffer
@@ -137,6 +150,7 @@ xfs_imap_to_bp(
137 int ni; 150 int ni;
138 xfs_buf_t *bp; 151 xfs_buf_t *bp;
139 152
153 buf_flags |= XBF_UNMAPPED;
140 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 154 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
141 (int)imap->im_len, buf_flags, &bp); 155 (int)imap->im_len, buf_flags, &bp);
142 if (error) { 156 if (error) {
@@ -226,7 +240,7 @@ xfs_inotobp(
226 if (error) 240 if (error)
227 return error; 241 return error;
228 242
229 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); 243 error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags);
230 if (error) 244 if (error)
231 return error; 245 return error;
232 246
@@ -782,8 +796,7 @@ xfs_iread(
782 /* 796 /*
783 * Get pointers to the on-disk inode and the buffer containing it. 797 * Get pointers to the on-disk inode and the buffer containing it.
784 */ 798 */
785 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 799 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags);
786 XBF_LOCK, iget_flags);
787 if (error) 800 if (error)
788 return error; 801 return error;
789 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 802 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -1342,7 +1355,7 @@ xfs_iunlink(
1342 * Here we put the head pointer into our next pointer, 1355 * Here we put the head pointer into our next pointer,
1343 * and then we fall through to point the head at us. 1356 * and then we fall through to point the head at us.
1344 */ 1357 */
1345 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1358 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1346 if (error) 1359 if (error)
1347 return error; 1360 return error;
1348 1361
@@ -1423,7 +1436,7 @@ xfs_iunlink_remove(
1423 * of dealing with the buffer when there is no need to 1436 * of dealing with the buffer when there is no need to
1424 * change it. 1437 * change it.
1425 */ 1438 */
1426 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1439 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1427 if (error) { 1440 if (error) {
1428 xfs_warn(mp, "%s: xfs_itobp() returned error %d.", 1441 xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
1429 __func__, error); 1442 __func__, error);
@@ -1484,7 +1497,7 @@ xfs_iunlink_remove(
1484 * Now last_ibp points to the buffer previous to us on 1497 * Now last_ibp points to the buffer previous to us on
1485 * the unlinked list. Pull us from the list. 1498 * the unlinked list. Pull us from the list.
1486 */ 1499 */
1487 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1500 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1488 if (error) { 1501 if (error) {
1489 xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", 1502 xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
1490 __func__, error); 1503 __func__, error);
@@ -1566,8 +1579,7 @@ xfs_ifree_cluster(
1566 * to mark all the active inodes on the buffer stale. 1579 * to mark all the active inodes on the buffer stale.
1567 */ 1580 */
1568 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 1581 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1569 mp->m_bsize * blks_per_cluster, 1582 mp->m_bsize * blks_per_cluster, 0);
1570 XBF_LOCK);
1571 1583
1572 if (!bp) 1584 if (!bp)
1573 return ENOMEM; 1585 return ENOMEM;
@@ -1737,7 +1749,7 @@ xfs_ifree(
1737 1749
1738 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1750 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1739 1751
1740 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); 1752 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0);
1741 if (error) 1753 if (error)
1742 return error; 1754 return error;
1743 1755
@@ -2347,11 +2359,11 @@ cluster_corrupt_out:
2347 */ 2359 */
2348 rcu_read_unlock(); 2360 rcu_read_unlock();
2349 /* 2361 /*
2350 * Clean up the buffer. If it was B_DELWRI, just release it -- 2362 * Clean up the buffer. If it was delwri, just release it --
2351 * brelse can handle it with no problems. If not, shut down the 2363 * brelse can handle it with no problems. If not, shut down the
2352 * filesystem before releasing the buffer. 2364 * filesystem before releasing the buffer.
2353 */ 2365 */
2354 bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); 2366 bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
2355 if (bufwasdelwri) 2367 if (bufwasdelwri)
2356 xfs_buf_relse(bp); 2368 xfs_buf_relse(bp);
2357 2369
@@ -2377,30 +2389,29 @@ cluster_corrupt_out:
2377 /* 2389 /*
2378 * Unlocks the flush lock 2390 * Unlocks the flush lock
2379 */ 2391 */
2380 xfs_iflush_abort(iq); 2392 xfs_iflush_abort(iq, false);
2381 kmem_free(ilist); 2393 kmem_free(ilist);
2382 xfs_perag_put(pag); 2394 xfs_perag_put(pag);
2383 return XFS_ERROR(EFSCORRUPTED); 2395 return XFS_ERROR(EFSCORRUPTED);
2384} 2396}
2385 2397
2386/* 2398/*
2387 * xfs_iflush() will write a modified inode's changes out to the 2399 * Flush dirty inode metadata into the backing buffer.
2388 * inode's on disk home. The caller must have the inode lock held 2400 *
2389 * in at least shared mode and the inode flush completion must be 2401 * The caller must have the inode lock and the inode flush lock held. The
2390 * active as well. The inode lock will still be held upon return from 2402 * inode lock will still be held upon return to the caller, and the inode
2391 * the call and the caller is free to unlock it. 2403 * flush lock will be released after the inode has reached the disk.
2392 * The inode flush will be completed when the inode reaches the disk. 2404 *
2393 * The flags indicate how the inode's buffer should be written out. 2405 * The caller must write out the buffer returned in *bpp and release it.
2394 */ 2406 */
2395int 2407int
2396xfs_iflush( 2408xfs_iflush(
2397 xfs_inode_t *ip, 2409 struct xfs_inode *ip,
2398 uint flags) 2410 struct xfs_buf **bpp)
2399{ 2411{
2400 xfs_inode_log_item_t *iip; 2412 struct xfs_mount *mp = ip->i_mount;
2401 xfs_buf_t *bp; 2413 struct xfs_buf *bp;
2402 xfs_dinode_t *dip; 2414 struct xfs_dinode *dip;
2403 xfs_mount_t *mp;
2404 int error; 2415 int error;
2405 2416
2406 XFS_STATS_INC(xs_iflush_count); 2417 XFS_STATS_INC(xs_iflush_count);
@@ -2410,25 +2421,8 @@ xfs_iflush(
2410 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2421 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2411 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 2422 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2412 2423
2413 iip = ip->i_itemp; 2424 *bpp = NULL;
2414 mp = ip->i_mount;
2415 2425
2416 /*
2417 * We can't flush the inode until it is unpinned, so wait for it if we
2418 * are allowed to block. We know no one new can pin it, because we are
2419 * holding the inode lock shared and you need to hold it exclusively to
2420 * pin the inode.
2421 *
2422 * If we are not allowed to block, force the log out asynchronously so
2423 * that when we come back the inode will be unpinned. If other inodes
2424 * in the same cluster are dirty, they will probably write the inode
2425 * out for us if they occur after the log force completes.
2426 */
2427 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2428 xfs_iunpin(ip);
2429 xfs_ifunlock(ip);
2430 return EAGAIN;
2431 }
2432 xfs_iunpin_wait(ip); 2426 xfs_iunpin_wait(ip);
2433 2427
2434 /* 2428 /*
@@ -2447,20 +2441,20 @@ xfs_iflush(
2447 /* 2441 /*
2448 * This may have been unpinned because the filesystem is shutting 2442 * This may have been unpinned because the filesystem is shutting
2449 * down forcibly. If that's the case we must not write this inode 2443 * down forcibly. If that's the case we must not write this inode
2450 * to disk, because the log record didn't make it to disk! 2444 * to disk, because the log record didn't make it to disk.
2445 *
2446 * We also have to remove the log item from the AIL in this case,
2447 * as we wait for an empty AIL as part of the unmount process.
2451 */ 2448 */
2452 if (XFS_FORCED_SHUTDOWN(mp)) { 2449 if (XFS_FORCED_SHUTDOWN(mp)) {
2453 if (iip) 2450 error = XFS_ERROR(EIO);
2454 iip->ili_fields = 0; 2451 goto abort_out;
2455 xfs_ifunlock(ip);
2456 return XFS_ERROR(EIO);
2457 } 2452 }
2458 2453
2459 /* 2454 /*
2460 * Get the buffer containing the on-disk inode. 2455 * Get the buffer containing the on-disk inode.
2461 */ 2456 */
2462 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2457 error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK);
2463 (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
2464 if (error || !bp) { 2458 if (error || !bp) {
2465 xfs_ifunlock(ip); 2459 xfs_ifunlock(ip);
2466 return error; 2460 return error;
@@ -2488,23 +2482,20 @@ xfs_iflush(
2488 if (error) 2482 if (error)
2489 goto cluster_corrupt_out; 2483 goto cluster_corrupt_out;
2490 2484
2491 if (flags & SYNC_WAIT) 2485 *bpp = bp;
2492 error = xfs_bwrite(bp); 2486 return 0;
2493 else
2494 xfs_buf_delwri_queue(bp);
2495
2496 xfs_buf_relse(bp);
2497 return error;
2498 2487
2499corrupt_out: 2488corrupt_out:
2500 xfs_buf_relse(bp); 2489 xfs_buf_relse(bp);
2501 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 2490 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2502cluster_corrupt_out: 2491cluster_corrupt_out:
2492 error = XFS_ERROR(EFSCORRUPTED);
2493abort_out:
2503 /* 2494 /*
2504 * Unlocks the flush lock 2495 * Unlocks the flush lock
2505 */ 2496 */
2506 xfs_iflush_abort(ip); 2497 xfs_iflush_abort(ip, false);
2507 return XFS_ERROR(EFSCORRUPTED); 2498 return error;
2508} 2499}
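With this change the caller owns the buffer returned through *bpp and is responsible for writing it out and releasing it; the expected calling pattern (mirroring the reworked xfs_inode_item_push() further down in this diff) is roughly:

	struct xfs_buf	*bp = NULL;
	int		error;

	error = xfs_iflush(ip, &bp);
	if (!error) {
		/* queue for delayed write, then drop this reference */
		xfs_buf_delwri_queue(bp, buffer_list);
		xfs_buf_relse(bp);
	}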
2509 2500
2510 2501
@@ -2706,27 +2697,6 @@ corrupt_out:
2706 return XFS_ERROR(EFSCORRUPTED); 2697 return XFS_ERROR(EFSCORRUPTED);
2707} 2698}
2708 2699
2709void
2710xfs_promote_inode(
2711 struct xfs_inode *ip)
2712{
2713 struct xfs_buf *bp;
2714
2715 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2716
2717 bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
2718 ip->i_imap.im_len, XBF_TRYLOCK);
2719 if (!bp)
2720 return;
2721
2722 if (XFS_BUF_ISDELAYWRITE(bp)) {
2723 xfs_buf_delwri_promote(bp);
2724 wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
2725 }
2726
2727 xfs_buf_relse(bp);
2728}
2729
2730/* 2700/*
2731 * Return a pointer to the extent record at file index idx. 2701 * Return a pointer to the extent record at file index idx.
2732 */ 2702 */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 7fee3387e1c8..1efff36a75b6 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -529,11 +529,12 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
529 529
530void xfs_iext_realloc(xfs_inode_t *, int, int); 530void xfs_iext_realloc(xfs_inode_t *, int, int);
531void xfs_iunpin_wait(xfs_inode_t *); 531void xfs_iunpin_wait(xfs_inode_t *);
532int xfs_iflush(xfs_inode_t *, uint); 532int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
533void xfs_promote_inode(struct xfs_inode *);
534void xfs_lock_inodes(xfs_inode_t **, int, uint); 533void xfs_lock_inodes(xfs_inode_t **, int, uint);
535void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 534void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
536 535
536xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
537
537#define IHOLD(ip) \ 538#define IHOLD(ip) \
538do { \ 539do { \
539 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 540 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 05d924efceaf..6cdbf90c6f7b 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -18,9 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_sb.h" 23#include "xfs_sb.h"
26#include "xfs_ag.h" 24#include "xfs_ag.h"
@@ -480,25 +478,16 @@ xfs_inode_item_unpin(
480 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); 478 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
481} 479}
482 480
483/*
484 * This is called to attempt to lock the inode associated with this
485 * inode log item, in preparation for the push routine which does the actual
486 * iflush. Don't sleep on the inode lock or the flush lock.
487 *
488 * If the flush lock is already held, indicating that the inode has
489 * been or is in the process of being flushed, then (ideally) we'd like to
490 * see if the inode's buffer is still incore, and if so give it a nudge.
491 * We delay doing so until the pushbuf routine, though, to avoid holding
492 * the AIL lock across a call to the blackhole which is the buffer cache.
493 * Also we don't want to sleep in any device strategy routines, which can happen
494 * if we do the subsequent bawrite in here.
495 */
496STATIC uint 481STATIC uint
497xfs_inode_item_trylock( 482xfs_inode_item_push(
498 struct xfs_log_item *lip) 483 struct xfs_log_item *lip,
484 struct list_head *buffer_list)
499{ 485{
500 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 486 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
501 struct xfs_inode *ip = iip->ili_inode; 487 struct xfs_inode *ip = iip->ili_inode;
488 struct xfs_buf *bp = NULL;
489 uint rval = XFS_ITEM_SUCCESS;
490 int error;
502 491
503 if (xfs_ipincount(ip) > 0) 492 if (xfs_ipincount(ip) > 0)
504 return XFS_ITEM_PINNED; 493 return XFS_ITEM_PINNED;
@@ -506,30 +495,50 @@ xfs_inode_item_trylock(
506 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) 495 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
507 return XFS_ITEM_LOCKED; 496 return XFS_ITEM_LOCKED;
508 497
498 /*
499 * Re-check the pincount now that we stabilized the value by
500 * taking the ilock.
501 */
502 if (xfs_ipincount(ip) > 0) {
503 rval = XFS_ITEM_PINNED;
504 goto out_unlock;
505 }
506
507 /*
508 * Someone else is already flushing the inode. Nothing we can do
509 * here but wait for the flush to finish and remove the item from
510 * the AIL.
511 */
509 if (!xfs_iflock_nowait(ip)) { 512 if (!xfs_iflock_nowait(ip)) {
510 /* 513 rval = XFS_ITEM_FLUSHING;
511 * inode has already been flushed to the backing buffer, 514 goto out_unlock;
512 * leave it locked in shared mode, pushbuf routine will
513 * unlock it.
514 */
515 return XFS_ITEM_PUSHBUF;
516 } 515 }
517 516
518 /* Stale items should force out the iclog */ 517 /*
518 * Stale inode items should force out the iclog.
519 */
519 if (ip->i_flags & XFS_ISTALE) { 520 if (ip->i_flags & XFS_ISTALE) {
520 xfs_ifunlock(ip); 521 xfs_ifunlock(ip);
521 xfs_iunlock(ip, XFS_ILOCK_SHARED); 522 xfs_iunlock(ip, XFS_ILOCK_SHARED);
522 return XFS_ITEM_PINNED; 523 return XFS_ITEM_PINNED;
523 } 524 }
524 525
525#ifdef DEBUG 526 ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
526 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 527 ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
527 ASSERT(iip->ili_fields != 0); 528
528 ASSERT(iip->ili_logged == 0); 529 spin_unlock(&lip->li_ailp->xa_lock);
529 ASSERT(lip->li_flags & XFS_LI_IN_AIL); 530
531 error = xfs_iflush(ip, &bp);
532 if (!error) {
533 if (!xfs_buf_delwri_queue(bp, buffer_list))
534 rval = XFS_ITEM_FLUSHING;
535 xfs_buf_relse(bp);
530 } 536 }
531#endif 537
532 return XFS_ITEM_SUCCESS; 538 spin_lock(&lip->li_ailp->xa_lock);
539out_unlock:
540 xfs_iunlock(ip, XFS_ILOCK_SHARED);
541 return rval;
533} 542}
534 543
535/* 544/*
@@ -614,86 +623,6 @@ xfs_inode_item_committed(
614} 623}
615 624
616/* 625/*
617 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
618 * failed to get the inode flush lock but did get the inode locked SHARED.
619 * Here we're trying to see if the inode buffer is incore, and if so whether it's
620 * marked delayed write. If that's the case, we'll promote it and that will
621 * allow the caller to write the buffer by triggering the xfsbufd to run.
622 */
623STATIC bool
624xfs_inode_item_pushbuf(
625 struct xfs_log_item *lip)
626{
627 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
628 struct xfs_inode *ip = iip->ili_inode;
629 struct xfs_buf *bp;
630 bool ret = true;
631
632 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
633
634 /*
635 * If a flush is not in progress anymore, chances are that the
636 * inode was taken off the AIL. So, just get out.
637 */
638 if (!xfs_isiflocked(ip) ||
639 !(lip->li_flags & XFS_LI_IN_AIL)) {
640 xfs_iunlock(ip, XFS_ILOCK_SHARED);
641 return true;
642 }
643
644 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
645 iip->ili_format.ilf_len, XBF_TRYLOCK);
646
647 xfs_iunlock(ip, XFS_ILOCK_SHARED);
648 if (!bp)
649 return true;
650 if (XFS_BUF_ISDELAYWRITE(bp))
651 xfs_buf_delwri_promote(bp);
652 if (xfs_buf_ispinned(bp))
653 ret = false;
654 xfs_buf_relse(bp);
655 return ret;
656}
657
658/*
659 * This is called to asynchronously write the inode associated with this
660 * inode log item out to disk. The inode will already have been locked by
661 * a successful call to xfs_inode_item_trylock().
662 */
663STATIC void
664xfs_inode_item_push(
665 struct xfs_log_item *lip)
666{
667 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
668 struct xfs_inode *ip = iip->ili_inode;
669
670 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
671 ASSERT(xfs_isiflocked(ip));
672
673 /*
674 * Since we were able to lock the inode's flush lock and
675 * we found it on the AIL, the inode must be dirty. This
676 * is because the inode is removed from the AIL while still
677 * holding the flush lock in xfs_iflush_done(). Thus, if
678 * we found it in the AIL and were able to obtain the flush
679 * lock without sleeping, then there must not have been
680 * anyone in the process of flushing the inode.
681 */
682 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0);
683
684 /*
685 * Push the inode to it's backing buffer. This will not remove the
686 * inode from the AIL - a further push will be required to trigger a
687 * buffer push. However, this allows all the dirty inodes to be pushed
688 * to the buffer before it is pushed to disk. The buffer IO completion
689 * will pull the inode from the AIL, mark it clean and unlock the flush
690 * lock.
691 */
692 (void) xfs_iflush(ip, SYNC_TRYLOCK);
693 xfs_iunlock(ip, XFS_ILOCK_SHARED);
694}
695
696/*
697 * XXX rcc - this one really has to do something. Probably needs 626 * XXX rcc - this one really has to do something. Probably needs
698 * to stamp in a new field in the incore inode. 627 * to stamp in a new field in the incore inode.
699 */ 628 */
@@ -713,11 +642,9 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
713 .iop_format = xfs_inode_item_format, 642 .iop_format = xfs_inode_item_format,
714 .iop_pin = xfs_inode_item_pin, 643 .iop_pin = xfs_inode_item_pin,
715 .iop_unpin = xfs_inode_item_unpin, 644 .iop_unpin = xfs_inode_item_unpin,
716 .iop_trylock = xfs_inode_item_trylock,
717 .iop_unlock = xfs_inode_item_unlock, 645 .iop_unlock = xfs_inode_item_unlock,
718 .iop_committed = xfs_inode_item_committed, 646 .iop_committed = xfs_inode_item_committed,
719 .iop_push = xfs_inode_item_push, 647 .iop_push = xfs_inode_item_push,
720 .iop_pushbuf = xfs_inode_item_pushbuf,
721 .iop_committing = xfs_inode_item_committing 648 .iop_committing = xfs_inode_item_committing
722}; 649};
723 650
@@ -848,7 +775,8 @@ xfs_iflush_done(
848 ASSERT(i <= need_ail); 775 ASSERT(i <= need_ail);
849 } 776 }
850 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ 777 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
851 xfs_trans_ail_delete_bulk(ailp, log_items, i); 778 xfs_trans_ail_delete_bulk(ailp, log_items, i,
779 SHUTDOWN_CORRUPT_INCORE);
852 } 780 }
853 781
854 782
@@ -869,16 +797,15 @@ xfs_iflush_done(
869} 797}
870 798
871/* 799/*
872 * This is the inode flushing abort routine. It is called 800 * This is the inode flushing abort routine. It is called from xfs_iflush when
873 * from xfs_iflush when the filesystem is shutting down to clean 801 * the filesystem is shutting down to clean up the inode state. It is
874 * up the inode state. 802 * responsible for removing the inode item from the AIL if it has not been
875 * It is responsible for removing the inode item 803 * re-logged, and unlocking the inode's flush lock.
876 * from the AIL if it has not been re-logged, and unlocking the inode's
877 * flush lock.
878 */ 804 */
879void 805void
880xfs_iflush_abort( 806xfs_iflush_abort(
881 xfs_inode_t *ip) 807 xfs_inode_t *ip,
808 bool stale)
882{ 809{
883 xfs_inode_log_item_t *iip = ip->i_itemp; 810 xfs_inode_log_item_t *iip = ip->i_itemp;
884 811
@@ -888,7 +815,10 @@ xfs_iflush_abort(
888 spin_lock(&ailp->xa_lock); 815 spin_lock(&ailp->xa_lock);
889 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 816 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
890 /* xfs_trans_ail_delete() drops the AIL lock. */ 817 /* xfs_trans_ail_delete() drops the AIL lock. */
891 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip); 818 xfs_trans_ail_delete(ailp, &iip->ili_item,
819 stale ?
820 SHUTDOWN_LOG_IO_ERROR :
821 SHUTDOWN_CORRUPT_INCORE);
892 } else 822 } else
893 spin_unlock(&ailp->xa_lock); 823 spin_unlock(&ailp->xa_lock);
894 } 824 }
@@ -915,7 +845,7 @@ xfs_istale_done(
915 struct xfs_buf *bp, 845 struct xfs_buf *bp,
916 struct xfs_log_item *lip) 846 struct xfs_log_item *lip)
917{ 847{
918 xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); 848 xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true);
919} 849}
920 850
921/* 851/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 41d61c3b7a36..376d4d0b2635 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -165,7 +165,7 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
165extern void xfs_inode_item_destroy(struct xfs_inode *); 165extern void xfs_inode_item_destroy(struct xfs_inode *);
166extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); 166extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
167extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); 167extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
168extern void xfs_iflush_abort(struct xfs_inode *); 168extern void xfs_iflush_abort(struct xfs_inode *, bool);
169extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, 169extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
170 xfs_inode_log_format_t *); 170 xfs_inode_log_format_t *);
171 171
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index b253c0ea5bec..90efdaf1706f 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -26,11 +26,6 @@
26 * high agno_log-agblklog-inopblog bits - 0 26 * high agno_log-agblklog-inopblog bits - 0
27 */ 27 */
28 28
29typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */
30
31#define NULLFSINO ((xfs_ino_t)-1)
32#define NULLAGINO ((xfs_agino_t)-1)
33
34struct xfs_mount; 29struct xfs_mount;
35 30
36#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) 31#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 91f8ff547ab3..3a05a41b5d76 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -17,9 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h" 20#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 21#include "xfs_trans.h"
24#include "xfs_sb.h" 22#include "xfs_sb.h"
25#include "xfs_ag.h" 23#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index a849a5473aff..c4f2da0d2bf5 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -22,9 +22,7 @@
22#include <asm/uaccess.h> 22#include <asm/uaccess.h>
23#include "xfs.h" 23#include "xfs.h"
24#include "xfs_fs.h" 24#include "xfs_fs.h"
25#include "xfs_bit.h"
26#include "xfs_log.h" 25#include "xfs_log.h"
27#include "xfs_inum.h"
28#include "xfs_trans.h" 26#include "xfs_trans.h"
29#include "xfs_sb.h" 27#include "xfs_sb.h"
30#include "xfs_ag.h" 28#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 71a464503c43..aadfce6681ee 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -17,9 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h" 20#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 21#include "xfs_trans.h"
24#include "xfs_sb.h" 22#include "xfs_sb.h"
25#include "xfs_ag.h" 23#include "xfs_ag.h"
@@ -37,7 +35,6 @@
37#include "xfs_rtalloc.h" 35#include "xfs_rtalloc.h"
38#include "xfs_error.h" 36#include "xfs_error.h"
39#include "xfs_itable.h" 37#include "xfs_itable.h"
40#include "xfs_rw.h"
41#include "xfs_attr.h" 38#include "xfs_attr.h"
42#include "xfs_buf_item.h" 39#include "xfs_buf_item.h"
43#include "xfs_trans_space.h" 40#include "xfs_trans_space.h"
@@ -142,11 +139,7 @@ xfs_iomap_write_direct(
142 int committed; 139 int committed;
143 int error; 140 int error;
144 141
145 /* 142 error = xfs_qm_dqattach(ip, 0);
146 * Make sure that the dquots are there. This doesn't hold
147 * the ilock across a disk read.
148 */
149 error = xfs_qm_dqattach_locked(ip, 0);
150 if (error) 143 if (error)
151 return XFS_ERROR(error); 144 return XFS_ERROR(error);
152 145
@@ -158,7 +151,7 @@ xfs_iomap_write_direct(
158 if ((offset + count) > XFS_ISIZE(ip)) { 151 if ((offset + count) > XFS_ISIZE(ip)) {
159 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); 152 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
160 if (error) 153 if (error)
161 goto error_out; 154 return XFS_ERROR(error);
162 } else { 155 } else {
163 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 156 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
164 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 157 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -190,7 +183,6 @@ xfs_iomap_write_direct(
190 /* 183 /*
191 * Allocate and setup the transaction 184 * Allocate and setup the transaction
192 */ 185 */
193 xfs_iunlock(ip, XFS_ILOCK_EXCL);
194 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 186 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
195 error = xfs_trans_reserve(tp, resblks, 187 error = xfs_trans_reserve(tp, resblks,
196 XFS_WRITE_LOG_RES(mp), resrtextents, 188 XFS_WRITE_LOG_RES(mp), resrtextents,
@@ -199,15 +191,16 @@ xfs_iomap_write_direct(
199 /* 191 /*
200 * Check for running out of space, note: need lock to return 192 * Check for running out of space, note: need lock to return
201 */ 193 */
202 if (error) 194 if (error) {
203 xfs_trans_cancel(tp, 0); 195 xfs_trans_cancel(tp, 0);
196 return XFS_ERROR(error);
197 }
198
204 xfs_ilock(ip, XFS_ILOCK_EXCL); 199 xfs_ilock(ip, XFS_ILOCK_EXCL);
205 if (error)
206 goto error_out;
207 200
208 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); 201 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
209 if (error) 202 if (error)
210 goto error1; 203 goto out_trans_cancel;
211 204
212 xfs_trans_ijoin(tp, ip, 0); 205 xfs_trans_ijoin(tp, ip, 0);
213 206
@@ -224,42 +217,39 @@ xfs_iomap_write_direct(
224 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, 217 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
225 &firstfsb, 0, imap, &nimaps, &free_list); 218 &firstfsb, 0, imap, &nimaps, &free_list);
226 if (error) 219 if (error)
227 goto error0; 220 goto out_bmap_cancel;
228 221
229 /* 222 /*
230 * Complete the transaction 223 * Complete the transaction
231 */ 224 */
232 error = xfs_bmap_finish(&tp, &free_list, &committed); 225 error = xfs_bmap_finish(&tp, &free_list, &committed);
233 if (error) 226 if (error)
234 goto error0; 227 goto out_bmap_cancel;
235 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 228 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
236 if (error) 229 if (error)
237 goto error_out; 230 goto out_unlock;
238 231
239 /* 232 /*
240 * Copy any maps to caller's array and return any error. 233 * Copy any maps to caller's array and return any error.
241 */ 234 */
242 if (nimaps == 0) { 235 if (nimaps == 0) {
243 error = ENOSPC; 236 error = XFS_ERROR(ENOSPC);
244 goto error_out; 237 goto out_unlock;
245 } 238 }
246 239
247 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { 240 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
248 error = xfs_alert_fsblock_zero(ip, imap); 241 error = xfs_alert_fsblock_zero(ip, imap);
249 goto error_out;
250 }
251 242
252 return 0; 243out_unlock:
244 xfs_iunlock(ip, XFS_ILOCK_EXCL);
245 return error;
253 246
254error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 247out_bmap_cancel:
255 xfs_bmap_cancel(&free_list); 248 xfs_bmap_cancel(&free_list);
256 xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); 249 xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
257 250out_trans_cancel:
258error1: /* Just cancel transaction */
259 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 251 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
260 252 goto out_unlock;
261error_out:
262 return XFS_ERROR(error);
263} 253}
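The restructured error path above is the usual kernel pattern of stacked goto labels, where each label unwinds exactly the state acquired before the corresponding failure point (cancel the bmap free list and quota reservation, then cancel the transaction, then drop the ilock). A generic, self-contained sketch of the idiom with hypothetical resources:

#include <stdlib.h>

struct res { int unused; };

static struct res *acquire(void) { return malloc(sizeof(struct res)); }
static void release(struct res *r) { free(r); }

static int three_step_operation(void)
{
	struct res *a, *b, *c;
	int error = -1;

	a = acquire();
	if (!a)
		goto out;
	b = acquire();
	if (!b)
		goto out_release_a;
	c = acquire();
	if (!c)
		goto out_release_b;

	/* ... success path uses a, b and c ... */
	release(c);
	release(b);
	release(a);
	return 0;

out_release_b:			/* undo only what was set up before the failure */
	release(b);
out_release_a:
	release(a);
out:
	return error;
}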
264 254
265/* 255/*
@@ -422,6 +412,15 @@ retry:
422 return error; 412 return error;
423 } 413 }
424 414
415 /*
416 * Make sure preallocation does not create extents beyond the range we
417 * actually support in this filesystem.
418 */
419 if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset))
420 last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset);
421
422 ASSERT(last_fsb > offset_fsb);
423
425 nimaps = XFS_WRITE_IMAPS; 424 nimaps = XFS_WRITE_IMAPS;
426 error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, 425 error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
427 imap, &nimaps, XFS_BMAPI_ENTIRE); 426 imap, &nimaps, XFS_BMAPI_ENTIRE);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 3011b879f850..1a25fd802798 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -18,9 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_acl.h" 20#include "xfs_acl.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_sb.h" 23#include "xfs_sb.h"
26#include "xfs_ag.h" 24#include "xfs_ag.h"
@@ -34,7 +32,6 @@
34#include "xfs_rtalloc.h" 32#include "xfs_rtalloc.h"
35#include "xfs_error.h" 33#include "xfs_error.h"
36#include "xfs_itable.h" 34#include "xfs_itable.h"
37#include "xfs_rw.h"
38#include "xfs_attr.h" 35#include "xfs_attr.h"
39#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
40#include "xfs_utils.h" 37#include "xfs_utils.h"
@@ -700,7 +697,7 @@ xfs_setattr_size(
700 xfs_off_t oldsize, newsize; 697 xfs_off_t oldsize, newsize;
701 struct xfs_trans *tp; 698 struct xfs_trans *tp;
702 int error; 699 int error;
703 uint lock_flags; 700 uint lock_flags = 0;
704 uint commit_flags = 0; 701 uint commit_flags = 0;
705 702
706 trace_xfs_setattr(ip); 703 trace_xfs_setattr(ip);
@@ -720,10 +717,10 @@ xfs_setattr_size(
720 ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| 717 ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
721 ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); 718 ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
722 719
723 lock_flags = XFS_ILOCK_EXCL; 720 if (!(flags & XFS_ATTR_NOLOCK)) {
724 if (!(flags & XFS_ATTR_NOLOCK))
725 lock_flags |= XFS_IOLOCK_EXCL; 721 lock_flags |= XFS_IOLOCK_EXCL;
726 xfs_ilock(ip, lock_flags); 722 xfs_ilock(ip, lock_flags);
723 }
727 724
728 oldsize = inode->i_size; 725 oldsize = inode->i_size;
729 newsize = iattr->ia_size; 726 newsize = iattr->ia_size;
@@ -746,7 +743,7 @@ xfs_setattr_size(
746 /* 743 /*
747 * Make sure that the dquots are attached to the inode. 744 * Make sure that the dquots are attached to the inode.
748 */ 745 */
749 error = xfs_qm_dqattach_locked(ip, 0); 746 error = xfs_qm_dqattach(ip, 0);
750 if (error) 747 if (error)
751 goto out_unlock; 748 goto out_unlock;
752 749
@@ -768,8 +765,6 @@ xfs_setattr_size(
768 if (error) 765 if (error)
769 goto out_unlock; 766 goto out_unlock;
770 } 767 }
771 xfs_iunlock(ip, XFS_ILOCK_EXCL);
772 lock_flags &= ~XFS_ILOCK_EXCL;
773 768
774 /* 769 /*
775 * We are going to log the inode size change in this transaction so 770 * We are going to log the inode size change in this transaction so
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index acc2bf264dab..eff577a9b67f 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -18,7 +18,6 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h" 22#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 6db1fef38bff..6b965bf450e4 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -18,9 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_sb.h" 23#include "xfs_sb.h"
26#include "xfs_ag.h" 24#include "xfs_ag.h"
@@ -35,7 +33,6 @@
35#include "xfs_trans_priv.h" 33#include "xfs_trans_priv.h"
36#include "xfs_dinode.h" 34#include "xfs_dinode.h"
37#include "xfs_inode.h" 35#include "xfs_inode.h"
38#include "xfs_rw.h"
39#include "xfs_trace.h" 36#include "xfs_trace.h"
40 37
41kmem_zone_t *xfs_log_ticket_zone; 38kmem_zone_t *xfs_log_ticket_zone;
@@ -916,27 +913,42 @@ xfs_log_need_covered(xfs_mount_t *mp)
916 * We may be holding the log iclog lock upon entering this routine. 913 * We may be holding the log iclog lock upon entering this routine.
917 */ 914 */
918xfs_lsn_t 915xfs_lsn_t
919xlog_assign_tail_lsn( 916xlog_assign_tail_lsn_locked(
920 struct xfs_mount *mp) 917 struct xfs_mount *mp)
921{ 918{
922 xfs_lsn_t tail_lsn;
923 struct log *log = mp->m_log; 919 struct log *log = mp->m_log;
920 struct xfs_log_item *lip;
921 xfs_lsn_t tail_lsn;
922
923 assert_spin_locked(&mp->m_ail->xa_lock);
924 924
925 /* 925 /*
926 * To make sure we always have a valid LSN for the log tail we keep 926 * To make sure we always have a valid LSN for the log tail we keep
927 * track of the last LSN which was committed in log->l_last_sync_lsn, 927 * track of the last LSN which was committed in log->l_last_sync_lsn,
928 * and use that when the AIL was empty and xfs_ail_min_lsn returns 0. 928 * and use that when the AIL was empty.
929 *
930 * If the AIL has been emptied we also need to wake any process
931 * waiting for this condition.
932 */ 929 */
933 tail_lsn = xfs_ail_min_lsn(mp->m_ail); 930 lip = xfs_ail_min(mp->m_ail);
934 if (!tail_lsn) 931 if (lip)
932 tail_lsn = lip->li_lsn;
933 else
935 tail_lsn = atomic64_read(&log->l_last_sync_lsn); 934 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
936 atomic64_set(&log->l_tail_lsn, tail_lsn); 935 atomic64_set(&log->l_tail_lsn, tail_lsn);
937 return tail_lsn; 936 return tail_lsn;
938} 937}
939 938
939xfs_lsn_t
940xlog_assign_tail_lsn(
941 struct xfs_mount *mp)
942{
943 xfs_lsn_t tail_lsn;
944
945 spin_lock(&mp->m_ail->xa_lock);
946 tail_lsn = xlog_assign_tail_lsn_locked(mp);
947 spin_unlock(&mp->m_ail->xa_lock);
948
949 return tail_lsn;
950}
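Splitting the function into a _locked worker plus the thin wrapper above is a common pattern: callers that already hold xa_lock (the AIL code) call xlog_assign_tail_lsn_locked() directly, while everyone else keeps the old interface. As a generic sketch with hypothetical names:

#include <linux/spinlock.h>

struct counter {
	spinlock_t	lock;
	long		value;
};

/* Worker: the caller must already hold the lock. */
static long counter_read_locked(struct counter *c)
{
	assert_spin_locked(&c->lock);
	return c->value;
}

/* Wrapper for callers that do not hold the lock. */
static long counter_read(struct counter *c)
{
	long v;

	spin_lock(&c->lock);
	v = counter_read_locked(c);
	spin_unlock(&c->lock);
	return v;
}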
951
940/* 952/*
941 * Return the space in the log between the tail and the head. The head 953 * Return the space in the log between the tail and the head. The head
942 * is passed in the cycle/bytes formal parms. In the special case where 954 * is passed in the cycle/bytes formal parms. In the special case where
@@ -1172,7 +1184,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1172 xlog_get_iclog_buffer_size(mp, log); 1184 xlog_get_iclog_buffer_size(mp, log);
1173 1185
1174 error = ENOMEM; 1186 error = ENOMEM;
1175 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); 1187 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
1176 if (!bp) 1188 if (!bp)
1177 goto out_free_log; 1189 goto out_free_log;
1178 bp->b_iodone = xlog_iodone; 1190 bp->b_iodone = xlog_iodone;
@@ -1182,9 +1194,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1182 spin_lock_init(&log->l_icloglock); 1194 spin_lock_init(&log->l_icloglock);
1183 init_waitqueue_head(&log->l_flush_wait); 1195 init_waitqueue_head(&log->l_flush_wait);
1184 1196
1185 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1186 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
1187
1188 iclogp = &log->l_iclog; 1197 iclogp = &log->l_iclog;
1189 /* 1198 /*
1190 * The amount of memory to allocate for the iclog structure is 1199 * The amount of memory to allocate for the iclog structure is
@@ -1204,7 +1213,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1204 prev_iclog = iclog; 1213 prev_iclog = iclog;
1205 1214
1206 bp = xfs_buf_get_uncached(mp->m_logdev_targp, 1215 bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1207 log->l_iclog_size, 0); 1216 BTOBB(log->l_iclog_size), 0);
1208 if (!bp) 1217 if (!bp)
1209 goto out_free_iclog; 1218 goto out_free_iclog;
1210 1219
@@ -1224,7 +1233,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1224 head->h_fmt = cpu_to_be32(XLOG_FMT); 1233 head->h_fmt = cpu_to_be32(XLOG_FMT);
1225 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); 1234 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
1226 1235
1227 iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; 1236 iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize;
1228 iclog->ic_state = XLOG_STATE_ACTIVE; 1237 iclog->ic_state = XLOG_STATE_ACTIVE;
1229 iclog->ic_log = log; 1238 iclog->ic_log = log;
1230 atomic_set(&iclog->ic_refcnt, 0); 1239 atomic_set(&iclog->ic_refcnt, 0);
@@ -1475,7 +1484,7 @@ xlog_sync(xlog_t *log,
1475 } else { 1484 } else {
1476 iclog->ic_bwritecnt = 1; 1485 iclog->ic_bwritecnt = 1;
1477 } 1486 }
1478 XFS_BUF_SET_COUNT(bp, count); 1487 bp->b_io_length = BTOBB(count);
1479 bp->b_fspriv = iclog; 1488 bp->b_fspriv = iclog;
1480 XFS_BUF_ZEROFLAGS(bp); 1489 XFS_BUF_ZEROFLAGS(bp);
1481 XFS_BUF_ASYNC(bp); 1490 XFS_BUF_ASYNC(bp);
@@ -1573,7 +1582,7 @@ xlog_dealloc_log(xlog_t *log)
1573 * always need to ensure that the extra buffer does not point to memory 1582 * always need to ensure that the extra buffer does not point to memory
1574 * owned by another log buffer before we free it. 1583 * owned by another log buffer before we free it.
1575 */ 1584 */
1576 xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size); 1585 xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
1577 xfs_buf_free(log->l_xbuf); 1586 xfs_buf_free(log->l_xbuf);
1578 1587
1579 iclog = log->l_iclog; 1588 iclog = log->l_iclog;
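
The buffer-length changes in the hunks above (xfs_buf_alloc(), xfs_buf_get_uncached(), b_io_length, b_length, xfs_buf_set_empty()) are all part of the same conversion: buffer sizes are now passed to the buffer cache in 512-byte basic blocks rather than bytes. For reference, the conversion helpers used here are the long-standing XFS macros, roughly:

	#define BBSHIFT		9
	#define BBSIZE		(1 << BBSHIFT)		/* one basic block = 512 bytes */
	#define BBTOB(bbs)	((bbs) << BBSHIFT)	/* basic blocks -> bytes */
	#define BTOBB(bytes)	(((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)	/* bytes -> blocks, rounded up */

So BTOBB(log->l_iclog_size) hands the iclog size to the buffer layer in basic blocks, and BBTOB(bp->b_length) recovers the byte count where the log code still works in bytes.
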
@@ -2932,6 +2941,7 @@ xfs_log_force(
2932{ 2941{
2933 int error; 2942 int error;
2934 2943
2944 trace_xfs_log_force(mp, 0);
2935 error = _xfs_log_force(mp, flags, NULL); 2945 error = _xfs_log_force(mp, flags, NULL);
2936 if (error) 2946 if (error)
2937 xfs_warn(mp, "%s: error %d returned.", __func__, error); 2947 xfs_warn(mp, "%s: error %d returned.", __func__, error);
@@ -3080,6 +3090,7 @@ xfs_log_force_lsn(
3080{ 3090{
3081 int error; 3091 int error;
3082 3092
3093 trace_xfs_log_force(mp, lsn);
3083 error = _xfs_log_force_lsn(mp, lsn, flags, NULL); 3094 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3084 if (error) 3095 if (error)
3085 xfs_warn(mp, "%s: error %d returned.", __func__, error); 3096 xfs_warn(mp, "%s: error %d returned.", __func__, error);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 2c622bedb302..748d312850e2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -152,6 +152,7 @@ int xfs_log_mount(struct xfs_mount *mp,
152 int num_bblocks); 152 int num_bblocks);
153int xfs_log_mount_finish(struct xfs_mount *mp); 153int xfs_log_mount_finish(struct xfs_mount *mp);
154xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 154xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
155xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
155void xfs_log_space_wake(struct xfs_mount *mp); 156void xfs_log_space_wake(struct xfs_mount *mp);
156int xfs_log_notify(struct xfs_mount *mp, 157int xfs_log_notify(struct xfs_mount *mp,
157 struct xlog_in_core *iclog, 158 struct xlog_in_core *iclog,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d4fadbe8ac90..7d6197c58493 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -18,9 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_trans_priv.h" 23#include "xfs_trans_priv.h"
26#include "xfs_log_priv.h" 24#include "xfs_log_priv.h"
@@ -29,61 +27,10 @@
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_error.h" 28#include "xfs_error.h"
31#include "xfs_alloc.h" 29#include "xfs_alloc.h"
30#include "xfs_extent_busy.h"
32#include "xfs_discard.h" 31#include "xfs_discard.h"
33 32
34/* 33/*
35 * Perform initial CIL structure initialisation.
36 */
37int
38xlog_cil_init(
39 struct log *log)
40{
41 struct xfs_cil *cil;
42 struct xfs_cil_ctx *ctx;
43
44 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
45 if (!cil)
46 return ENOMEM;
47
48 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
49 if (!ctx) {
50 kmem_free(cil);
51 return ENOMEM;
52 }
53
54 INIT_LIST_HEAD(&cil->xc_cil);
55 INIT_LIST_HEAD(&cil->xc_committing);
56 spin_lock_init(&cil->xc_cil_lock);
57 init_rwsem(&cil->xc_ctx_lock);
58 init_waitqueue_head(&cil->xc_commit_wait);
59
60 INIT_LIST_HEAD(&ctx->committing);
61 INIT_LIST_HEAD(&ctx->busy_extents);
62 ctx->sequence = 1;
63 ctx->cil = cil;
64 cil->xc_ctx = ctx;
65 cil->xc_current_sequence = ctx->sequence;
66
67 cil->xc_log = log;
68 log->l_cilp = cil;
69 return 0;
70}
71
72void
73xlog_cil_destroy(
74 struct log *log)
75{
76 if (log->l_cilp->xc_ctx) {
77 if (log->l_cilp->xc_ctx->ticket)
78 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
79 kmem_free(log->l_cilp->xc_ctx);
80 }
81
82 ASSERT(list_empty(&log->l_cilp->xc_cil));
83 kmem_free(log->l_cilp);
84}
85
86/*
87 * Allocate a new ticket. Failing to get a new ticket makes it really hard to 34 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
88 * recover, so we don't allow failure here. Also, we allocate in a context that 35 * recover, so we don't allow failure here. Also, we allocate in a context that
89 * we don't want to be issuing transactions from, so we need to tell the 36 * we don't want to be issuing transactions from, so we need to tell the
@@ -390,8 +337,8 @@ xlog_cil_committed(
390 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, 337 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
391 ctx->start_lsn, abort); 338 ctx->start_lsn, abort);
392 339
393 xfs_alloc_busy_sort(&ctx->busy_extents); 340 xfs_extent_busy_sort(&ctx->busy_extents);
394 xfs_alloc_busy_clear(mp, &ctx->busy_extents, 341 xfs_extent_busy_clear(mp, &ctx->busy_extents,
395 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); 342 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
396 343
397 spin_lock(&ctx->cil->xc_cil_lock); 344 spin_lock(&ctx->cil->xc_cil_lock);
@@ -404,7 +351,7 @@ xlog_cil_committed(
404 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); 351 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
405 352
406 xfs_discard_extents(mp, &ctx->busy_extents); 353 xfs_discard_extents(mp, &ctx->busy_extents);
407 xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); 354 xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
408 } 355 }
409 356
410 kmem_free(ctx); 357 kmem_free(ctx);
@@ -426,8 +373,7 @@ xlog_cil_committed(
426 */ 373 */
427STATIC int 374STATIC int
428xlog_cil_push( 375xlog_cil_push(
429 struct log *log, 376 struct log *log)
430 xfs_lsn_t push_seq)
431{ 377{
432 struct xfs_cil *cil = log->l_cilp; 378 struct xfs_cil *cil = log->l_cilp;
433 struct xfs_log_vec *lv; 379 struct xfs_log_vec *lv;
@@ -443,39 +389,36 @@ xlog_cil_push(
443 struct xfs_log_iovec lhdr; 389 struct xfs_log_iovec lhdr;
444 struct xfs_log_vec lvhdr = { NULL }; 390 struct xfs_log_vec lvhdr = { NULL };
445 xfs_lsn_t commit_lsn; 391 xfs_lsn_t commit_lsn;
392 xfs_lsn_t push_seq;
446 393
447 if (!cil) 394 if (!cil)
448 return 0; 395 return 0;
449 396
450 ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
451
452 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 397 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
453 new_ctx->ticket = xlog_cil_ticket_alloc(log); 398 new_ctx->ticket = xlog_cil_ticket_alloc(log);
454 399
455 /* 400 down_write(&cil->xc_ctx_lock);
456 * Lock out transaction commit, but don't block for background pushes
457 * unless we are well over the CIL space limit. See the definition of
458 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
459 * used here.
460 */
461 if (!down_write_trylock(&cil->xc_ctx_lock)) {
462 if (!push_seq &&
463 cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
464 goto out_free_ticket;
465 down_write(&cil->xc_ctx_lock);
466 }
467 ctx = cil->xc_ctx; 401 ctx = cil->xc_ctx;
468 402
469 /* check if we've anything to push */ 403 spin_lock(&cil->xc_cil_lock);
470 if (list_empty(&cil->xc_cil)) 404 push_seq = cil->xc_push_seq;
471 goto out_skip; 405 ASSERT(push_seq <= ctx->sequence);
472 406
473 /* check for spurious background flush */ 407 /*
474 if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) 408 * Check if we've anything to push. If there is nothing, then we don't
409 * move on to a new sequence number and so we have to be able to push
410 * this sequence again later.
411 */
412 if (list_empty(&cil->xc_cil)) {
413 cil->xc_push_seq = 0;
414 spin_unlock(&cil->xc_cil_lock);
475 goto out_skip; 415 goto out_skip;
416 }
417 spin_unlock(&cil->xc_cil_lock);
418
476 419
477 /* check for a previously pushed sequence */ 420 /* check for a previously pushed sequence */
478 if (push_seq && push_seq < cil->xc_ctx->sequence) 421 if (push_seq < cil->xc_ctx->sequence)
479 goto out_skip; 422 goto out_skip;
480 423
481 /* 424 /*
@@ -629,7 +572,6 @@ restart:
629 572
630out_skip: 573out_skip:
631 up_write(&cil->xc_ctx_lock); 574 up_write(&cil->xc_ctx_lock);
632out_free_ticket:
633 xfs_log_ticket_put(new_ctx->ticket); 575 xfs_log_ticket_put(new_ctx->ticket);
634 kmem_free(new_ctx); 576 kmem_free(new_ctx);
635 return 0; 577 return 0;
@@ -641,6 +583,82 @@ out_abort:
641 return XFS_ERROR(EIO); 583 return XFS_ERROR(EIO);
642} 584}
643 585
586static void
587xlog_cil_push_work(
588 struct work_struct *work)
589{
590 struct xfs_cil *cil = container_of(work, struct xfs_cil,
591 xc_push_work);
592 xlog_cil_push(cil->xc_log);
593}
594
595/*
596 * We need to push CIL every so often so we don't cache more than we can fit in
597 * the log. The limit really is that a checkpoint can't be more than half the
598 * log (the current checkpoint is not allowed to overwrite the previous
599 * checkpoint), but commit latency and memory usage limit this to a smaller
600 * size.
601 */
602static void
603xlog_cil_push_background(
604 struct log *log)
605{
606 struct xfs_cil *cil = log->l_cilp;
607
608 /*
609 * The cil won't be empty because we are called while holding the
610 * context lock so whatever we added to the CIL will still be there
611 */
612 ASSERT(!list_empty(&cil->xc_cil));
613
614 /*
615 * don't do a background push if we haven't used up all the
616 * space available yet.
617 */
618 if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
619 return;
620
621 spin_lock(&cil->xc_cil_lock);
622 if (cil->xc_push_seq < cil->xc_current_sequence) {
623 cil->xc_push_seq = cil->xc_current_sequence;
624 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
625 }
626 spin_unlock(&cil->xc_cil_lock);
627
628}
629
630static void
631xlog_cil_push_foreground(
632 struct log *log,
633 xfs_lsn_t push_seq)
634{
635 struct xfs_cil *cil = log->l_cilp;
636
637 if (!cil)
638 return;
639
640 ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
641
642 /* start on any pending background push to minimise wait time on it */
643 flush_work(&cil->xc_push_work);
644
645 /*
646 * If the CIL is empty or we've already pushed the sequence then
647 * there's no work we need to do.
648 */
649 spin_lock(&cil->xc_cil_lock);
650 if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
651 spin_unlock(&cil->xc_cil_lock);
652 return;
653 }
654
655 cil->xc_push_seq = push_seq;
656 spin_unlock(&cil->xc_cil_lock);
657
658 /* do the push now */
659 xlog_cil_push(log);
660}
661
644/* 662/*
645 * Commit a transaction with the given vector to the Committed Item List. 663 * Commit a transaction with the given vector to the Committed Item List.
646 * 664 *
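
The functions added above split CIL pushes into two paths: xlog_cil_push_background() queues xc_push_work once the current context passes the space limit, while xlog_cil_push_foreground() flushes any pending background work and then pushes synchronously. The work runs on the new per-mount m_cil_workqueue (see the xfs_mount.h hunk below); the workqueue allocation itself lives in xfs_super.c, outside this section, presumably along these lines (the name string and flags are assumptions, not quoted from the patch):

	mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
			WQ_MEM_RECLAIM, 0, mp->m_fsname);
	if (!mp->m_cil_workqueue)
		return -ENOMEM;		/* error handling simplified */

Because the requested sequence is latched in xc_push_seq under xc_cil_lock, a second background request for the same sequence is a no-op rather than a second queued work item.
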
@@ -667,7 +685,6 @@ xfs_log_commit_cil(
667{ 685{
668 struct log *log = mp->m_log; 686 struct log *log = mp->m_log;
669 int log_flags = 0; 687 int log_flags = 0;
670 int push = 0;
671 struct xfs_log_vec *log_vector; 688 struct xfs_log_vec *log_vector;
672 689
673 if (flags & XFS_TRANS_RELEASE_LOG_RES) 690 if (flags & XFS_TRANS_RELEASE_LOG_RES)
@@ -719,21 +736,9 @@ xfs_log_commit_cil(
719 */ 736 */
720 xfs_trans_free_items(tp, *commit_lsn, 0); 737 xfs_trans_free_items(tp, *commit_lsn, 0);
721 738
722 /* check for background commit before unlock */ 739 xlog_cil_push_background(log);
723 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
724 push = 1;
725 740
726 up_read(&log->l_cilp->xc_ctx_lock); 741 up_read(&log->l_cilp->xc_ctx_lock);
727
728 /*
729 * We need to push CIL every so often so we don't cache more than we
730 * can fit in the log. The limit really is that a checkpoint can't be
731 * more than half the log (the current checkpoint is not allowed to
732 * overwrite the previous checkpoint), but commit latency and memory
733 * usage limit this to a smaller size in most cases.
734 */
735 if (push)
736 xlog_cil_push(log, 0);
737 return 0; 742 return 0;
738} 743}
739 744
@@ -746,9 +751,6 @@ xfs_log_commit_cil(
746 * 751 *
747 * We return the current commit lsn to allow the callers to determine if a 752 * We return the current commit lsn to allow the callers to determine if a
748 * iclog flush is necessary following this call. 753 * iclog flush is necessary following this call.
749 *
750 * XXX: Initially, just push the CIL unconditionally and return whatever
751 * commit lsn is there. It'll be empty, so this is broken for now.
752 */ 754 */
753xfs_lsn_t 755xfs_lsn_t
754xlog_cil_force_lsn( 756xlog_cil_force_lsn(
@@ -766,8 +768,7 @@ xlog_cil_force_lsn(
766 * xlog_cil_push() handles racing pushes for the same sequence, 768 * xlog_cil_push() handles racing pushes for the same sequence,
767 * so no need to deal with it here. 769 * so no need to deal with it here.
768 */ 770 */
769 if (sequence == cil->xc_current_sequence) 771 xlog_cil_push_foreground(log, sequence);
770 xlog_cil_push(log, sequence);
771 772
772 /* 773 /*
773 * See if we can find a previous sequence still committing. 774 * See if we can find a previous sequence still committing.
@@ -826,3 +827,57 @@ xfs_log_item_in_current_chkpt(
826 return false; 827 return false;
827 return true; 828 return true;
828} 829}
830
831/*
832 * Perform initial CIL structure initialisation.
833 */
834int
835xlog_cil_init(
836 struct log *log)
837{
838 struct xfs_cil *cil;
839 struct xfs_cil_ctx *ctx;
840
841 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
842 if (!cil)
843 return ENOMEM;
844
845 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
846 if (!ctx) {
847 kmem_free(cil);
848 return ENOMEM;
849 }
850
851 INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
852 INIT_LIST_HEAD(&cil->xc_cil);
853 INIT_LIST_HEAD(&cil->xc_committing);
854 spin_lock_init(&cil->xc_cil_lock);
855 init_rwsem(&cil->xc_ctx_lock);
856 init_waitqueue_head(&cil->xc_commit_wait);
857
858 INIT_LIST_HEAD(&ctx->committing);
859 INIT_LIST_HEAD(&ctx->busy_extents);
860 ctx->sequence = 1;
861 ctx->cil = cil;
862 cil->xc_ctx = ctx;
863 cil->xc_current_sequence = ctx->sequence;
864
865 cil->xc_log = log;
866 log->l_cilp = cil;
867 return 0;
868}
869
870void
871xlog_cil_destroy(
872 struct log *log)
873{
874 if (log->l_cilp->xc_ctx) {
875 if (log->l_cilp->xc_ctx->ticket)
876 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
877 kmem_free(log->l_cilp->xc_ctx);
878 }
879
880 ASSERT(list_empty(&log->l_cilp->xc_cil));
881 kmem_free(log->l_cilp);
882}
883
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2152900b79d4..735ff1ee53da 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -417,6 +417,8 @@ struct xfs_cil {
417 struct list_head xc_committing; 417 struct list_head xc_committing;
418 wait_queue_head_t xc_commit_wait; 418 wait_queue_head_t xc_commit_wait;
419 xfs_lsn_t xc_current_sequence; 419 xfs_lsn_t xc_current_sequence;
420 struct work_struct xc_push_work;
421 xfs_lsn_t xc_push_seq;
420}; 422};
421 423
422/* 424/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 8ecad5bad66c..ca386909131a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -40,7 +40,6 @@
40#include "xfs_extfree_item.h" 40#include "xfs_extfree_item.h"
41#include "xfs_trans_priv.h" 41#include "xfs_trans_priv.h"
42#include "xfs_quota.h" 42#include "xfs_quota.h"
43#include "xfs_rw.h"
44#include "xfs_utils.h" 43#include "xfs_utils.h"
45#include "xfs_trace.h" 44#include "xfs_trace.h"
46 45
@@ -120,7 +119,7 @@ xlog_get_bp(
120 nbblks += log->l_sectBBsize; 119 nbblks += log->l_sectBBsize;
121 nbblks = round_up(nbblks, log->l_sectBBsize); 120 nbblks = round_up(nbblks, log->l_sectBBsize);
122 121
123 bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0); 122 bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
124 if (bp) 123 if (bp)
125 xfs_buf_unlock(bp); 124 xfs_buf_unlock(bp);
126 return bp; 125 return bp;
@@ -146,7 +145,7 @@ xlog_align(
146{ 145{
147 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); 146 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
148 147
149 ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); 148 ASSERT(offset + nbblks <= bp->b_length);
150 return bp->b_addr + BBTOB(offset); 149 return bp->b_addr + BBTOB(offset);
151} 150}
152 151
@@ -174,11 +173,12 @@ xlog_bread_noalign(
174 nbblks = round_up(nbblks, log->l_sectBBsize); 173 nbblks = round_up(nbblks, log->l_sectBBsize);
175 174
176 ASSERT(nbblks > 0); 175 ASSERT(nbblks > 0);
177 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 176 ASSERT(nbblks <= bp->b_length);
178 177
179 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 178 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
180 XFS_BUF_READ(bp); 179 XFS_BUF_READ(bp);
181 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 180 bp->b_io_length = nbblks;
181 bp->b_error = 0;
182 182
183 xfsbdstrat(log->l_mp, bp); 183 xfsbdstrat(log->l_mp, bp);
184 error = xfs_buf_iowait(bp); 184 error = xfs_buf_iowait(bp);
@@ -218,7 +218,7 @@ xlog_bread_offset(
218 xfs_caddr_t offset) 218 xfs_caddr_t offset)
219{ 219{
220 xfs_caddr_t orig_offset = bp->b_addr; 220 xfs_caddr_t orig_offset = bp->b_addr;
221 int orig_len = bp->b_buffer_length; 221 int orig_len = BBTOB(bp->b_length);
222 int error, error2; 222 int error, error2;
223 223
224 error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); 224 error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
@@ -259,13 +259,14 @@ xlog_bwrite(
259 nbblks = round_up(nbblks, log->l_sectBBsize); 259 nbblks = round_up(nbblks, log->l_sectBBsize);
260 260
261 ASSERT(nbblks > 0); 261 ASSERT(nbblks > 0);
262 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 262 ASSERT(nbblks <= bp->b_length);
263 263
264 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 264 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
265 XFS_BUF_ZEROFLAGS(bp); 265 XFS_BUF_ZEROFLAGS(bp);
266 xfs_buf_hold(bp); 266 xfs_buf_hold(bp);
267 xfs_buf_lock(bp); 267 xfs_buf_lock(bp);
268 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 268 bp->b_io_length = nbblks;
269 bp->b_error = 0;
269 270
270 error = xfs_bwrite(bp); 271 error = xfs_bwrite(bp);
271 if (error) 272 if (error)
@@ -440,6 +441,8 @@ xlog_find_verify_cycle(
440 * a log sector, or we're out of luck. 441 * a log sector, or we're out of luck.
441 */ 442 */
442 bufblks = 1 << ffs(nbblks); 443 bufblks = 1 << ffs(nbblks);
444 while (bufblks > log->l_logBBsize)
445 bufblks >>= 1;
443 while (!(bp = xlog_get_bp(log, bufblks))) { 446 while (!(bp = xlog_get_bp(log, bufblks))) {
444 bufblks >>= 1; 447 bufblks >>= 1;
445 if (bufblks < log->l_sectBBsize) 448 if (bufblks < log->l_sectBBsize)
@@ -1225,6 +1228,8 @@ xlog_write_log_records(
1225 * log sector, or we're out of luck. 1228 * log sector, or we're out of luck.
1226 */ 1229 */
1227 bufblks = 1 << ffs(blocks); 1230 bufblks = 1 << ffs(blocks);
1231 while (bufblks > log->l_logBBsize)
1232 bufblks >>= 1;
1228 while (!(bp = xlog_get_bp(log, bufblks))) { 1233 while (!(bp = xlog_get_bp(log, bufblks))) {
1229 bufblks >>= 1; 1234 bufblks >>= 1;
1230 if (bufblks < sectbb) 1235 if (bufblks < sectbb)
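
Both hunks above add the same clamp, in xlog_find_verify_cycle() and xlog_write_log_records(): the initial power-of-two guess for the scratch buffer can exceed the size of a small log, so it is now capped at l_logBBsize before entering the allocation retry loop. A hypothetical illustration of the sizing logic, with made-up numbers:

	static int
	xlog_example_bufblks(int blocks, int logBBsize)
	{
		int	bufblks = 1 << ffs(blocks);	/* ffs(2048) = 12, so 4096 */

		while (bufblks > logBBsize)		/* new clamp: stay within the log */
			bufblks >>= 1;			/* 4096 -> 2048 -> 1024 */
		return bufblks;				/* 1024 for a 1024-block log */
	}
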
@@ -1772,7 +1777,7 @@ xlog_recover_do_inode_buffer(
1772 1777
1773 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1778 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1774 1779
1775 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1780 inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
1776 for (i = 0; i < inodes_per_buf; i++) { 1781 for (i = 0; i < inodes_per_buf; i++) {
1777 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1782 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1778 offsetof(xfs_dinode_t, di_next_unlinked); 1783 offsetof(xfs_dinode_t, di_next_unlinked);
@@ -1814,7 +1819,8 @@ xlog_recover_do_inode_buffer(
1814 1819
1815 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1820 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1816 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1821 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1817 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1822 ASSERT((reg_buf_offset + reg_buf_bytes) <=
1823 BBTOB(bp->b_io_length));
1818 1824
1819 /* 1825 /*
1820 * The current logged region contains a copy of the 1826 * The current logged region contains a copy of the
@@ -1873,8 +1879,8 @@ xlog_recover_do_reg_buffer(
1873 ASSERT(nbits > 0); 1879 ASSERT(nbits > 0);
1874 ASSERT(item->ri_buf[i].i_addr != NULL); 1880 ASSERT(item->ri_buf[i].i_addr != NULL);
1875 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1881 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1876 ASSERT(XFS_BUF_COUNT(bp) >= 1882 ASSERT(BBTOB(bp->b_io_length) >=
1877 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); 1883 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
1878 1884
1879 /* 1885 /*
1880 * Do a sanity check if this is a dquot buffer. Just checking 1886 * Do a sanity check if this is a dquot buffer. Just checking
@@ -2103,6 +2109,7 @@ xlog_recover_do_dquot_buffer(
2103STATIC int 2109STATIC int
2104xlog_recover_buffer_pass2( 2110xlog_recover_buffer_pass2(
2105 xlog_t *log, 2111 xlog_t *log,
2112 struct list_head *buffer_list,
2106 xlog_recover_item_t *item) 2113 xlog_recover_item_t *item)
2107{ 2114{
2108 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2115 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
@@ -2123,9 +2130,9 @@ xlog_recover_buffer_pass2(
2123 2130
2124 trace_xfs_log_recover_buf_recover(log, buf_f); 2131 trace_xfs_log_recover_buf_recover(log, buf_f);
2125 2132
2126 buf_flags = XBF_LOCK; 2133 buf_flags = 0;
2127 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) 2134 if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2128 buf_flags |= XBF_MAPPED; 2135 buf_flags |= XBF_UNMAPPED;
2129 2136
2130 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2137 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2131 buf_flags); 2138 buf_flags);
@@ -2166,14 +2173,14 @@ xlog_recover_buffer_pass2(
2166 */ 2173 */
2167 if (XFS_DINODE_MAGIC == 2174 if (XFS_DINODE_MAGIC ==
2168 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 2175 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2169 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, 2176 (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
2170 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { 2177 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2171 xfs_buf_stale(bp); 2178 xfs_buf_stale(bp);
2172 error = xfs_bwrite(bp); 2179 error = xfs_bwrite(bp);
2173 } else { 2180 } else {
2174 ASSERT(bp->b_target->bt_mount == mp); 2181 ASSERT(bp->b_target->bt_mount == mp);
2175 bp->b_iodone = xlog_recover_iodone; 2182 bp->b_iodone = xlog_recover_iodone;
2176 xfs_buf_delwri_queue(bp); 2183 xfs_buf_delwri_queue(bp, buffer_list);
2177 } 2184 }
2178 2185
2179 xfs_buf_relse(bp); 2186 xfs_buf_relse(bp);
@@ -2183,6 +2190,7 @@ xlog_recover_buffer_pass2(
2183STATIC int 2190STATIC int
2184xlog_recover_inode_pass2( 2191xlog_recover_inode_pass2(
2185 xlog_t *log, 2192 xlog_t *log,
2193 struct list_head *buffer_list,
2186 xlog_recover_item_t *item) 2194 xlog_recover_item_t *item)
2187{ 2195{
2188 xfs_inode_log_format_t *in_f; 2196 xfs_inode_log_format_t *in_f;
@@ -2220,8 +2228,7 @@ xlog_recover_inode_pass2(
2220 } 2228 }
2221 trace_xfs_log_recover_inode_recover(log, in_f); 2229 trace_xfs_log_recover_inode_recover(log, in_f);
2222 2230
2223 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2231 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0);
2224 XBF_LOCK);
2225 if (!bp) { 2232 if (!bp) {
2226 error = ENOMEM; 2233 error = ENOMEM;
2227 goto error; 2234 goto error;
@@ -2436,7 +2443,7 @@ xlog_recover_inode_pass2(
2436write_inode_buffer: 2443write_inode_buffer:
2437 ASSERT(bp->b_target->bt_mount == mp); 2444 ASSERT(bp->b_target->bt_mount == mp);
2438 bp->b_iodone = xlog_recover_iodone; 2445 bp->b_iodone = xlog_recover_iodone;
2439 xfs_buf_delwri_queue(bp); 2446 xfs_buf_delwri_queue(bp, buffer_list);
2440 xfs_buf_relse(bp); 2447 xfs_buf_relse(bp);
2441error: 2448error:
2442 if (need_free) 2449 if (need_free)
@@ -2477,6 +2484,7 @@ xlog_recover_quotaoff_pass1(
2477STATIC int 2484STATIC int
2478xlog_recover_dquot_pass2( 2485xlog_recover_dquot_pass2(
2479 xlog_t *log, 2486 xlog_t *log,
2487 struct list_head *buffer_list,
2480 xlog_recover_item_t *item) 2488 xlog_recover_item_t *item)
2481{ 2489{
2482 xfs_mount_t *mp = log->l_mp; 2490 xfs_mount_t *mp = log->l_mp;
@@ -2530,14 +2538,11 @@ xlog_recover_dquot_pass2(
2530 return XFS_ERROR(EIO); 2538 return XFS_ERROR(EIO);
2531 ASSERT(dq_f->qlf_len == 1); 2539 ASSERT(dq_f->qlf_len == 1);
2532 2540
2533 error = xfs_read_buf(mp, mp->m_ddev_targp, 2541 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2534 dq_f->qlf_blkno, 2542 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp);
2535 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 2543 if (error)
2536 0, &bp);
2537 if (error) {
2538 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)");
2539 return error; 2544 return error;
2540 } 2545
2541 ASSERT(bp); 2546 ASSERT(bp);
2542 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); 2547 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2543 2548
@@ -2558,7 +2563,7 @@ xlog_recover_dquot_pass2(
2558 ASSERT(dq_f->qlf_size == 2); 2563 ASSERT(dq_f->qlf_size == 2);
2559 ASSERT(bp->b_target->bt_mount == mp); 2564 ASSERT(bp->b_target->bt_mount == mp);
2560 bp->b_iodone = xlog_recover_iodone; 2565 bp->b_iodone = xlog_recover_iodone;
2561 xfs_buf_delwri_queue(bp); 2566 xfs_buf_delwri_queue(bp, buffer_list);
2562 xfs_buf_relse(bp); 2567 xfs_buf_relse(bp);
2563 2568
2564 return (0); 2569 return (0);
@@ -2642,7 +2647,8 @@ xlog_recover_efd_pass2(
2642 * xfs_trans_ail_delete() drops the 2647 * xfs_trans_ail_delete() drops the
2643 * AIL lock. 2648 * AIL lock.
2644 */ 2649 */
2645 xfs_trans_ail_delete(ailp, lip); 2650 xfs_trans_ail_delete(ailp, lip,
2651 SHUTDOWN_CORRUPT_INCORE);
2646 xfs_efi_item_free(efip); 2652 xfs_efi_item_free(efip);
2647 spin_lock(&ailp->xa_lock); 2653 spin_lock(&ailp->xa_lock);
2648 break; 2654 break;
@@ -2712,21 +2718,22 @@ STATIC int
2712xlog_recover_commit_pass2( 2718xlog_recover_commit_pass2(
2713 struct log *log, 2719 struct log *log,
2714 struct xlog_recover *trans, 2720 struct xlog_recover *trans,
2721 struct list_head *buffer_list,
2715 xlog_recover_item_t *item) 2722 xlog_recover_item_t *item)
2716{ 2723{
2717 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); 2724 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2718 2725
2719 switch (ITEM_TYPE(item)) { 2726 switch (ITEM_TYPE(item)) {
2720 case XFS_LI_BUF: 2727 case XFS_LI_BUF:
2721 return xlog_recover_buffer_pass2(log, item); 2728 return xlog_recover_buffer_pass2(log, buffer_list, item);
2722 case XFS_LI_INODE: 2729 case XFS_LI_INODE:
2723 return xlog_recover_inode_pass2(log, item); 2730 return xlog_recover_inode_pass2(log, buffer_list, item);
2724 case XFS_LI_EFI: 2731 case XFS_LI_EFI:
2725 return xlog_recover_efi_pass2(log, item, trans->r_lsn); 2732 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2726 case XFS_LI_EFD: 2733 case XFS_LI_EFD:
2727 return xlog_recover_efd_pass2(log, item); 2734 return xlog_recover_efd_pass2(log, item);
2728 case XFS_LI_DQUOT: 2735 case XFS_LI_DQUOT:
2729 return xlog_recover_dquot_pass2(log, item); 2736 return xlog_recover_dquot_pass2(log, buffer_list, item);
2730 case XFS_LI_QUOTAOFF: 2737 case XFS_LI_QUOTAOFF:
2731 /* nothing to do in pass2 */ 2738 /* nothing to do in pass2 */
2732 return 0; 2739 return 0;
@@ -2750,8 +2757,9 @@ xlog_recover_commit_trans(
2750 struct xlog_recover *trans, 2757 struct xlog_recover *trans,
2751 int pass) 2758 int pass)
2752{ 2759{
2753 int error = 0; 2760 int error = 0, error2;
2754 xlog_recover_item_t *item; 2761 xlog_recover_item_t *item;
2762 LIST_HEAD (buffer_list);
2755 2763
2756 hlist_del(&trans->r_list); 2764 hlist_del(&trans->r_list);
2757 2765
@@ -2760,16 +2768,27 @@ xlog_recover_commit_trans(
2760 return error; 2768 return error;
2761 2769
2762 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2770 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2763 if (pass == XLOG_RECOVER_PASS1) 2771 switch (pass) {
2772 case XLOG_RECOVER_PASS1:
2764 error = xlog_recover_commit_pass1(log, trans, item); 2773 error = xlog_recover_commit_pass1(log, trans, item);
2765 else 2774 break;
2766 error = xlog_recover_commit_pass2(log, trans, item); 2775 case XLOG_RECOVER_PASS2:
2776 error = xlog_recover_commit_pass2(log, trans,
2777 &buffer_list, item);
2778 break;
2779 default:
2780 ASSERT(0);
2781 }
2782
2767 if (error) 2783 if (error)
2768 return error; 2784 goto out;
2769 } 2785 }
2770 2786
2771 xlog_recover_free_trans(trans); 2787 xlog_recover_free_trans(trans);
2772 return 0; 2788
2789out:
2790 error2 = xfs_buf_delwri_submit(&buffer_list);
2791 return error ? error : error2;
2773} 2792}
2774 2793
2775STATIC int 2794STATIC int
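
The recovery changes above replace per-buffer delayed-write queuing with a local buffer_list that each pass2 handler adds to and that xlog_recover_commit_trans() submits once per transaction. A minimal sketch of the pattern with a hypothetical caller (the block handling and the modification step are illustrative; the xfs_buf calls are the interfaces used above):

	STATIC int
	xlog_example_batch_write(
		struct xfs_mount	*mp,
		xfs_daddr_t		blkno,
		int			numblks)
	{
		LIST_HEAD		(buffer_list);
		struct xfs_buf		*bp;

		bp = xfs_buf_read(mp->m_ddev_targp, blkno, numblks, 0);
		if (!bp)
			return ENOMEM;

		/* ... apply recovered changes to bp->b_addr here ... */

		bp->b_iodone = xlog_recover_iodone;
		xfs_buf_delwri_queue(bp, &buffer_list);	/* queue only, no I/O yet */
		xfs_buf_relse(bp);

		/* a single submission writes and waits for everything queued */
		return xfs_buf_delwri_submit(&buffer_list);
	}

Deferring the submission lets an entire recovered transaction reach disk as one batch instead of trickling out through the old global delayed-write queue.
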
@@ -3079,7 +3098,7 @@ xlog_recover_process_one_iunlink(
3079 /* 3098 /*
3080 * Get the on disk inode to find the next inode in the bucket. 3099 * Get the on disk inode to find the next inode in the bucket.
3081 */ 3100 */
3082 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); 3101 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0);
3083 if (error) 3102 if (error)
3084 goto fail_iput; 3103 goto fail_iput;
3085 3104
@@ -3639,11 +3658,8 @@ xlog_do_recover(
3639 * First replay the images in the log. 3658 * First replay the images in the log.
3640 */ 3659 */
3641 error = xlog_do_log_recovery(log, head_blk, tail_blk); 3660 error = xlog_do_log_recovery(log, head_blk, tail_blk);
3642 if (error) { 3661 if (error)
3643 return error; 3662 return error;
3644 }
3645
3646 xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
3647 3663
3648 /* 3664 /*
3649 * If IO errors happened during recovery, bail out. 3665 * If IO errors happened during recovery, bail out.
@@ -3670,7 +3686,6 @@ xlog_do_recover(
3670 bp = xfs_getsb(log->l_mp, 0); 3686 bp = xfs_getsb(log->l_mp, 0);
3671 XFS_BUF_UNDONE(bp); 3687 XFS_BUF_UNDONE(bp);
3672 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3688 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3673 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
3674 XFS_BUF_READ(bp); 3689 XFS_BUF_READ(bp);
3675 XFS_BUF_UNASYNC(bp); 3690 XFS_BUF_UNASYNC(bp);
3676 xfsbdstrat(log->l_mp, bp); 3691 xfsbdstrat(log->l_mp, bp);
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index bd672def95ac..331cd9f83a7f 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 1ffead4b2296..536021fb3d4e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -22,6 +22,7 @@
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_dir2.h" 28#include "xfs_dir2.h"
@@ -37,7 +38,6 @@
37#include "xfs_rtalloc.h" 38#include "xfs_rtalloc.h"
38#include "xfs_bmap.h" 39#include "xfs_bmap.h"
39#include "xfs_error.h" 40#include "xfs_error.h"
40#include "xfs_rw.h"
41#include "xfs_quota.h" 41#include "xfs_quota.h"
42#include "xfs_fsops.h" 42#include "xfs_fsops.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
@@ -683,8 +683,8 @@ xfs_readsb(xfs_mount_t *mp, int flags)
683 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 683 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
684 684
685reread: 685reread:
686 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 686 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
687 XFS_SB_DADDR, sector_size, 0); 687 BTOBB(sector_size), 0);
688 if (!bp) { 688 if (!bp) {
689 if (loud) 689 if (loud)
690 xfs_warn(mp, "SB buffer read failed"); 690 xfs_warn(mp, "SB buffer read failed");
@@ -1032,9 +1032,9 @@ xfs_check_sizes(xfs_mount_t *mp)
1032 xfs_warn(mp, "filesystem size mismatch detected"); 1032 xfs_warn(mp, "filesystem size mismatch detected");
1033 return XFS_ERROR(EFBIG); 1033 return XFS_ERROR(EFBIG);
1034 } 1034 }
1035 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 1035 bp = xfs_buf_read_uncached(mp->m_ddev_targp,
1036 d - XFS_FSS_TO_BB(mp, 1), 1036 d - XFS_FSS_TO_BB(mp, 1),
1037 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); 1037 XFS_FSS_TO_BB(mp, 1), 0);
1038 if (!bp) { 1038 if (!bp) {
1039 xfs_warn(mp, "last sector read failed"); 1039 xfs_warn(mp, "last sector read failed");
1040 return EIO; 1040 return EIO;
@@ -1047,9 +1047,9 @@ xfs_check_sizes(xfs_mount_t *mp)
1047 xfs_warn(mp, "log size mismatch detected"); 1047 xfs_warn(mp, "log size mismatch detected");
1048 return XFS_ERROR(EFBIG); 1048 return XFS_ERROR(EFBIG);
1049 } 1049 }
1050 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, 1050 bp = xfs_buf_read_uncached(mp->m_logdev_targp,
1051 d - XFS_FSB_TO_BB(mp, 1), 1051 d - XFS_FSB_TO_BB(mp, 1),
1052 XFS_FSB_TO_B(mp, 1), 0); 1052 XFS_FSB_TO_BB(mp, 1), 0);
1053 if (!bp) { 1053 if (!bp) {
1054 xfs_warn(mp, "log device read failed"); 1054 xfs_warn(mp, "log device read failed");
1055 return EIO; 1055 return EIO;
@@ -1288,7 +1288,7 @@ xfs_mountfs(
1288 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1288 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1289 if (error) { 1289 if (error) {
1290 xfs_warn(mp, "log mount failed"); 1290 xfs_warn(mp, "log mount failed");
1291 goto out_free_perag; 1291 goto out_fail_wait;
1292 } 1292 }
1293 1293
1294 /* 1294 /*
@@ -1315,7 +1315,7 @@ xfs_mountfs(
1315 !mp->m_sb.sb_inprogress) { 1315 !mp->m_sb.sb_inprogress) {
1316 error = xfs_initialize_perag_data(mp, sbp->sb_agcount); 1316 error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
1317 if (error) 1317 if (error)
1318 goto out_free_perag; 1318 goto out_fail_wait;
1319 } 1319 }
1320 1320
1321 /* 1321 /*
@@ -1439,6 +1439,10 @@ xfs_mountfs(
1439 IRELE(rip); 1439 IRELE(rip);
1440 out_log_dealloc: 1440 out_log_dealloc:
1441 xfs_log_unmount(mp); 1441 xfs_log_unmount(mp);
1442 out_fail_wait:
1443 if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
1444 xfs_wait_buftarg(mp->m_logdev_targp);
1445 xfs_wait_buftarg(mp->m_ddev_targp);
1442 out_free_perag: 1446 out_free_perag:
1443 xfs_free_perag(mp); 1447 xfs_free_perag(mp);
1444 out_remove_uuid: 1448 out_remove_uuid:
@@ -1475,15 +1479,15 @@ xfs_unmountfs(
1475 xfs_log_force(mp, XFS_LOG_SYNC); 1479 xfs_log_force(mp, XFS_LOG_SYNC);
1476 1480
1477 /* 1481 /*
1478 * Do a delwri reclaim pass first so that as many dirty inodes are 1482 * Flush all pending changes from the AIL.
1479 * queued up for IO as possible. Then flush the buffers before making 1483 */
1480 * a synchronous path to catch all the remaining inodes are reclaimed. 1484 xfs_ail_push_all_sync(mp->m_ail);
1481 * This makes the reclaim process as quick as possible by avoiding 1485
1482 * synchronous writeout and blocking on inodes already in the delwri 1486 /*
1483 * state as much as possible. 1487 * And reclaim all inodes. At this point there should be no dirty
1488 * inode, and none should be pinned or locked, but use synchronous
1489 * reclaim just to be sure.
1484 */ 1490 */
1485 xfs_reclaim_inodes(mp, 0);
1486 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1487 xfs_reclaim_inodes(mp, SYNC_WAIT); 1491 xfs_reclaim_inodes(mp, SYNC_WAIT);
1488 1492
1489 xfs_qm_unmount(mp); 1493 xfs_qm_unmount(mp);
@@ -1519,15 +1523,12 @@ xfs_unmountfs(
1519 if (error) 1523 if (error)
1520 xfs_warn(mp, "Unable to update superblock counters. " 1524 xfs_warn(mp, "Unable to update superblock counters. "
1521 "Freespace may not be correct on next mount."); 1525 "Freespace may not be correct on next mount.");
1522 xfs_unmountfs_writesb(mp);
1523 1526
1524 /* 1527 /*
1525 * Make sure all buffers have been flushed and completed before 1528 * At this point we might have modified the superblock again and thus
1526 * unmounting the log. 1529 * added an item to the AIL, thus flush it again.
1527 */ 1530 */
1528 error = xfs_flush_buftarg(mp->m_ddev_targp, 1); 1531 xfs_ail_push_all_sync(mp->m_ail);
1529 if (error)
1530 xfs_warn(mp, "%d busy buffers during unmount.", error);
1531 xfs_wait_buftarg(mp->m_ddev_targp); 1532 xfs_wait_buftarg(mp->m_ddev_targp);
1532 1533
1533 xfs_log_unmount_write(mp); 1534 xfs_log_unmount_write(mp);
@@ -1588,36 +1589,6 @@ xfs_log_sbcount(xfs_mount_t *mp)
1588 return error; 1589 return error;
1589} 1590}
1590 1591
1591int
1592xfs_unmountfs_writesb(xfs_mount_t *mp)
1593{
1594 xfs_buf_t *sbp;
1595 int error = 0;
1596
1597 /*
1598 * skip superblock write if fs is read-only, or
1599 * if we are doing a forced umount.
1600 */
1601 if (!((mp->m_flags & XFS_MOUNT_RDONLY) ||
1602 XFS_FORCED_SHUTDOWN(mp))) {
1603
1604 sbp = xfs_getsb(mp, 0);
1605
1606 XFS_BUF_UNDONE(sbp);
1607 XFS_BUF_UNREAD(sbp);
1608 xfs_buf_delwri_dequeue(sbp);
1609 XFS_BUF_WRITE(sbp);
1610 XFS_BUF_UNASYNC(sbp);
1611 ASSERT(sbp->b_target == mp->m_ddev_targp);
1612 xfsbdstrat(mp, sbp);
1613 error = xfs_buf_iowait(sbp);
1614 if (error)
1615 xfs_buf_ioerror_alert(sbp, __func__);
1616 xfs_buf_relse(sbp);
1617 }
1618 return error;
1619}
1620
1621/* 1592/*
1622 * xfs_mod_sb() can be used to copy arbitrary changes to the 1593 * xfs_mod_sb() can be used to copy arbitrary changes to the
1623 * in-core superblock into the superblock buffer to be logged. 1594 * in-core superblock into the superblock buffer to be logged.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9eba73887829..8b89c5ac72d9 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -214,6 +214,7 @@ typedef struct xfs_mount {
214 214
215 struct workqueue_struct *m_data_workqueue; 215 struct workqueue_struct *m_data_workqueue;
216 struct workqueue_struct *m_unwritten_workqueue; 216 struct workqueue_struct *m_unwritten_workqueue;
217 struct workqueue_struct *m_cil_workqueue;
217} xfs_mount_t; 218} xfs_mount_t;
218 219
219/* 220/*
@@ -378,7 +379,6 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
378extern int xfs_mountfs(xfs_mount_t *mp); 379extern int xfs_mountfs(xfs_mount_t *mp);
379 380
380extern void xfs_unmountfs(xfs_mount_t *); 381extern void xfs_unmountfs(xfs_mount_t *);
381extern int xfs_unmountfs_writesb(xfs_mount_t *);
382extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 382extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
383extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 383extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
384 uint, int); 384 uint, int);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 55c6afedc879..249db1987764 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_bit.h" 20#include "xfs_bit.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h" 24#include "xfs_ag.h"
@@ -65,7 +64,8 @@ STATIC int
65xfs_qm_dquot_walk( 64xfs_qm_dquot_walk(
66 struct xfs_mount *mp, 65 struct xfs_mount *mp,
67 int type, 66 int type,
68 int (*execute)(struct xfs_dquot *dqp)) 67 int (*execute)(struct xfs_dquot *dqp, void *data),
68 void *data)
69{ 69{
70 struct xfs_quotainfo *qi = mp->m_quotainfo; 70 struct xfs_quotainfo *qi = mp->m_quotainfo;
71 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 71 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
@@ -97,7 +97,7 @@ restart:
97 97
98 next_index = be32_to_cpu(dqp->q_core.d_id) + 1; 98 next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
99 99
100 error = execute(batch[i]); 100 error = execute(batch[i], data);
101 if (error == EAGAIN) { 101 if (error == EAGAIN) {
102 skipped++; 102 skipped++;
103 continue; 103 continue;
@@ -129,7 +129,8 @@ restart:
129 */ 129 */
130STATIC int 130STATIC int
131xfs_qm_dqpurge( 131xfs_qm_dqpurge(
132 struct xfs_dquot *dqp) 132 struct xfs_dquot *dqp,
133 void *data)
133{ 134{
134 struct xfs_mount *mp = dqp->q_mount; 135 struct xfs_mount *mp = dqp->q_mount;
135 struct xfs_quotainfo *qi = mp->m_quotainfo; 136 struct xfs_quotainfo *qi = mp->m_quotainfo;
@@ -153,21 +154,7 @@ xfs_qm_dqpurge(
153 154
154 dqp->dq_flags |= XFS_DQ_FREEING; 155 dqp->dq_flags |= XFS_DQ_FREEING;
155 156
156 /* 157 xfs_dqflock(dqp);
157 * If we're turning off quotas, we have to make sure that, for
158 * example, we don't delete quota disk blocks while dquots are
159 * in the process of getting written to those disk blocks.
160 * This dquot might well be on AIL, and we can't leave it there
161 * if we're turning off quotas. Basically, we need this flush
162 * lock, and are willing to block on it.
163 */
164 if (!xfs_dqflock_nowait(dqp)) {
165 /*
166 * Block on the flush lock after nudging dquot buffer,
167 * if it is incore.
168 */
169 xfs_dqflock_pushbuf_wait(dqp);
170 }
171 158
172 /* 159 /*
173 * If we are turning this type of quotas off, we don't care 160 * If we are turning this type of quotas off, we don't care
@@ -175,16 +162,21 @@ xfs_qm_dqpurge(
175 * we're unmounting, we do care, so we flush it and wait. 162 * we're unmounting, we do care, so we flush it and wait.
176 */ 163 */
177 if (XFS_DQ_IS_DIRTY(dqp)) { 164 if (XFS_DQ_IS_DIRTY(dqp)) {
178 int error; 165 struct xfs_buf *bp = NULL;
166 int error;
179 167
180 /* 168 /*
181 * We don't care about getting disk errors here. We need 169 * We don't care about getting disk errors here. We need
182 * to purge this dquot anyway, so we go ahead regardless. 170 * to purge this dquot anyway, so we go ahead regardless.
183 */ 171 */
184 error = xfs_qm_dqflush(dqp, SYNC_WAIT); 172 error = xfs_qm_dqflush(dqp, &bp);
185 if (error) 173 if (error) {
186 xfs_warn(mp, "%s: dquot %p flush failed", 174 xfs_warn(mp, "%s: dquot %p flush failed",
187 __func__, dqp); 175 __func__, dqp);
176 } else {
177 error = xfs_bwrite(bp);
178 xfs_buf_relse(bp);
179 }
188 xfs_dqflock(dqp); 180 xfs_dqflock(dqp);
189 } 181 }
190 182
@@ -226,11 +218,11 @@ xfs_qm_dqpurge_all(
226 uint flags) 218 uint flags)
227{ 219{
228 if (flags & XFS_QMOPT_UQUOTA) 220 if (flags & XFS_QMOPT_UQUOTA)
229 xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge); 221 xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
230 if (flags & XFS_QMOPT_GQUOTA) 222 if (flags & XFS_QMOPT_GQUOTA)
231 xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge); 223 xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
232 if (flags & XFS_QMOPT_PQUOTA) 224 if (flags & XFS_QMOPT_PQUOTA)
233 xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge); 225 xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
234} 226}
235 227
236/* 228/*
@@ -483,6 +475,23 @@ done:
483 xfs_dqunlock(udq); 475 xfs_dqunlock(udq);
484} 476}
485 477
478static bool
479xfs_qm_need_dqattach(
480 struct xfs_inode *ip)
481{
482 struct xfs_mount *mp = ip->i_mount;
483
484 if (!XFS_IS_QUOTA_RUNNING(mp))
485 return false;
486 if (!XFS_IS_QUOTA_ON(mp))
487 return false;
488 if (!XFS_NOT_DQATTACHED(mp, ip))
489 return false;
490 if (ip->i_ino == mp->m_sb.sb_uquotino ||
491 ip->i_ino == mp->m_sb.sb_gquotino)
492 return false;
493 return true;
494}
486 495
487/* 496/*
488 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON 497 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
@@ -500,11 +509,7 @@ xfs_qm_dqattach_locked(
500 uint nquotas = 0; 509 uint nquotas = 0;
501 int error = 0; 510 int error = 0;
502 511
503 if (!XFS_IS_QUOTA_RUNNING(mp) || 512 if (!xfs_qm_need_dqattach(ip))
504 !XFS_IS_QUOTA_ON(mp) ||
505 !XFS_NOT_DQATTACHED(mp, ip) ||
506 ip->i_ino == mp->m_sb.sb_uquotino ||
507 ip->i_ino == mp->m_sb.sb_gquotino)
508 return 0; 513 return 0;
509 514
510 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 515 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -575,6 +580,9 @@ xfs_qm_dqattach(
575{ 580{
576 int error; 581 int error;
577 582
583 if (!xfs_qm_need_dqattach(ip))
584 return 0;
585
578 xfs_ilock(ip, XFS_ILOCK_EXCL); 586 xfs_ilock(ip, XFS_ILOCK_EXCL);
579 error = xfs_qm_dqattach_locked(ip, flags); 587 error = xfs_qm_dqattach_locked(ip, flags);
580 xfs_iunlock(ip, XFS_ILOCK_EXCL); 588 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -855,15 +863,16 @@ xfs_qm_reset_dqcounts(
855 863
856STATIC int 864STATIC int
857xfs_qm_dqiter_bufs( 865xfs_qm_dqiter_bufs(
858 xfs_mount_t *mp, 866 struct xfs_mount *mp,
859 xfs_dqid_t firstid, 867 xfs_dqid_t firstid,
860 xfs_fsblock_t bno, 868 xfs_fsblock_t bno,
861 xfs_filblks_t blkcnt, 869 xfs_filblks_t blkcnt,
862 uint flags) 870 uint flags,
871 struct list_head *buffer_list)
863{ 872{
864 xfs_buf_t *bp; 873 struct xfs_buf *bp;
865 int error; 874 int error;
866 int type; 875 int type;
867 876
868 ASSERT(blkcnt > 0); 877 ASSERT(blkcnt > 0);
869 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : 878 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
@@ -887,7 +896,7 @@ xfs_qm_dqiter_bufs(
887 break; 896 break;
888 897
889 xfs_qm_reset_dqcounts(mp, bp, firstid, type); 898 xfs_qm_reset_dqcounts(mp, bp, firstid, type);
890 xfs_buf_delwri_queue(bp); 899 xfs_buf_delwri_queue(bp, buffer_list);
891 xfs_buf_relse(bp); 900 xfs_buf_relse(bp);
892 /* 901 /*
893 * goto the next block. 902 * goto the next block.
@@ -895,6 +904,7 @@ xfs_qm_dqiter_bufs(
895 bno++; 904 bno++;
896 firstid += mp->m_quotainfo->qi_dqperchunk; 905 firstid += mp->m_quotainfo->qi_dqperchunk;
897 } 906 }
907
898 return error; 908 return error;
899} 909}
900 910
@@ -904,11 +914,12 @@ xfs_qm_dqiter_bufs(
904 */ 914 */
905STATIC int 915STATIC int
906xfs_qm_dqiterate( 916xfs_qm_dqiterate(
907 xfs_mount_t *mp, 917 struct xfs_mount *mp,
908 xfs_inode_t *qip, 918 struct xfs_inode *qip,
909 uint flags) 919 uint flags,
920 struct list_head *buffer_list)
910{ 921{
911 xfs_bmbt_irec_t *map; 922 struct xfs_bmbt_irec *map;
912 int i, nmaps; /* number of map entries */ 923 int i, nmaps; /* number of map entries */
913 int error; /* return value */ 924 int error; /* return value */
914 xfs_fileoff_t lblkno; 925 xfs_fileoff_t lblkno;
@@ -975,21 +986,17 @@ xfs_qm_dqiterate(
975 * Iterate thru all the blks in the extent and 986 * Iterate thru all the blks in the extent and
976 * reset the counters of all the dquots inside them. 987 * reset the counters of all the dquots inside them.
977 */ 988 */
978 if ((error = xfs_qm_dqiter_bufs(mp, 989 error = xfs_qm_dqiter_bufs(mp, firstid,
979 firstid, 990 map[i].br_startblock,
980 map[i].br_startblock, 991 map[i].br_blockcount,
981 map[i].br_blockcount, 992 flags, buffer_list);
982 flags))) { 993 if (error)
983 break; 994 goto out;
984 }
985 } 995 }
986
987 if (error)
988 break;
989 } while (nmaps > 0); 996 } while (nmaps > 0);
990 997
998out:
991 kmem_free(map); 999 kmem_free(map);
992
993 return error; 1000 return error;
994} 1001}
995 1002
@@ -1182,8 +1189,11 @@ error0:
1182 1189
1183STATIC int 1190STATIC int
1184xfs_qm_flush_one( 1191xfs_qm_flush_one(
1185 struct xfs_dquot *dqp) 1192 struct xfs_dquot *dqp,
1193 void *data)
1186{ 1194{
1195 struct list_head *buffer_list = data;
1196 struct xfs_buf *bp = NULL;
1187 int error = 0; 1197 int error = 0;
1188 1198
1189 xfs_dqlock(dqp); 1199 xfs_dqlock(dqp);
@@ -1192,11 +1202,13 @@ xfs_qm_flush_one(
1192 if (!XFS_DQ_IS_DIRTY(dqp)) 1202 if (!XFS_DQ_IS_DIRTY(dqp))
1193 goto out_unlock; 1203 goto out_unlock;
1194 1204
1195 if (!xfs_dqflock_nowait(dqp)) 1205 xfs_dqflock(dqp);
1196 xfs_dqflock_pushbuf_wait(dqp); 1206 error = xfs_qm_dqflush(dqp, &bp);
1197 1207 if (error)
1198 error = xfs_qm_dqflush(dqp, 0); 1208 goto out_unlock;
1199 1209
1210 xfs_buf_delwri_queue(bp, buffer_list);
1211 xfs_buf_relse(bp);
1200out_unlock: 1212out_unlock:
1201 xfs_dqunlock(dqp); 1213 xfs_dqunlock(dqp);
1202 return error; 1214 return error;
@@ -1215,6 +1227,7 @@ xfs_qm_quotacheck(
1215 size_t structsz; 1227 size_t structsz;
1216 xfs_inode_t *uip, *gip; 1228 xfs_inode_t *uip, *gip;
1217 uint flags; 1229 uint flags;
1230 LIST_HEAD (buffer_list);
1218 1231
1219 count = INT_MAX; 1232 count = INT_MAX;
1220 structsz = 1; 1233 structsz = 1;
@@ -1233,7 +1246,8 @@ xfs_qm_quotacheck(
1233 */ 1246 */
1234 uip = mp->m_quotainfo->qi_uquotaip; 1247 uip = mp->m_quotainfo->qi_uquotaip;
1235 if (uip) { 1248 if (uip) {
1236 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); 1249 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
1250 &buffer_list);
1237 if (error) 1251 if (error)
1238 goto error_return; 1252 goto error_return;
1239 flags |= XFS_UQUOTA_CHKD; 1253 flags |= XFS_UQUOTA_CHKD;
@@ -1242,7 +1256,8 @@ xfs_qm_quotacheck(
1242 gip = mp->m_quotainfo->qi_gquotaip; 1256 gip = mp->m_quotainfo->qi_gquotaip;
1243 if (gip) { 1257 if (gip) {
1244 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1258 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1245 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); 1259 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
1260 &buffer_list);
1246 if (error) 1261 if (error)
1247 goto error_return; 1262 goto error_return;
1248 flags |= XFS_OQUOTA_CHKD; 1263 flags |= XFS_OQUOTA_CHKD;
@@ -1265,19 +1280,27 @@ xfs_qm_quotacheck(
1265 * We've made all the changes that we need to make incore. Flush them 1280 * We've made all the changes that we need to make incore. Flush them
1266 * down to disk buffers if everything was updated successfully. 1281 * down to disk buffers if everything was updated successfully.
1267 */ 1282 */
1268 if (XFS_IS_UQUOTA_ON(mp)) 1283 if (XFS_IS_UQUOTA_ON(mp)) {
1269 error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one); 1284 error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
1285 &buffer_list);
1286 }
1270 if (XFS_IS_GQUOTA_ON(mp)) { 1287 if (XFS_IS_GQUOTA_ON(mp)) {
1271 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one); 1288 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
1289 &buffer_list);
1272 if (!error) 1290 if (!error)
1273 error = error2; 1291 error = error2;
1274 } 1292 }
1275 if (XFS_IS_PQUOTA_ON(mp)) { 1293 if (XFS_IS_PQUOTA_ON(mp)) {
1276 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one); 1294 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
1295 &buffer_list);
1277 if (!error) 1296 if (!error)
1278 error = error2; 1297 error = error2;
1279 } 1298 }
1280 1299
1300 error2 = xfs_buf_delwri_submit(&buffer_list);
1301 if (!error)
1302 error = error2;
1303
1281 /* 1304 /*
1282 * We can get this error if we couldn't do a dquot allocation inside 1305 * We can get this error if we couldn't do a dquot allocation inside
1283 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the 1306 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
@@ -1291,15 +1314,6 @@ xfs_qm_quotacheck(
1291 } 1314 }
1292 1315
1293 /* 1316 /*
1294 * We didn't log anything, because if we crashed, we'll have to
1295 * start the quotacheck from scratch anyway. However, we must make
1296 * sure that our dquot changes are secure before we put the
1297 * quotacheck'd stamp on the superblock. So, here we do a synchronous
1298 * flush.
1299 */
1300 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1301
1302 /*
1303 * If one type of quotas is off, then it will lose its 1317 * If one type of quotas is off, then it will lose its
1304 * quotachecked status, since we won't be doing accounting for 1318 * quotachecked status, since we won't be doing accounting for
1305 * that type anymore. 1319 * that type anymore.
@@ -1308,6 +1322,13 @@ xfs_qm_quotacheck(
1308 mp->m_qflags |= flags; 1322 mp->m_qflags |= flags;
1309 1323
1310 error_return: 1324 error_return:
1325 while (!list_empty(&buffer_list)) {
1326 struct xfs_buf *bp =
1327 list_first_entry(&buffer_list, struct xfs_buf, b_list);
1328 list_del_init(&bp->b_list);
1329 xfs_buf_relse(bp);
1330 }
1331
1311 if (error) { 1332 if (error) {
1312 xfs_warn(mp, 1333 xfs_warn(mp,
1313 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", 1334 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
@@ -1424,6 +1445,7 @@ xfs_qm_dqfree_one(
1424STATIC void 1445STATIC void
1425xfs_qm_dqreclaim_one( 1446xfs_qm_dqreclaim_one(
1426 struct xfs_dquot *dqp, 1447 struct xfs_dquot *dqp,
1448 struct list_head *buffer_list,
1427 struct list_head *dispose_list) 1449 struct list_head *dispose_list)
1428{ 1450{
1429 struct xfs_mount *mp = dqp->q_mount; 1451 struct xfs_mount *mp = dqp->q_mount;
@@ -1456,25 +1478,20 @@ xfs_qm_dqreclaim_one(
1456 if (!xfs_dqflock_nowait(dqp)) 1478 if (!xfs_dqflock_nowait(dqp))
1457 goto out_busy; 1479 goto out_busy;
1458 1480
1459 /*
1460 * We have the flush lock so we know that this is not in the
1461 * process of being flushed. So, if this is dirty, flush it
1462 * DELWRI so that we don't get a freelist infested with
1463 * dirty dquots.
1464 */
1465 if (XFS_DQ_IS_DIRTY(dqp)) { 1481 if (XFS_DQ_IS_DIRTY(dqp)) {
1482 struct xfs_buf *bp = NULL;
1483
1466 trace_xfs_dqreclaim_dirty(dqp); 1484 trace_xfs_dqreclaim_dirty(dqp);
1467 1485
1468 /* 1486 error = xfs_qm_dqflush(dqp, &bp);
1469 * We flush it delayed write, so don't bother releasing the
1470 * freelist lock.
1471 */
1472 error = xfs_qm_dqflush(dqp, 0);
1473 if (error) { 1487 if (error) {
1474 xfs_warn(mp, "%s: dquot %p flush failed", 1488 xfs_warn(mp, "%s: dquot %p flush failed",
1475 __func__, dqp); 1489 __func__, dqp);
1490 goto out_busy;
1476 } 1491 }
1477 1492
1493 xfs_buf_delwri_queue(bp, buffer_list);
1494 xfs_buf_relse(bp);
1478 /* 1495 /*
1479 * Give the dquot another try on the freelist, as the 1496 * Give the dquot another try on the freelist, as the
1480 * flushing will take some time. 1497 * flushing will take some time.
@@ -1518,8 +1535,10 @@ xfs_qm_shake(
1518 struct xfs_quotainfo *qi = 1535 struct xfs_quotainfo *qi =
1519 container_of(shrink, struct xfs_quotainfo, qi_shrinker); 1536 container_of(shrink, struct xfs_quotainfo, qi_shrinker);
1520 int nr_to_scan = sc->nr_to_scan; 1537 int nr_to_scan = sc->nr_to_scan;
1538 LIST_HEAD (buffer_list);
1521 LIST_HEAD (dispose_list); 1539 LIST_HEAD (dispose_list);
1522 struct xfs_dquot *dqp; 1540 struct xfs_dquot *dqp;
1541 int error;
1523 1542
1524 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) 1543 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
1525 return 0; 1544 return 0;
@@ -1532,15 +1551,20 @@ xfs_qm_shake(
1532 break; 1551 break;
1533 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, 1552 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
1534 q_lru); 1553 q_lru);
1535 xfs_qm_dqreclaim_one(dqp, &dispose_list); 1554 xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
1536 } 1555 }
1537 mutex_unlock(&qi->qi_lru_lock); 1556 mutex_unlock(&qi->qi_lru_lock);
1538 1557
1558 error = xfs_buf_delwri_submit(&buffer_list);
1559 if (error)
1560 xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
1561
1539 while (!list_empty(&dispose_list)) { 1562 while (!list_empty(&dispose_list)) {
1540 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); 1563 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
1541 list_del_init(&dqp->q_lru); 1564 list_del_init(&dqp->q_lru);
1542 xfs_qm_dqfree_one(dqp); 1565 xfs_qm_dqfree_one(dqp);
1543 } 1566 }
1567
1544out: 1568out:
1545 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; 1569 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
1546} 1570}
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index e6986b5d80d8..6b39115bf145 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -17,9 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h" 20#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 21#include "xfs_trans.h"
24#include "xfs_sb.h" 22#include "xfs_sb.h"
25#include "xfs_ag.h" 23#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index c4f396e437a8..858a3b186110 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -22,7 +22,6 @@
22#include "xfs_fs.h" 22#include "xfs_fs.h"
23#include "xfs_bit.h" 23#include "xfs_bit.h"
24#include "xfs_log.h" 24#include "xfs_log.h"
25#include "xfs_inum.h"
26#include "xfs_trans.h" 25#include "xfs_trans.h"
27#include "xfs_sb.h" 26#include "xfs_sb.h"
28#include "xfs_ag.h" 27#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7e76f537abb7..fed504fc2999 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -17,7 +17,6 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_sb.h" 19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h" 20#include "xfs_log.h"
22#include "xfs_ag.h" 21#include "xfs_ag.h"
23#include "xfs_mount.h" 22#include "xfs_mount.h"
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index e44ef7ee8ce8..30ff5f401d28 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca4f31534a0a..92d4331cd4f1 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -20,7 +20,6 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -34,7 +33,6 @@
34#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
35#include "xfs_fsops.h" 34#include "xfs_fsops.h"
36#include "xfs_error.h" 35#include "xfs_error.h"
37#include "xfs_rw.h"
38#include "xfs_inode_item.h" 36#include "xfs_inode_item.h"
39#include "xfs_trans_space.h" 37#include "xfs_trans_space.h"
40#include "xfs_utils.h" 38#include "xfs_utils.h"
@@ -1872,9 +1870,9 @@ xfs_growfs_rt(
1872 /* 1870 /*
1873 * Read in the last block of the device, make sure it exists. 1871 * Read in the last block of the device, make sure it exists.
1874 */ 1872 */
1875 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, 1873 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
1876 XFS_FSB_TO_BB(mp, nrblocks - 1), 1874 XFS_FSB_TO_BB(mp, nrblocks - 1),
1877 XFS_FSB_TO_B(mp, 1), 0); 1875 XFS_FSB_TO_BB(mp, 1), 0);
1878 if (!bp) 1876 if (!bp)
1879 return EIO; 1877 return EIO;
1880 xfs_buf_relse(bp); 1878 xfs_buf_relse(bp);
@@ -2219,9 +2217,9 @@ xfs_rtmount_init(
2219 (unsigned long long) mp->m_sb.sb_rblocks); 2217 (unsigned long long) mp->m_sb.sb_rblocks);
2220 return XFS_ERROR(EFBIG); 2218 return XFS_ERROR(EFBIG);
2221 } 2219 }
2222 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, 2220 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
2223 d - XFS_FSB_TO_BB(mp, 1), 2221 d - XFS_FSB_TO_BB(mp, 1),
2224 XFS_FSB_TO_B(mp, 1), 0); 2222 XFS_FSB_TO_BB(mp, 1), 0);
2225 if (!bp) { 2223 if (!bp) {
2226 xfs_warn(mp, "realtime device size check failed"); 2224 xfs_warn(mp, "realtime device size check failed");
2227 return EIO; 2225 return EIO;
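Both xfs_rtalloc.c call sites above follow the new xfs_buf_read_uncached() calling convention in this series: the xfs_mount argument is dropped, and the switch from XFS_FSB_TO_B to XFS_FSB_TO_BB for the length argument indicates the length is now given in basic blocks rather than bytes. As an illustration only, the realtime size check now reads:

	/* illustrative sketch -- condensed from the hunk above */
	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
				   d - XFS_FSB_TO_BB(mp, 1),	/* last block */
				   XFS_FSB_TO_BB(mp, 1), 0);	/* length in BBs */
	if (!bp) {
		xfs_warn(mp, "realtime device size check failed");
		return EIO;
	}
	xfs_buf_relse(bp);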
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
deleted file mode 100644
index 597d044a09a1..000000000000
--- a/fs/xfs/xfs_rw.c
+++ /dev/null
@@ -1,156 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_dinode.h"
30#include "xfs_inode.h"
31#include "xfs_error.h"
32#include "xfs_rw.h"
33
34/*
35 * Force a shutdown of the filesystem instantly while keeping
36 * the filesystem consistent. We don't do an unmount here; just shutdown
37 * the shop, make sure that absolutely nothing persistent happens to
38 * this filesystem after this point.
39 */
40void
41xfs_do_force_shutdown(
42 xfs_mount_t *mp,
43 int flags,
44 char *fname,
45 int lnnum)
46{
47 int logerror;
48
49 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
50
51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
52 xfs_notice(mp,
53 "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
54 __func__, flags, lnnum, fname, __return_address);
55 }
56 /*
57 * No need to duplicate efforts.
58 */
59 if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
60 return;
61
62 /*
63 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
64 * queue up anybody new on the log reservations, and wakes up
65 * everybody who's sleeping on log reservations to tell them
66 * the bad news.
67 */
68 if (xfs_log_force_umount(mp, logerror))
69 return;
70
71 if (flags & SHUTDOWN_CORRUPT_INCORE) {
72 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
73 "Corruption of in-memory data detected. Shutting down filesystem");
74 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
75 xfs_stack_trace();
76 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
77 if (logerror) {
78 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
79 "Log I/O Error Detected. Shutting down filesystem");
80 } else if (flags & SHUTDOWN_DEVICE_REQ) {
81 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
82 "All device paths lost. Shutting down filesystem");
83 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
84 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
85 "I/O Error Detected. Shutting down filesystem");
86 }
87 }
88 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
89 xfs_alert(mp,
90 "Please umount the filesystem and rectify the problem(s)");
91 }
92}
93
94/*
95 * This isn't an absolute requirement, but it is
96 * just a good idea to call xfs_read_buf instead of
97 * directly doing a read_buf call. For one, we shouldn't
98 * be doing this disk read if we are in SHUTDOWN state anyway,
99 * so this stops that from happening. Secondly, this does all
100 * the error checking stuff and the brelse if appropriate for
101 * the caller, so the code can be a little leaner.
102 */
103
104int
105xfs_read_buf(
106 struct xfs_mount *mp,
107 xfs_buftarg_t *target,
108 xfs_daddr_t blkno,
109 int len,
110 uint flags,
111 xfs_buf_t **bpp)
112{
113 xfs_buf_t *bp;
114 int error;
115
116 if (!flags)
117 flags = XBF_LOCK | XBF_MAPPED;
118
119 bp = xfs_buf_read(target, blkno, len, flags);
120 if (!bp)
121 return XFS_ERROR(EIO);
122 error = bp->b_error;
123 if (!error && !XFS_FORCED_SHUTDOWN(mp)) {
124 *bpp = bp;
125 } else {
126 *bpp = NULL;
127 if (error) {
128 xfs_buf_ioerror_alert(bp, __func__);
129 } else {
130 error = XFS_ERROR(EIO);
131 }
132 if (bp) {
133 XFS_BUF_UNDONE(bp);
134 xfs_buf_stale(bp);
135 /*
136 * brelse clears B_ERROR and b_error
137 */
138 xfs_buf_relse(bp);
139 }
140 }
141 return (error);
142}
143
144/*
145 * helper function to extract extent size hint from inode
146 */
147xfs_extlen_t
148xfs_get_extsz_hint(
149 struct xfs_inode *ip)
150{
151 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
152 return ip->i_d.di_extsize;
153 if (XFS_IS_REALTIME_INODE(ip))
154 return ip->i_mount->m_sb.sb_rextsize;
155 return 0;
156}
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
deleted file mode 100644
index bbdb9ad6a4ba..000000000000
--- a/fs/xfs/xfs_rw.h
+++ /dev/null
@@ -1,47 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_RW_H__
19#define __XFS_RW_H__
20
21struct xfs_buf;
22struct xfs_inode;
23struct xfs_mount;
24
25/*
26 * Convert the given file system block to a disk block.
27 * We have to treat it differently based on whether the
28 * file is a real time file or not, because the bmap code
29 * does.
30 */
31static inline xfs_daddr_t
32xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
33{
34 return (XFS_IS_REALTIME_INODE(ip) ? \
35 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
36 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
37}
38
39/*
40 * Prototypes for functions in xfs_rw.c.
41 */
42extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
43 xfs_daddr_t blkno, int len, uint flags,
44 struct xfs_buf **bpp);
45extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
46
47#endif /* __XFS_RW_H__ */
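The two deleted files remove the xfs_read_buf() convenience wrapper along with the other helpers declared in xfs_rw.h. For reference, the error-handling pattern the wrapper implemented, and which a former caller now has to provide itself, condenses to the following sketch (not a function added by this patch):

	/* illustrative sketch -- condensed from the deleted xfs_read_buf() */
	bp = xfs_buf_read(target, blkno, len, flags);
	if (!bp)
		return XFS_ERROR(EIO);
	error = bp->b_error;
	if (error || XFS_FORCED_SHUTDOWN(mp)) {
		if (error)
			xfs_buf_ioerror_alert(bp, __func__);
		else
			error = XFS_ERROR(EIO);
		xfs_buf_stale(bp);
		xfs_buf_relse(bp);	/* releasing the buffer clears b_error */
		return error;
	}
	*bpp = bp;
	return 0;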
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index dab9a5f6dfd6..2fcfd5b0b046 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -17,7 +17,6 @@
17 */ 17 */
18 18
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h" 20#include "xfs_log.h"
22#include "xfs_inum.h" 21#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
@@ -622,7 +621,7 @@ void
622xfs_blkdev_issue_flush( 621xfs_blkdev_issue_flush(
623 xfs_buftarg_t *buftarg) 622 xfs_buftarg_t *buftarg)
624{ 623{
625 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL); 624 blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL);
626} 625}
627 626
628STATIC void 627STATIC void
@@ -773,8 +772,14 @@ xfs_init_mount_workqueues(
773 if (!mp->m_unwritten_workqueue) 772 if (!mp->m_unwritten_workqueue)
774 goto out_destroy_data_iodone_queue; 773 goto out_destroy_data_iodone_queue;
775 774
775 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
776 WQ_MEM_RECLAIM, 0, mp->m_fsname);
777 if (!mp->m_cil_workqueue)
778 goto out_destroy_unwritten;
776 return 0; 779 return 0;
777 780
781out_destroy_unwritten:
782 destroy_workqueue(mp->m_unwritten_workqueue);
778out_destroy_data_iodone_queue: 783out_destroy_data_iodone_queue:
779 destroy_workqueue(mp->m_data_workqueue); 784 destroy_workqueue(mp->m_data_workqueue);
780out: 785out:
@@ -785,6 +790,7 @@ STATIC void
785xfs_destroy_mount_workqueues( 790xfs_destroy_mount_workqueues(
786 struct xfs_mount *mp) 791 struct xfs_mount *mp)
787{ 792{
793 destroy_workqueue(mp->m_cil_workqueue);
788 destroy_workqueue(mp->m_data_workqueue); 794 destroy_workqueue(mp->m_data_workqueue);
789 destroy_workqueue(mp->m_unwritten_workqueue); 795 destroy_workqueue(mp->m_unwritten_workqueue);
790} 796}
@@ -981,18 +987,9 @@ xfs_fs_put_super(
981{ 987{
982 struct xfs_mount *mp = XFS_M(sb); 988 struct xfs_mount *mp = XFS_M(sb);
983 989
984 xfs_syncd_stop(mp);
985
986 /*
987 * Blow away any referenced inode in the filestreams cache.
988 * This can and will cause log traffic as inodes go inactive
989 * here.
990 */
991 xfs_filestream_unmount(mp); 990 xfs_filestream_unmount(mp);
992
993 xfs_flush_buftarg(mp->m_ddev_targp, 1);
994
995 xfs_unmountfs(mp); 991 xfs_unmountfs(mp);
992 xfs_syncd_stop(mp);
996 xfs_freesb(mp); 993 xfs_freesb(mp);
997 xfs_icsb_destroy_counters(mp); 994 xfs_icsb_destroy_counters(mp);
998 xfs_destroy_mount_workqueues(mp); 995 xfs_destroy_mount_workqueues(mp);
@@ -1072,7 +1069,7 @@ xfs_fs_statfs(
1072 1069
1073 spin_unlock(&mp->m_sb_lock); 1070 spin_unlock(&mp->m_sb_lock);
1074 1071
1075 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || 1072 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1076 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == 1073 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
1077 (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) 1074 (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
1078 xfs_qm_statvfs(ip, statp); 1075 xfs_qm_statvfs(ip, statp);
@@ -1362,31 +1359,32 @@ xfs_fs_fill_super(
1362 sb->s_time_gran = 1; 1359 sb->s_time_gran = 1;
1363 set_posix_acl_flag(sb); 1360 set_posix_acl_flag(sb);
1364 1361
1365 error = xfs_mountfs(mp); 1362 error = xfs_syncd_init(mp);
1366 if (error) 1363 if (error)
1367 goto out_filestream_unmount; 1364 goto out_filestream_unmount;
1368 1365
1369 error = xfs_syncd_init(mp); 1366 error = xfs_mountfs(mp);
1370 if (error) 1367 if (error)
1371 goto out_unmount; 1368 goto out_syncd_stop;
1372 1369
1373 root = igrab(VFS_I(mp->m_rootip)); 1370 root = igrab(VFS_I(mp->m_rootip));
1374 if (!root) { 1371 if (!root) {
1375 error = ENOENT; 1372 error = ENOENT;
1376 goto out_syncd_stop; 1373 goto out_unmount;
1377 } 1374 }
1378 if (is_bad_inode(root)) { 1375 if (is_bad_inode(root)) {
1379 error = EINVAL; 1376 error = EINVAL;
1380 goto out_syncd_stop; 1377 goto out_unmount;
1381 } 1378 }
1382 sb->s_root = d_make_root(root); 1379 sb->s_root = d_make_root(root);
1383 if (!sb->s_root) { 1380 if (!sb->s_root) {
1384 error = ENOMEM; 1381 error = ENOMEM;
1385 goto out_syncd_stop; 1382 goto out_unmount;
1386 } 1383 }
1387 1384
1388 return 0; 1385 return 0;
1389 1386 out_syncd_stop:
1387 xfs_syncd_stop(mp);
1390 out_filestream_unmount: 1388 out_filestream_unmount:
1391 xfs_filestream_unmount(mp); 1389 xfs_filestream_unmount(mp);
1392 out_free_sb: 1390 out_free_sb:
@@ -1403,19 +1401,10 @@ out_destroy_workqueues:
1403 out: 1401 out:
1404 return -error; 1402 return -error;
1405 1403
1406 out_syncd_stop:
1407 xfs_syncd_stop(mp);
1408 out_unmount: 1404 out_unmount:
1409 /*
1410 * Blow away any referenced inode in the filestreams cache.
1411 * This can and will cause log traffic as inodes go inactive
1412 * here.
1413 */
1414 xfs_filestream_unmount(mp); 1405 xfs_filestream_unmount(mp);
1415
1416 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1417
1418 xfs_unmountfs(mp); 1406 xfs_unmountfs(mp);
1407 xfs_syncd_stop(mp);
1419 goto out_free_sb; 1408 goto out_free_sb;
1420} 1409}
1421 1410
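The xfs_super.c hunks also reorder mount and unmount so that the syncd workqueues are initialised before xfs_mountfs() (making the sync/reclaim workers available while the mount, including quotacheck, runs) and are stopped only after xfs_unmountfs(); the explicit xfs_flush_buftarg() calls disappear. In outline, condensed from the hunks above and not itself part of the patch:

	/* xfs_fs_fill_super(): bring up syncd before mounting */
	error = xfs_syncd_init(mp);
	if (error)
		goto out_filestream_unmount;
	error = xfs_mountfs(mp);
	if (error)
		goto out_syncd_stop;		/* unwind in reverse order */

	/* xfs_fs_put_super(): unmount first, stop syncd last */
	xfs_filestream_unmount(mp);
	xfs_unmountfs(mp);
	xfs_syncd_stop(mp);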
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 205ebcb34d9e..c9d3409c5ca3 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -18,7 +18,6 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h" 22#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
@@ -241,45 +240,6 @@ xfs_sync_inode_data(
241 return error; 240 return error;
242} 241}
243 242
244STATIC int
245xfs_sync_inode_attr(
246 struct xfs_inode *ip,
247 struct xfs_perag *pag,
248 int flags)
249{
250 int error = 0;
251
252 xfs_ilock(ip, XFS_ILOCK_SHARED);
253 if (xfs_inode_clean(ip))
254 goto out_unlock;
255 if (!xfs_iflock_nowait(ip)) {
256 if (!(flags & SYNC_WAIT))
257 goto out_unlock;
258 xfs_iflock(ip);
259 }
260
261 if (xfs_inode_clean(ip)) {
262 xfs_ifunlock(ip);
263 goto out_unlock;
264 }
265
266 error = xfs_iflush(ip, flags);
267
268 /*
269 * We don't want to try again on non-blocking flushes that can't run
270 * again immediately. If an inode really must be written, then that's
271 * what the SYNC_WAIT flag is for.
272 */
273 if (error == EAGAIN) {
274 ASSERT(!(flags & SYNC_WAIT));
275 error = 0;
276 }
277
278 out_unlock:
279 xfs_iunlock(ip, XFS_ILOCK_SHARED);
280 return error;
281}
282
283/* 243/*
284 * Write out pagecache data for the whole filesystem. 244 * Write out pagecache data for the whole filesystem.
285 */ 245 */
@@ -300,19 +260,6 @@ xfs_sync_data(
300 return 0; 260 return 0;
301} 261}
302 262
303/*
304 * Write out inode metadata (attributes) for the whole filesystem.
305 */
306STATIC int
307xfs_sync_attr(
308 struct xfs_mount *mp,
309 int flags)
310{
311 ASSERT((flags & ~SYNC_WAIT) == 0);
312
313 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
314}
315
316STATIC int 263STATIC int
317xfs_sync_fsdata( 264xfs_sync_fsdata(
318 struct xfs_mount *mp) 265 struct xfs_mount *mp)
@@ -350,7 +297,7 @@ xfs_sync_fsdata(
350 * First stage of freeze - no writers will make progress now we are here, 297 * First stage of freeze - no writers will make progress now we are here,
351 * so we flush delwri and delalloc buffers here, then wait for all I/O to 298 * so we flush delwri and delalloc buffers here, then wait for all I/O to
352 * complete. Data is frozen at that point. Metadata is not frozen, 299 * complete. Data is frozen at that point. Metadata is not frozen,
353 * transactions can still occur here so don't bother flushing the buftarg 300 * transactions can still occur here so don't bother emptying the AIL
354 * because it'll just get dirty again. 301 * because it'll just get dirty again.
355 */ 302 */
356int 303int
@@ -365,47 +312,13 @@ xfs_quiesce_data(
365 /* write superblock and hoover up shutdown errors */ 312 /* write superblock and hoover up shutdown errors */
366 error = xfs_sync_fsdata(mp); 313 error = xfs_sync_fsdata(mp);
367 314
368 /* make sure all delwri buffers are written out */
369 xfs_flush_buftarg(mp->m_ddev_targp, 1);
370
371 /* mark the log as covered if needed */ 315 /* mark the log as covered if needed */
372 if (xfs_log_need_covered(mp)) 316 if (xfs_log_need_covered(mp))
373 error2 = xfs_fs_log_dummy(mp); 317 error2 = xfs_fs_log_dummy(mp);
374 318
375 /* flush data-only devices */
376 if (mp->m_rtdev_targp)
377 xfs_flush_buftarg(mp->m_rtdev_targp, 1);
378
379 return error ? error : error2; 319 return error ? error : error2;
380} 320}
381 321
382STATIC void
383xfs_quiesce_fs(
384 struct xfs_mount *mp)
385{
386 int count = 0, pincount;
387
388 xfs_reclaim_inodes(mp, 0);
389 xfs_flush_buftarg(mp->m_ddev_targp, 0);
390
391 /*
392 * This loop must run at least twice. The first instance of the loop
393 * will flush most meta data but that will generate more meta data
394 * (typically directory updates). Which then must be flushed and
395 * logged before we can write the unmount record. We also so sync
396 * reclaim of inodes to catch any that the above delwri flush skipped.
397 */
398 do {
399 xfs_reclaim_inodes(mp, SYNC_WAIT);
400 xfs_sync_attr(mp, SYNC_WAIT);
401 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
402 if (!pincount) {
403 delay(50);
404 count++;
405 }
406 } while (count < 2);
407}
408
409/* 322/*
410 * Second stage of a quiesce. The data is already synced, now we have to take 323 * Second stage of a quiesce. The data is already synced, now we have to take
411 * care of the metadata. New transactions are already blocked, so we need to 324 * care of the metadata. New transactions are already blocked, so we need to
@@ -421,8 +334,12 @@ xfs_quiesce_attr(
421 while (atomic_read(&mp->m_active_trans) > 0) 334 while (atomic_read(&mp->m_active_trans) > 0)
422 delay(100); 335 delay(100);
423 336
424 /* flush inodes and push all remaining buffers out to disk */ 337 /* reclaim inodes to do any IO before the freeze completes */
425 xfs_quiesce_fs(mp); 338 xfs_reclaim_inodes(mp, 0);
339 xfs_reclaim_inodes(mp, SYNC_WAIT);
340
341 /* flush all pending changes from the AIL */
342 xfs_ail_push_all_sync(mp->m_ail);
426 343
427 /* 344 /*
428 * Just warn here till VFS can correctly support 345 * Just warn here till VFS can correctly support
@@ -436,7 +353,12 @@ xfs_quiesce_attr(
436 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " 353 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
437 "Frozen image may not be consistent."); 354 "Frozen image may not be consistent.");
438 xfs_log_unmount_write(mp); 355 xfs_log_unmount_write(mp);
439 xfs_unmountfs_writesb(mp); 356
357 /*
358 * At this point we might have modified the superblock again and thus
359 * added an item to the AIL, thus flush it again.
360 */
361 xfs_ail_push_all_sync(mp->m_ail);
440} 362}
441 363
442static void 364static void
@@ -460,16 +382,27 @@ xfs_sync_worker(
460 struct xfs_mount, m_sync_work); 382 struct xfs_mount, m_sync_work);
461 int error; 383 int error;
462 384
463 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 385 /*
464 /* dgc: errors ignored here */ 386 * We shouldn't write/force the log if we are in the mount/unmount
465 if (mp->m_super->s_frozen == SB_UNFROZEN && 387 * process or on a read only filesystem. The workqueue still needs to be
466 xfs_log_need_covered(mp)) 388 * active in both cases, however, because it is used for inode reclaim
467 error = xfs_fs_log_dummy(mp); 389 * during these times. Use the s_umount semaphore to provide exclusion
468 else 390 * with unmount.
469 xfs_log_force(mp, 0); 391 */
470 392 if (down_read_trylock(&mp->m_super->s_umount)) {
471 /* start pushing all the metadata that is currently dirty */ 393 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
472 xfs_ail_push_all(mp->m_ail); 394 /* dgc: errors ignored here */
395 if (mp->m_super->s_frozen == SB_UNFROZEN &&
396 xfs_log_need_covered(mp))
397 error = xfs_fs_log_dummy(mp);
398 else
399 xfs_log_force(mp, 0);
400
401 /* start pushing all the metadata that is currently
402 * dirty */
403 xfs_ail_push_all(mp->m_ail);
404 }
405 up_read(&mp->m_super->s_umount);
473 } 406 }
474 407
475 /* queue us up again */ 408 /* queue us up again */
@@ -488,14 +421,6 @@ xfs_syncd_queue_reclaim(
488 struct xfs_mount *mp) 421 struct xfs_mount *mp)
489{ 422{
490 423
491 /*
492 * We can have inodes enter reclaim after we've shut down the syncd
493 * workqueue during unmount, so don't allow reclaim work to be queued
494 * during unmount.
495 */
496 if (!(mp->m_super->s_flags & MS_ACTIVE))
497 return;
498
499 rcu_read_lock(); 424 rcu_read_lock();
500 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 425 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
501 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, 426 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
@@ -564,7 +489,6 @@ xfs_syncd_init(
564 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); 489 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
565 490
566 xfs_syncd_queue_sync(mp); 491 xfs_syncd_queue_sync(mp);
567 xfs_syncd_queue_reclaim(mp);
568 492
569 return 0; 493 return 0;
570} 494}
@@ -702,11 +626,8 @@ xfs_reclaim_inode_grab(
702} 626}
703 627
704/* 628/*
705 * Inodes in different states need to be treated differently, and the return 629 * Inodes in different states need to be treated differently. The following
706 * value of xfs_iflush is not sufficient to get this right. The following table 630 * table lists the inode states and the reclaim actions necessary:
707 * lists the inode states and the reclaim actions necessary for non-blocking
708 * reclaim:
709 *
710 * 631 *
711 * inode state iflush ret required action 632 * inode state iflush ret required action
712 * --------------- ---------- --------------- 633 * --------------- ---------- ---------------
@@ -716,39 +637,31 @@ xfs_reclaim_inode_grab(
716 * stale, unpinned 0 reclaim 637 * stale, unpinned 0 reclaim
717 * clean, pinned(*) 0 requeue 638 * clean, pinned(*) 0 requeue
718 * stale, pinned EAGAIN requeue 639 * stale, pinned EAGAIN requeue
719 * dirty, delwri ok 0 requeue 640 * dirty, async - requeue
720 * dirty, delwri blocked EAGAIN requeue 641 * dirty, sync 0 reclaim
721 * dirty, sync flush 0 reclaim
722 * 642 *
723 * (*) dgc: I don't think the clean, pinned state is possible but it gets 643 * (*) dgc: I don't think the clean, pinned state is possible but it gets
724 * handled anyway given the order of checks implemented. 644 * handled anyway given the order of checks implemented.
725 * 645 *
726 * As can be seen from the table, the return value of xfs_iflush() is not
727 * sufficient to correctly decide the reclaim action here. The checks in
728 * xfs_iflush() might look like duplicates, but they are not.
729 *
730 * Also, because we get the flush lock first, we know that any inode that has 646 * Also, because we get the flush lock first, we know that any inode that has
731 * been flushed delwri has had the flush completed by the time we check that 647 * been flushed delwri has had the flush completed by the time we check that
732 * the inode is clean. The clean inode check needs to be done before flushing 648 * the inode is clean.
733 * the inode delwri otherwise we would loop forever requeuing clean inodes as
734 * we cannot tell apart a successful delwri flush and a clean inode from the
735 * return value of xfs_iflush().
736 * 649 *
737 * Note that because the inode is flushed delayed write by background 650 * Note that because the inode is flushed delayed write by AIL pushing, the
738 * writeback, the flush lock may already be held here and waiting on it can 651 * flush lock may already be held here and waiting on it can result in very
739 * result in very long latencies. Hence for sync reclaims, where we wait on the 652 * long latencies. Hence for sync reclaims, where we wait on the flush lock,
740 * flush lock, the caller should push out delayed write inodes first before 653 * the caller should push the AIL first before trying to reclaim inodes to
 741 * trying to reclaim them to minimise the amount of time spent waiting. For 654 * minimise the amount of time spent waiting. For background reclaim, we only
742 * background relaim, we just requeue the inode for the next pass. 655 * bother to reclaim clean inodes anyway.
743 * 656 *
744 * Hence the order of actions after gaining the locks should be: 657 * Hence the order of actions after gaining the locks should be:
745 * bad => reclaim 658 * bad => reclaim
746 * shutdown => unpin and reclaim 659 * shutdown => unpin and reclaim
747 * pinned, delwri => requeue 660 * pinned, async => requeue
748 * pinned, sync => unpin 661 * pinned, sync => unpin
749 * stale => reclaim 662 * stale => reclaim
750 * clean => reclaim 663 * clean => reclaim
751 * dirty, delwri => flush and requeue 664 * dirty, async => requeue
752 * dirty, sync => flush, wait and reclaim 665 * dirty, sync => flush, wait and reclaim
753 */ 666 */
754STATIC int 667STATIC int
@@ -757,7 +670,8 @@ xfs_reclaim_inode(
757 struct xfs_perag *pag, 670 struct xfs_perag *pag,
758 int sync_mode) 671 int sync_mode)
759{ 672{
760 int error; 673 struct xfs_buf *bp = NULL;
674 int error;
761 675
762restart: 676restart:
763 error = 0; 677 error = 0;
@@ -765,17 +679,6 @@ restart:
765 if (!xfs_iflock_nowait(ip)) { 679 if (!xfs_iflock_nowait(ip)) {
766 if (!(sync_mode & SYNC_WAIT)) 680 if (!(sync_mode & SYNC_WAIT))
767 goto out; 681 goto out;
768
769 /*
770 * If we only have a single dirty inode in a cluster there is
771 * a fair chance that the AIL push may have pushed it into
772 * the buffer, but xfsbufd won't touch it until 30 seconds
773 * from now, and thus we will lock up here.
774 *
775 * Promote the inode buffer to the front of the delwri list
776 * and wake up xfsbufd now.
777 */
778 xfs_promote_inode(ip);
779 xfs_iflock(ip); 682 xfs_iflock(ip);
780 } 683 }
781 684
@@ -783,13 +686,12 @@ restart:
783 goto reclaim; 686 goto reclaim;
784 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 687 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
785 xfs_iunpin_wait(ip); 688 xfs_iunpin_wait(ip);
689 xfs_iflush_abort(ip, false);
786 goto reclaim; 690 goto reclaim;
787 } 691 }
788 if (xfs_ipincount(ip)) { 692 if (xfs_ipincount(ip)) {
789 if (!(sync_mode & SYNC_WAIT)) { 693 if (!(sync_mode & SYNC_WAIT))
790 xfs_ifunlock(ip); 694 goto out_ifunlock;
791 goto out;
792 }
793 xfs_iunpin_wait(ip); 695 xfs_iunpin_wait(ip);
794 } 696 }
795 if (xfs_iflags_test(ip, XFS_ISTALE)) 697 if (xfs_iflags_test(ip, XFS_ISTALE))
@@ -798,60 +700,42 @@ restart:
798 goto reclaim; 700 goto reclaim;
799 701
800 /* 702 /*
703 * Never flush out dirty data during non-blocking reclaim, as it would
704 * just contend with AIL pushing trying to do the same job.
705 */
706 if (!(sync_mode & SYNC_WAIT))
707 goto out_ifunlock;
708
709 /*
801 * Now we have an inode that needs flushing. 710 * Now we have an inode that needs flushing.
802 * 711 *
803 * We do a nonblocking flush here even if we are doing a SYNC_WAIT 712 * Note that xfs_iflush will never block on the inode buffer lock, as
804 * reclaim as we can deadlock with inode cluster removal.
805 * xfs_ifree_cluster() can lock the inode buffer before it locks the 713 * xfs_ifree_cluster() can lock the inode buffer before it locks the
806 * ip->i_lock, and we are doing the exact opposite here. As a result, 714 * ip->i_lock, and we are doing the exact opposite here. As a result,
807 * doing a blocking xfs_itobp() to get the cluster buffer will result 715 * doing a blocking xfs_itobp() to get the cluster buffer would result
808 * in an ABBA deadlock with xfs_ifree_cluster(). 716 * in an ABBA deadlock with xfs_ifree_cluster().
809 * 717 *
 810 * As xfs_ifree_cluster() must gather all inodes that are active in the 718 * As xfs_ifree_cluster() must gather all inodes that are active in the
811 * cache to mark them stale, if we hit this case we don't actually want 719 * cache to mark them stale, if we hit this case we don't actually want
812 * to do IO here - we want the inode marked stale so we can simply 720 * to do IO here - we want the inode marked stale so we can simply
813 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, 721 * reclaim it. Hence if we get an EAGAIN error here, just unlock the
814 * just unlock the inode, back off and try again. Hopefully the next 722 * inode, back off and try again. Hopefully the next pass through will
815 * pass through will see the stale flag set on the inode. 723 * see the stale flag set on the inode.
816 */ 724 */
817 error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); 725 error = xfs_iflush(ip, &bp);
818 if (sync_mode & SYNC_WAIT) { 726 if (error == EAGAIN) {
819 if (error == EAGAIN) { 727 xfs_iunlock(ip, XFS_ILOCK_EXCL);
820 xfs_iunlock(ip, XFS_ILOCK_EXCL); 728 /* backoff longer than in xfs_ifree_cluster */
821 /* backoff longer than in xfs_ifree_cluster */ 729 delay(2);
822 delay(2); 730 goto restart;
823 goto restart;
824 }
825 xfs_iflock(ip);
826 goto reclaim;
827 } 731 }
828 732
829 /* 733 if (!error) {
830 * When we have to flush an inode but don't have SYNC_WAIT set, we 734 error = xfs_bwrite(bp);
831 * flush the inode out using a delwri buffer and wait for the next 735 xfs_buf_relse(bp);
832 * call into reclaim to find it in a clean state instead of waiting for
833 * it now. We also don't return errors here - if the error is transient
834 * then the next reclaim pass will flush the inode, and if the error
835 * is permanent then the next sync reclaim will reclaim the inode and
836 * pass on the error.
837 */
838 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
839 xfs_warn(ip->i_mount,
840 "inode 0x%llx background reclaim flush failed with %d",
841 (long long)ip->i_ino, error);
842 } 736 }
843out:
844 xfs_iflags_clear(ip, XFS_IRECLAIM);
845 xfs_iunlock(ip, XFS_ILOCK_EXCL);
846 /*
847 * We could return EAGAIN here to make reclaim rescan the inode tree in
848 * a short while. However, this just burns CPU time scanning the tree
849 * waiting for IO to complete and xfssyncd never goes back to the idle
850 * state. Instead, return 0 to let the next scheduled background reclaim
851 * attempt to reclaim the inode again.
852 */
853 return 0;
854 737
738 xfs_iflock(ip);
855reclaim: 739reclaim:
856 xfs_ifunlock(ip); 740 xfs_ifunlock(ip);
857 xfs_iunlock(ip, XFS_ILOCK_EXCL); 741 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -884,8 +768,21 @@ reclaim:
884 xfs_iunlock(ip, XFS_ILOCK_EXCL); 768 xfs_iunlock(ip, XFS_ILOCK_EXCL);
885 769
886 xfs_inode_free(ip); 770 xfs_inode_free(ip);
887
888 return error; 771 return error;
772
773out_ifunlock:
774 xfs_ifunlock(ip);
775out:
776 xfs_iflags_clear(ip, XFS_IRECLAIM);
777 xfs_iunlock(ip, XFS_ILOCK_EXCL);
778 /*
779 * We could return EAGAIN here to make reclaim rescan the inode tree in
780 * a short while. However, this just burns CPU time scanning the tree
781 * waiting for IO to complete and xfssyncd never goes back to the idle
782 * state. Instead, return 0 to let the next scheduled background reclaim
783 * attempt to reclaim the inode again.
784 */
785 return 0;
889} 786}
890 787
891/* 788/*
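In the xfs_sync.c changes above, synchronous inode reclaim no longer issues a delayed-write flush and waits for xfsbufd; xfs_iflush() hands back the locked cluster buffer and reclaim writes it directly with xfs_bwrite(), while non-blocking reclaim simply skips dirty inodes. The dirty, SYNC_WAIT path reduces to the following sketch (error paths and unlocking trimmed, not part of the patch):

	/* illustrative sketch -- condensed from the new xfs_reclaim_inode() */
	struct xfs_buf	*bp = NULL;

	error = xfs_iflush(ip, &bp);		/* flush inode into cluster buffer */
	if (error == EAGAIN) {
		/* racing with xfs_ifree_cluster(): back off and retry */
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		delay(2);
		goto restart;
	}
	if (!error) {
		error = xfs_bwrite(bp);		/* synchronous buffer write */
		xfs_buf_relse(bp);
	}
	xfs_iflock(ip);				/* retake flush lock, then reclaim */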
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 9010ce885e6a..624bedd81357 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -18,9 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_sb.h" 23#include "xfs_sb.h"
26#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 06838c42b2a0..7cf9d3529e51 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -281,7 +281,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
281 TP_STRUCT__entry( 281 TP_STRUCT__entry(
282 __field(dev_t, dev) 282 __field(dev_t, dev)
283 __field(xfs_daddr_t, bno) 283 __field(xfs_daddr_t, bno)
284 __field(size_t, buffer_length) 284 __field(int, nblks)
285 __field(int, hold) 285 __field(int, hold)
286 __field(int, pincount) 286 __field(int, pincount)
287 __field(unsigned, lockval) 287 __field(unsigned, lockval)
@@ -291,18 +291,18 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
291 TP_fast_assign( 291 TP_fast_assign(
292 __entry->dev = bp->b_target->bt_dev; 292 __entry->dev = bp->b_target->bt_dev;
293 __entry->bno = bp->b_bn; 293 __entry->bno = bp->b_bn;
294 __entry->buffer_length = bp->b_buffer_length; 294 __entry->nblks = bp->b_length;
295 __entry->hold = atomic_read(&bp->b_hold); 295 __entry->hold = atomic_read(&bp->b_hold);
296 __entry->pincount = atomic_read(&bp->b_pin_count); 296 __entry->pincount = atomic_read(&bp->b_pin_count);
297 __entry->lockval = bp->b_sema.count; 297 __entry->lockval = bp->b_sema.count;
298 __entry->flags = bp->b_flags; 298 __entry->flags = bp->b_flags;
299 __entry->caller_ip = caller_ip; 299 __entry->caller_ip = caller_ip;
300 ), 300 ),
301 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " 301 TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
302 "lock %d flags %s caller %pf", 302 "lock %d flags %s caller %pf",
303 MAJOR(__entry->dev), MINOR(__entry->dev), 303 MAJOR(__entry->dev), MINOR(__entry->dev),
304 (unsigned long long)__entry->bno, 304 (unsigned long long)__entry->bno,
305 __entry->buffer_length, 305 __entry->nblks,
306 __entry->hold, 306 __entry->hold,
307 __entry->pincount, 307 __entry->pincount,
308 __entry->lockval, 308 __entry->lockval,
@@ -328,7 +328,7 @@ DEFINE_BUF_EVENT(xfs_buf_unlock);
328DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
329DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
330DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
331DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); 331DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
332DEFINE_BUF_EVENT(xfs_buf_delwri_split); 332DEFINE_BUF_EVENT(xfs_buf_delwri_split);
333DEFINE_BUF_EVENT(xfs_buf_get_uncached); 333DEFINE_BUF_EVENT(xfs_buf_get_uncached);
334DEFINE_BUF_EVENT(xfs_bdstrat_shut); 334DEFINE_BUF_EVENT(xfs_bdstrat_shut);
@@ -362,7 +362,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
362 TP_fast_assign( 362 TP_fast_assign(
363 __entry->dev = bp->b_target->bt_dev; 363 __entry->dev = bp->b_target->bt_dev;
364 __entry->bno = bp->b_bn; 364 __entry->bno = bp->b_bn;
365 __entry->buffer_length = bp->b_buffer_length; 365 __entry->buffer_length = BBTOB(bp->b_length);
366 __entry->flags = flags; 366 __entry->flags = flags;
367 __entry->hold = atomic_read(&bp->b_hold); 367 __entry->hold = atomic_read(&bp->b_hold);
368 __entry->pincount = atomic_read(&bp->b_pin_count); 368 __entry->pincount = atomic_read(&bp->b_pin_count);
@@ -406,7 +406,7 @@ TRACE_EVENT(xfs_buf_ioerror,
406 TP_fast_assign( 406 TP_fast_assign(
407 __entry->dev = bp->b_target->bt_dev; 407 __entry->dev = bp->b_target->bt_dev;
408 __entry->bno = bp->b_bn; 408 __entry->bno = bp->b_bn;
409 __entry->buffer_length = bp->b_buffer_length; 409 __entry->buffer_length = BBTOB(bp->b_length);
410 __entry->hold = atomic_read(&bp->b_hold); 410 __entry->hold = atomic_read(&bp->b_hold);
411 __entry->pincount = atomic_read(&bp->b_pin_count); 411 __entry->pincount = atomic_read(&bp->b_pin_count);
412 __entry->lockval = bp->b_sema.count; 412 __entry->lockval = bp->b_sema.count;
@@ -450,7 +450,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
450 __entry->bli_recur = bip->bli_recur; 450 __entry->bli_recur = bip->bli_recur;
451 __entry->bli_refcount = atomic_read(&bip->bli_refcount); 451 __entry->bli_refcount = atomic_read(&bip->bli_refcount);
452 __entry->buf_bno = bip->bli_buf->b_bn; 452 __entry->buf_bno = bip->bli_buf->b_bn;
453 __entry->buf_len = bip->bli_buf->b_buffer_length; 453 __entry->buf_len = BBTOB(bip->bli_buf->b_length);
454 __entry->buf_flags = bip->bli_buf->b_flags; 454 __entry->buf_flags = bip->bli_buf->b_flags;
455 __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); 455 __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
456 __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); 456 __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
@@ -486,12 +486,10 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); 486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
487DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); 487DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); 488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); 489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); 490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); 491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); 492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
495DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); 493DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
496DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); 494DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
497DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); 495DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -876,15 +874,30 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
876 __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) 874 __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
877) 875)
878 876
877TRACE_EVENT(xfs_log_force,
878 TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn),
879 TP_ARGS(mp, lsn),
880 TP_STRUCT__entry(
881 __field(dev_t, dev)
882 __field(xfs_lsn_t, lsn)
883 ),
884 TP_fast_assign(
885 __entry->dev = mp->m_super->s_dev;
886 __entry->lsn = lsn;
887 ),
888 TP_printk("dev %d:%d lsn 0x%llx",
889 MAJOR(__entry->dev), MINOR(__entry->dev),
890 __entry->lsn)
891)
892
879#define DEFINE_LOG_ITEM_EVENT(name) \ 893#define DEFINE_LOG_ITEM_EVENT(name) \
880DEFINE_EVENT(xfs_log_item_class, name, \ 894DEFINE_EVENT(xfs_log_item_class, name, \
881 TP_PROTO(struct xfs_log_item *lip), \ 895 TP_PROTO(struct xfs_log_item *lip), \
882 TP_ARGS(lip)) 896 TP_ARGS(lip))
883DEFINE_LOG_ITEM_EVENT(xfs_ail_push); 897DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
884DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf);
885DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned);
886DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); 898DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
887DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); 899DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
900DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
888 901
889 902
890DECLARE_EVENT_CLASS(xfs_file_class, 903DECLARE_EVENT_CLASS(xfs_file_class,
@@ -1145,7 +1158,7 @@ TRACE_EVENT(xfs_bunmap,
1145 1158
1146); 1159);
1147 1160
1148DECLARE_EVENT_CLASS(xfs_busy_class, 1161DECLARE_EVENT_CLASS(xfs_extent_busy_class,
1149 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1162 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1150 xfs_agblock_t agbno, xfs_extlen_t len), 1163 xfs_agblock_t agbno, xfs_extlen_t len),
1151 TP_ARGS(mp, agno, agbno, len), 1164 TP_ARGS(mp, agno, agbno, len),
@@ -1168,17 +1181,17 @@ DECLARE_EVENT_CLASS(xfs_busy_class,
1168 __entry->len) 1181 __entry->len)
1169); 1182);
1170#define DEFINE_BUSY_EVENT(name) \ 1183#define DEFINE_BUSY_EVENT(name) \
1171DEFINE_EVENT(xfs_busy_class, name, \ 1184DEFINE_EVENT(xfs_extent_busy_class, name, \
1172 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ 1185 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1173 xfs_agblock_t agbno, xfs_extlen_t len), \ 1186 xfs_agblock_t agbno, xfs_extlen_t len), \
1174 TP_ARGS(mp, agno, agbno, len)) 1187 TP_ARGS(mp, agno, agbno, len))
1175DEFINE_BUSY_EVENT(xfs_alloc_busy); 1188DEFINE_BUSY_EVENT(xfs_extent_busy);
1176DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem); 1189DEFINE_BUSY_EVENT(xfs_extent_busy_enomem);
1177DEFINE_BUSY_EVENT(xfs_alloc_busy_force); 1190DEFINE_BUSY_EVENT(xfs_extent_busy_force);
1178DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse); 1191DEFINE_BUSY_EVENT(xfs_extent_busy_reuse);
1179DEFINE_BUSY_EVENT(xfs_alloc_busy_clear); 1192DEFINE_BUSY_EVENT(xfs_extent_busy_clear);
1180 1193
1181TRACE_EVENT(xfs_alloc_busy_trim, 1194TRACE_EVENT(xfs_extent_busy_trim,
1182 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1195 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1183 xfs_agblock_t agbno, xfs_extlen_t len, 1196 xfs_agblock_t agbno, xfs_extlen_t len,
1184 xfs_agblock_t tbno, xfs_extlen_t tlen), 1197 xfs_agblock_t tbno, xfs_extlen_t tlen),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 103b00c90004..cdf896fcbfa4 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -19,9 +19,7 @@
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_fs.h" 20#include "xfs_fs.h"
21#include "xfs_types.h" 21#include "xfs_types.h"
22#include "xfs_bit.h"
23#include "xfs_log.h" 22#include "xfs_log.h"
24#include "xfs_inum.h"
25#include "xfs_trans.h" 23#include "xfs_trans.h"
26#include "xfs_sb.h" 24#include "xfs_sb.h"
27#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -36,6 +34,7 @@
36#include "xfs_btree.h" 34#include "xfs_btree.h"
37#include "xfs_ialloc.h" 35#include "xfs_ialloc.h"
38#include "xfs_alloc.h" 36#include "xfs_alloc.h"
37#include "xfs_extent_busy.h"
39#include "xfs_bmap.h" 38#include "xfs_bmap.h"
40#include "xfs_quota.h" 39#include "xfs_quota.h"
41#include "xfs_trans_priv.h" 40#include "xfs_trans_priv.h"
@@ -608,8 +607,8 @@ STATIC void
608xfs_trans_free( 607xfs_trans_free(
609 struct xfs_trans *tp) 608 struct xfs_trans *tp)
610{ 609{
611 xfs_alloc_busy_sort(&tp->t_busy); 610 xfs_extent_busy_sort(&tp->t_busy);
612 xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); 611 xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
613 612
614 atomic_dec(&tp->t_mountp->m_active_trans); 613 atomic_dec(&tp->t_mountp->m_active_trans);
615 xfs_trans_free_dqinfo(tp); 614 xfs_trans_free_dqinfo(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index f6118703f20d..7ab99e1898c8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -345,11 +345,9 @@ struct xfs_item_ops {
345 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); 345 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
346 void (*iop_pin)(xfs_log_item_t *); 346 void (*iop_pin)(xfs_log_item_t *);
347 void (*iop_unpin)(xfs_log_item_t *, int remove); 347 void (*iop_unpin)(xfs_log_item_t *, int remove);
348 uint (*iop_trylock)(xfs_log_item_t *); 348 uint (*iop_push)(struct xfs_log_item *, struct list_head *);
349 void (*iop_unlock)(xfs_log_item_t *); 349 void (*iop_unlock)(xfs_log_item_t *);
350 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); 350 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
351 void (*iop_push)(xfs_log_item_t *);
352 bool (*iop_pushbuf)(xfs_log_item_t *);
353 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); 351 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
354}; 352};
355 353
@@ -357,20 +355,18 @@ struct xfs_item_ops {
357#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) 355#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
358#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) 356#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
359#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) 357#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove)
360#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) 358#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list)
361#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) 359#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
362#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) 360#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
363#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
364#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
365#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) 361#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
366 362
367/* 363/*
368 * Return values for the IOP_TRYLOCK() routines. 364 * Return values for the IOP_PUSH() routines.
369 */ 365 */
370#define XFS_ITEM_SUCCESS 0 366#define XFS_ITEM_SUCCESS 0
371#define XFS_ITEM_PINNED 1 367#define XFS_ITEM_PINNED 1
372#define XFS_ITEM_LOCKED 2 368#define XFS_ITEM_LOCKED 2
373#define XFS_ITEM_PUSHBUF 3 369#define XFS_ITEM_FLUSHING 3
374 370
375/* 371/*
376 * This is the type of function which can be given to xfs_trans_callback() 372 * This is the type of function which can be given to xfs_trans_callback()
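The xfs_trans.h hunk above folds the old iop_trylock/iop_push/iop_pushbuf callbacks into a single ->iop_push() that is handed the AIL's delwri buffer list and returns one of four states. How xfsaild_push() acts on those states is shown in the xfs_trans_ail.c diff that follows; stripped to its skeleton (sketch only, not part of the patch):

	/* illustrative sketch -- skeleton of the new xfsaild_push() loop body */
	lock_result = IOP_PUSH(lip, &ailp->xa_buf_list);
	switch (lock_result) {
	case XFS_ITEM_SUCCESS:		/* flushed; buffer queued on xa_buf_list */
		ailp->xa_last_pushed_lsn = lsn;
		break;
	case XFS_ITEM_FLUSHING:		/* already under I/O; re-check soon */
		flushing++;
		ailp->xa_last_pushed_lsn = lsn;
		break;
	case XFS_ITEM_PINNED:		/* needs a log force to make progress */
		stuck++;
		ailp->xa_log_flush++;
		break;
	case XFS_ITEM_LOCKED:		/* held by someone else; skip for now */
		stuck++;
		break;
	}

	/* after the scan: one non-blocking submission of all queued buffers */
	if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
		ailp->xa_log_flush++;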
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1dead07f092c..9c514483e599 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -20,7 +20,6 @@
20#include "xfs_fs.h" 20#include "xfs_fs.h"
21#include "xfs_types.h" 21#include "xfs_types.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -79,7 +78,7 @@ xfs_ail_check(
79 * Return a pointer to the first item in the AIL. If the AIL is empty, then 78 * Return a pointer to the first item in the AIL. If the AIL is empty, then
80 * return NULL. 79 * return NULL.
81 */ 80 */
82static xfs_log_item_t * 81xfs_log_item_t *
83xfs_ail_min( 82xfs_ail_min(
84 struct xfs_ail *ailp) 83 struct xfs_ail *ailp)
85{ 84{
@@ -364,30 +363,31 @@ xfsaild_push(
364 xfs_log_item_t *lip; 363 xfs_log_item_t *lip;
365 xfs_lsn_t lsn; 364 xfs_lsn_t lsn;
366 xfs_lsn_t target; 365 xfs_lsn_t target;
367 long tout = 10; 366 long tout;
368 int stuck = 0; 367 int stuck = 0;
368 int flushing = 0;
369 int count = 0; 369 int count = 0;
370 int push_xfsbufd = 0;
371 370
372 /* 371 /*
373 * If last time we ran we encountered pinned items, force the log first 372 * If we encountered pinned items or did not finish writing out all
374 * and wait for it before pushing again. 373 * buffers the last time we ran, force the log first and wait for it
374 * before pushing again.
375 */ 375 */
376 spin_lock(&ailp->xa_lock); 376 if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 &&
377 if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && 377 (!list_empty_careful(&ailp->xa_buf_list) ||
378 !list_empty(&ailp->xa_ail)) { 378 xfs_ail_min_lsn(ailp))) {
379 ailp->xa_log_flush = 0; 379 ailp->xa_log_flush = 0;
380 spin_unlock(&ailp->xa_lock); 380
381 XFS_STATS_INC(xs_push_ail_flush); 381 XFS_STATS_INC(xs_push_ail_flush);
382 xfs_log_force(mp, XFS_LOG_SYNC); 382 xfs_log_force(mp, XFS_LOG_SYNC);
383 spin_lock(&ailp->xa_lock);
384 } 383 }
385 384
386 target = ailp->xa_target; 385 spin_lock(&ailp->xa_lock);
387 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); 386 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
388 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 387 if (!lip) {
389 /* 388 /*
390 * AIL is empty or our push has reached the end. 389 * If the AIL is empty or our push has reached the end we are
390 * done now.
391 */ 391 */
392 xfs_trans_ail_cursor_done(ailp, &cur); 392 xfs_trans_ail_cursor_done(ailp, &cur);
393 spin_unlock(&ailp->xa_lock); 393 spin_unlock(&ailp->xa_lock);
@@ -396,54 +396,42 @@ xfsaild_push(
396 396
397 XFS_STATS_INC(xs_push_ail); 397 XFS_STATS_INC(xs_push_ail);
398 398
399 /*
400 * While the item we are looking at is below the given threshold
401 * try to flush it out. We'd like not to stop until we've at least
402 * tried to push on everything in the AIL with an LSN less than
403 * the given threshold.
404 *
405 * However, we will stop after a certain number of pushes and wait
406 * for a reduced timeout to fire before pushing further. This
407 * prevents use from spinning when we can't do anything or there is
408 * lots of contention on the AIL lists.
409 */
410 lsn = lip->li_lsn; 399 lsn = lip->li_lsn;
400 target = ailp->xa_target;
411 while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { 401 while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
412 int lock_result; 402 int lock_result;
403
413 /* 404 /*
414 * If we can lock the item without sleeping, unlock the AIL 405 * Note that IOP_PUSH may unlock and reacquire the AIL lock. We
415 * lock and flush the item. Then re-grab the AIL lock so we 406 * rely on the AIL cursor implementation to be able to deal with
416 * can look for the next item on the AIL. List changes are 407 * the dropped lock.
417 * handled by the AIL lookup functions internally
418 *
419 * If we can't lock the item, either its holder will flush it
420 * or it is already being flushed or it is being relogged. In
421 * any of these case it is being taken care of and we can just
422 * skip to the next item in the list.
423 */ 408 */
424 lock_result = IOP_TRYLOCK(lip); 409 lock_result = IOP_PUSH(lip, &ailp->xa_buf_list);
425 spin_unlock(&ailp->xa_lock);
426 switch (lock_result) { 410 switch (lock_result) {
427 case XFS_ITEM_SUCCESS: 411 case XFS_ITEM_SUCCESS:
428 XFS_STATS_INC(xs_push_ail_success); 412 XFS_STATS_INC(xs_push_ail_success);
429 trace_xfs_ail_push(lip); 413 trace_xfs_ail_push(lip);
430 414
431 IOP_PUSH(lip);
432 ailp->xa_last_pushed_lsn = lsn; 415 ailp->xa_last_pushed_lsn = lsn;
433 break; 416 break;
434 417
435 case XFS_ITEM_PUSHBUF: 418 case XFS_ITEM_FLUSHING:
436 XFS_STATS_INC(xs_push_ail_pushbuf); 419 /*
 437 trace_xfs_ail_pushbuf(lip); 420 * The item or its backing buffer is already being
438 421 * flushed. The typical reason for that is that an
439 if (!IOP_PUSHBUF(lip)) { 422 * inode buffer is locked because we already pushed the
440 trace_xfs_ail_pushbuf_pinned(lip); 423 * updates to it as part of inode clustering.
441 stuck++; 424 *
 442 ailp->xa_log_flush++; 425 * We do not want to stop flushing just because lots
 443 } else { 426 * of items are already being flushed, but we need to
 444 ailp->xa_last_pushed_lsn = lsn; 427 * re-try the flushing relatively soon if most of the
 445 } 428 * AIL is being flushed.
446 push_xfsbufd = 1; 429 */
430 XFS_STATS_INC(xs_push_ail_flushing);
431 trace_xfs_ail_flushing(lip);
432
433 flushing++;
434 ailp->xa_last_pushed_lsn = lsn;
447 break; 435 break;
448 436
449 case XFS_ITEM_PINNED: 437 case XFS_ITEM_PINNED:
@@ -453,28 +441,22 @@ xfsaild_push(
453 stuck++; 441 stuck++;
454 ailp->xa_log_flush++; 442 ailp->xa_log_flush++;
455 break; 443 break;
456
457 case XFS_ITEM_LOCKED: 444 case XFS_ITEM_LOCKED:
458 XFS_STATS_INC(xs_push_ail_locked); 445 XFS_STATS_INC(xs_push_ail_locked);
459 trace_xfs_ail_locked(lip); 446 trace_xfs_ail_locked(lip);
447
460 stuck++; 448 stuck++;
461 break; 449 break;
462
463 default: 450 default:
464 ASSERT(0); 451 ASSERT(0);
465 break; 452 break;
466 } 453 }
467 454
468 spin_lock(&ailp->xa_lock);
469 /* should we bother continuing? */
470 if (XFS_FORCED_SHUTDOWN(mp))
471 break;
472 ASSERT(mp->m_log);
473
474 count++; 455 count++;
475 456
476 /* 457 /*
477 * Are there too many items we can't do anything with? 458 * Are there too many items we can't do anything with?
459 *
 478 * If we are skipping too many items because we can't flush 460 * If we are skipping too many items because we can't flush
 479 * them or they are already being flushed, we back off and 461 * them or they are already being flushed, we back off and
 480 * give them time to complete whatever operation is being 462 * give them time to complete whatever operation is being
@@ -496,42 +478,36 @@ xfsaild_push(
496 xfs_trans_ail_cursor_done(ailp, &cur); 478 xfs_trans_ail_cursor_done(ailp, &cur);
497 spin_unlock(&ailp->xa_lock); 479 spin_unlock(&ailp->xa_lock);
498 480
499 if (push_xfsbufd) { 481 if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
500 /* we've got delayed write buffers to flush */ 482 ailp->xa_log_flush++;
501 wake_up_process(mp->m_ddev_targp->bt_task);
502 }
503 483
504 /* assume we have more work to do in a short while */ 484 if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
505out_done: 485out_done:
506 if (!count) {
507 /* We're past our target or empty, so idle */
508 ailp->xa_last_pushed_lsn = 0;
509 ailp->xa_log_flush = 0;
510
511 tout = 50;
512 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
513 /* 486 /*
514 * We reached the target so wait a bit longer for I/O to 487 * We reached the target or the AIL is empty, so wait a bit
515 * complete and remove pushed items from the AIL before we 488 * longer for I/O to complete and remove pushed items from the
516 * start the next scan from the start of the AIL. 489 * AIL before we start the next scan from the start of the AIL.
517 */ 490 */
518 tout = 50; 491 tout = 50;
519 ailp->xa_last_pushed_lsn = 0; 492 ailp->xa_last_pushed_lsn = 0;
520 } else if ((stuck * 100) / count > 90) { 493 } else if (((stuck + flushing) * 100) / count > 90) {
521 /* 494 /*
522 * Either there is a lot of contention on the AIL or we 495 * Either there is a lot of contention on the AIL or we are
523 * are stuck due to operations in progress. "Stuck" in this 496 * stuck due to operations in progress. "Stuck" in this case
524 * case is defined as >90% of the items we tried to push 497 * is defined as >90% of the items we tried to push were stuck.
525 * were stuck.
526 * 498 *
527 * Backoff a bit more to allow some I/O to complete before 499 * Backoff a bit more to allow some I/O to complete before
528 * restarting from the start of the AIL. This prevents us 500 * restarting from the start of the AIL. This prevents us from
 529 * from spinning on the same items, and if they are pinned will 501 * spinning on the same items, and if they are pinned will allow
530 * all the restart to issue a log force to unpin the stuck 502 * the restart to issue a log force to unpin the stuck items.
531 * items.
532 */ 503 */
533 tout = 20; 504 tout = 20;
534 ailp->xa_last_pushed_lsn = 0; 505 ailp->xa_last_pushed_lsn = 0;
506 } else {
507 /*
508 * Assume we have more work to do in a short while.
509 */
510 tout = 10;
535 } 511 }
536 512
537 return tout; 513 return tout;
@@ -544,6 +520,8 @@ xfsaild(
544 struct xfs_ail *ailp = data; 520 struct xfs_ail *ailp = data;
545 long tout = 0; /* milliseconds */ 521 long tout = 0; /* milliseconds */
546 522
523 current->flags |= PF_MEMALLOC;
524
547 while (!kthread_should_stop()) { 525 while (!kthread_should_stop()) {
548 if (tout && tout <= 20) 526 if (tout && tout <= 20)
549 __set_current_state(TASK_KILLABLE); 527 __set_current_state(TASK_KILLABLE);
@@ -611,6 +589,30 @@ xfs_ail_push_all(
611} 589}
612 590
613/* 591/*
592 * Push out all items in the AIL immediately and wait until the AIL is empty.
593 */
594void
595xfs_ail_push_all_sync(
596 struct xfs_ail *ailp)
597{
598 struct xfs_log_item *lip;
599 DEFINE_WAIT(wait);
600
601 spin_lock(&ailp->xa_lock);
602 while ((lip = xfs_ail_max(ailp)) != NULL) {
603 prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE);
604 ailp->xa_target = lip->li_lsn;
605 wake_up_process(ailp->xa_task);
606 spin_unlock(&ailp->xa_lock);
607 schedule();
608 spin_lock(&ailp->xa_lock);
609 }
610 spin_unlock(&ailp->xa_lock);
611
612 finish_wait(&ailp->xa_empty, &wait);
613}
614
615/*
614 * xfs_trans_ail_update - bulk AIL insertion operation. 616 * xfs_trans_ail_update - bulk AIL insertion operation.
615 * 617 *
616 * @xfs_trans_ail_update takes an array of log items that all need to be 618 * @xfs_trans_ail_update takes an array of log items that all need to be
@@ -667,11 +669,15 @@ xfs_trans_ail_update_bulk(
667 669
668 if (!list_empty(&tmp)) 670 if (!list_empty(&tmp))
669 xfs_ail_splice(ailp, cur, &tmp, lsn); 671 xfs_ail_splice(ailp, cur, &tmp, lsn);
670 spin_unlock(&ailp->xa_lock);
671 672
672 if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { 673 if (mlip_changed) {
673 xlog_assign_tail_lsn(ailp->xa_mount); 674 if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
675 xlog_assign_tail_lsn_locked(ailp->xa_mount);
676 spin_unlock(&ailp->xa_lock);
677
674 xfs_log_space_wake(ailp->xa_mount); 678 xfs_log_space_wake(ailp->xa_mount);
679 } else {
680 spin_unlock(&ailp->xa_lock);
675 } 681 }
676} 682}
677 683
@@ -700,7 +706,8 @@ void
700xfs_trans_ail_delete_bulk( 706xfs_trans_ail_delete_bulk(
701 struct xfs_ail *ailp, 707 struct xfs_ail *ailp,
702 struct xfs_log_item **log_items, 708 struct xfs_log_item **log_items,
703 int nr_items) __releases(ailp->xa_lock) 709 int nr_items,
710 int shutdown_type) __releases(ailp->xa_lock)
704{ 711{
705 xfs_log_item_t *mlip; 712 xfs_log_item_t *mlip;
706 int mlip_changed = 0; 713 int mlip_changed = 0;
@@ -718,7 +725,7 @@ xfs_trans_ail_delete_bulk(
718 xfs_alert_tag(mp, XFS_PTAG_AILDELETE, 725 xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
719 "%s: attempting to delete a log item that is not in the AIL", 726 "%s: attempting to delete a log item that is not in the AIL",
720 __func__); 727 __func__);
721 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 728 xfs_force_shutdown(mp, shutdown_type);
722 } 729 }
723 return; 730 return;
724 } 731 }
@@ -729,28 +736,20 @@ xfs_trans_ail_delete_bulk(
729 if (mlip == lip) 736 if (mlip == lip)
730 mlip_changed = 1; 737 mlip_changed = 1;
731 } 738 }
732 spin_unlock(&ailp->xa_lock);
733 739
734 if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { 740 if (mlip_changed) {
735 xlog_assign_tail_lsn(ailp->xa_mount); 741 if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
742 xlog_assign_tail_lsn_locked(ailp->xa_mount);
743 if (list_empty(&ailp->xa_ail))
744 wake_up_all(&ailp->xa_empty);
745 spin_unlock(&ailp->xa_lock);
746
736 xfs_log_space_wake(ailp->xa_mount); 747 xfs_log_space_wake(ailp->xa_mount);
748 } else {
749 spin_unlock(&ailp->xa_lock);
737 } 750 }
738} 751}
739 752
740/*
741 * The active item list (AIL) is a doubly linked list of log
742 * items sorted by ascending lsn. The base of the list is
743 * a forw/back pointer pair embedded in the xfs mount structure.
744 * The base is initialized with both pointers pointing to the
745 * base. This case always needs to be distinguished, because
746 * the base has no lsn to look at. We almost always insert
747 * at the end of the list, so on inserts we search from the
748 * end of the list to find where the new item belongs.
749 */
750
751/*
752 * Initialize the doubly linked list to point only to itself.
753 */
754int 753int
755xfs_trans_ail_init( 754xfs_trans_ail_init(
756 xfs_mount_t *mp) 755 xfs_mount_t *mp)
@@ -765,6 +764,8 @@ xfs_trans_ail_init(
765 INIT_LIST_HEAD(&ailp->xa_ail); 764 INIT_LIST_HEAD(&ailp->xa_ail);
766 INIT_LIST_HEAD(&ailp->xa_cursors); 765 INIT_LIST_HEAD(&ailp->xa_cursors);
767 spin_lock_init(&ailp->xa_lock); 766 spin_lock_init(&ailp->xa_lock);
767 INIT_LIST_HEAD(&ailp->xa_buf_list);
768 init_waitqueue_head(&ailp->xa_empty);
768 769
769 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", 770 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
770 ailp->xa_mount->m_fsname); 771 ailp->xa_mount->m_fsname);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 1302d1d95a58..21c5a5e3700d 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -18,9 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_sb.h" 23#include "xfs_sb.h"
26#include "xfs_ag.h" 24#include "xfs_ag.h"
@@ -33,7 +31,6 @@
33#include "xfs_buf_item.h" 31#include "xfs_buf_item.h"
34#include "xfs_trans_priv.h" 32#include "xfs_trans_priv.h"
35#include "xfs_error.h" 33#include "xfs_error.h"
36#include "xfs_rw.h"
37#include "xfs_trace.h" 34#include "xfs_trace.h"
38 35
39/* 36/*
@@ -56,7 +53,7 @@ xfs_trans_buf_item_match(
56 if (blip->bli_item.li_type == XFS_LI_BUF && 53 if (blip->bli_item.li_type == XFS_LI_BUF &&
57 blip->bli_buf->b_target == target && 54 blip->bli_buf->b_target == target &&
58 XFS_BUF_ADDR(blip->bli_buf) == blkno && 55 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
59 XFS_BUF_COUNT(blip->bli_buf) == len) 56 BBTOB(blip->bli_buf->b_length) == len)
60 return blip->bli_buf; 57 return blip->bli_buf;
61 } 58 }
62 59
@@ -141,15 +138,11 @@ xfs_trans_get_buf(xfs_trans_t *tp,
141 xfs_buf_t *bp; 138 xfs_buf_t *bp;
142 xfs_buf_log_item_t *bip; 139 xfs_buf_log_item_t *bip;
143 140
144 if (flags == 0)
145 flags = XBF_LOCK | XBF_MAPPED;
146
147 /* 141 /*
148 * Default to a normal get_buf() call if the tp is NULL. 142 * Default to a normal get_buf() call if the tp is NULL.
149 */ 143 */
150 if (tp == NULL) 144 if (tp == NULL)
151 return xfs_buf_get(target_dev, blkno, len, 145 return xfs_buf_get(target_dev, blkno, len, flags);
152 flags | XBF_DONT_BLOCK);
153 146
154 /* 147 /*
155 * If we find the buffer in the cache with this transaction 148 * If we find the buffer in the cache with this transaction
@@ -165,14 +158,6 @@ xfs_trans_get_buf(xfs_trans_t *tp,
165 XFS_BUF_DONE(bp); 158 XFS_BUF_DONE(bp);
166 } 159 }
167 160
168 /*
169 * If the buffer is stale then it was binval'ed
170 * since last read. This doesn't matter since the
171 * caller isn't allowed to use the data anyway.
172 */
173 else if (XFS_BUF_ISSTALE(bp))
174 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
175
176 ASSERT(bp->b_transp == tp); 161 ASSERT(bp->b_transp == tp);
177 bip = bp->b_fspriv; 162 bip = bp->b_fspriv;
178 ASSERT(bip != NULL); 163 ASSERT(bip != NULL);
@@ -182,15 +167,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
182 return (bp); 167 return (bp);
183 } 168 }
184 169
185 /* 170 bp = xfs_buf_get(target_dev, blkno, len, flags);
186 * We always specify the XBF_DONT_BLOCK flag within a transaction
187 * so that get_buf does not try to push out a delayed write buffer
188 * which might cause another transaction to take place (if the
189 * buffer was delayed alloc). Such recursive transactions can
190 * easily deadlock with our current transaction as well as cause
191 * us to run out of stack space.
192 */
193 bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK);
194 if (bp == NULL) { 171 if (bp == NULL) {
195 return NULL; 172 return NULL;
196 } 173 }
@@ -282,14 +259,13 @@ xfs_trans_read_buf(
282 xfs_buf_log_item_t *bip; 259 xfs_buf_log_item_t *bip;
283 int error; 260 int error;
284 261
285 if (flags == 0) 262 *bpp = NULL;
286 flags = XBF_LOCK | XBF_MAPPED;
287 263
288 /* 264 /*
289 * Default to a normal get_buf() call if the tp is NULL. 265 * Default to a normal get_buf() call if the tp is NULL.
290 */ 266 */
291 if (tp == NULL) { 267 if (tp == NULL) {
292 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); 268 bp = xfs_buf_read(target, blkno, len, flags);
293 if (!bp) 269 if (!bp)
294 return (flags & XBF_TRYLOCK) ? 270 return (flags & XBF_TRYLOCK) ?
295 EAGAIN : XFS_ERROR(ENOMEM); 271 EAGAIN : XFS_ERROR(ENOMEM);
@@ -297,6 +273,8 @@ xfs_trans_read_buf(
297 if (bp->b_error) { 273 if (bp->b_error) {
298 error = bp->b_error; 274 error = bp->b_error;
299 xfs_buf_ioerror_alert(bp, __func__); 275 xfs_buf_ioerror_alert(bp, __func__);
276 XFS_BUF_UNDONE(bp);
277 xfs_buf_stale(bp);
300 xfs_buf_relse(bp); 278 xfs_buf_relse(bp);
301 return error; 279 return error;
302 } 280 }
@@ -371,15 +349,7 @@ xfs_trans_read_buf(
371 return 0; 349 return 0;
372 } 350 }
373 351
374 /* 352 bp = xfs_buf_read(target, blkno, len, flags);
375 * We always specify the XBF_DONT_BLOCK flag within a transaction
376 * so that get_buf does not try to push out a delayed write buffer
377 * which might cause another transaction to take place (if the
378 * buffer was delayed alloc). Such recursive transactions can
379 * easily deadlock with our current transaction as well as cause
380 * us to run out of stack space.
381 */
382 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
383 if (bp == NULL) { 353 if (bp == NULL) {
384 *bpp = NULL; 354 *bpp = NULL;
385 return (flags & XBF_TRYLOCK) ? 355 return (flags & XBF_TRYLOCK) ?
@@ -418,19 +388,6 @@ xfs_trans_read_buf(
418 return 0; 388 return 0;
419 389
420shutdown_abort: 390shutdown_abort:
421 /*
422 * the theory here is that buffer is good but we're
423 * bailing out because the filesystem is being forcibly
424 * shut down. So we should leave the b_flags alone since
425 * the buffer's not staled and just get out.
426 */
427#if defined(DEBUG)
428 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
429 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
430#endif
431 ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) !=
432 (XBF_STALE|XBF_DELWRI));
433
434 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 391 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
435 xfs_buf_relse(bp); 392 xfs_buf_relse(bp);
436 *bpp = NULL; 393 *bpp = NULL;
@@ -606,7 +563,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
606 563
607 ASSERT(bp->b_transp == tp); 564 ASSERT(bp->b_transp == tp);
608 ASSERT(bip != NULL); 565 ASSERT(bip != NULL);
609 ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); 566 ASSERT(first <= last && last < BBTOB(bp->b_length));
610 ASSERT(bp->b_iodone == NULL || 567 ASSERT(bp->b_iodone == NULL ||
611 bp->b_iodone == xfs_buf_iodone_callbacks); 568 bp->b_iodone == xfs_buf_iodone_callbacks);
612 569
@@ -626,8 +583,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
626 bp->b_iodone = xfs_buf_iodone_callbacks; 583 bp->b_iodone = xfs_buf_iodone_callbacks;
627 bip->bli_item.li_cb = xfs_buf_iodone; 584 bip->bli_item.li_cb = xfs_buf_iodone;
628 585
629 xfs_buf_delwri_queue(bp);
630
631 trace_xfs_trans_log_buf(bip); 586 trace_xfs_trans_log_buf(bip);
632 587
633 /* 588 /*
@@ -651,22 +606,33 @@ xfs_trans_log_buf(xfs_trans_t *tp,
651 606
652 607
653/* 608/*
654 * This called to invalidate a buffer that is being used within 609 * Invalidate a buffer that is being used within a transaction.
655 * a transaction. Typically this is because the blocks in the 610 *
656 * buffer are being freed, so we need to prevent it from being 611 * Typically this is because the blocks in the buffer are being freed, so we
657 * written out when we're done. Allowing it to be written again 612 * need to prevent it from being written out when we're done. Allowing it
658 * might overwrite data in the free blocks if they are reallocated 613 * to be written again might overwrite data in the free blocks if they are
659 * to a file. 614 * reallocated to a file.
615 *
616 * We prevent the buffer from being written out by marking it stale. We can't
617 * get rid of the buf log item at this point because the buffer may still be
618 * pinned by another transaction. If that is the case, then we'll wait until
619 * the buffer is committed to disk for the last time (we can tell by the ref
620 * count) and free it in xfs_buf_item_unpin(). Until that happens we will
621 * keep the buffer locked so that the buffer and buf log item are not reused.
622 *
623 * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log
624 * the buf item. This will be used at recovery time to determine that copies
625 * of the buffer in the log before this should not be replayed.
660 * 626 *
661 * We prevent the buffer from being written out by clearing the 627 * We mark the item descriptor and the transaction dirty so that we'll hold
662 * B_DELWRI flag. We can't always 628 * the buffer until after the commit.
663 * get rid of the buf log item at this point, though, because 629 *
664 * the buffer may still be pinned by another transaction. If that 630 * Since we're invalidating the buffer, we also clear the state about which
665 * is the case, then we'll wait until the buffer is committed to 631 * parts of the buffer have been logged. We also clear the flag indicating
666 * disk for the last time (we can tell by the ref count) and 632 * that this is an inode buffer since the data in the buffer will no longer
667 * free it in xfs_buf_item_unpin(). Until it is cleaned up we 633 * be valid.
668 * will keep the buffer locked so that the buffer and buf log item 634 *
669 * are not reused. 635 * We set the stale bit in the buffer as well since we're getting rid of it.
670 */ 636 */
671void 637void
672xfs_trans_binval( 638xfs_trans_binval(
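In outline, the rewritten comment above describes a handful of flag transitions: mark the buffer stale so it is never written back, mark the buf log item stale so it is freed at final unpin, clear the logged/dirty/inode-buffer state, and set the cancel flag so log recovery skips earlier copies. A compact stand-alone model of those transitions (the flag values and structs below are invented for illustration, not the kernel's XFS_BLI_* / XFS_BLF_* definitions; the real function body continues in the next hunks) might be:

#include <stdint.h>

/* Hypothetical stand-in flags; not the kernel definitions. */
#define BUF_STALE		(1u << 0)
#define BLI_STALE		(1u << 1)
#define BLI_LOGGED		(1u << 2)
#define BLI_DIRTY		(1u << 3)
#define BLI_INODE_BUF		(1u << 4)
#define BLF_CANCEL		(1u << 5)
#define BLF_INODE_BUF		(1u << 6)

struct buf_model	{ uint32_t b_flags; };
struct buf_item_model	{ uint32_t bli_flags; uint32_t blf_flags; };

/* Model of the invalidation steps listed in the comment above. */
static void binval_model(struct buf_model *bp, struct buf_item_model *bip)
{
	bp->b_flags |= BUF_STALE;		/* never write this buffer back */
	bip->bli_flags |= BLI_STALE;		/* free at final unpin */
	bip->bli_flags &= ~(BLI_INODE_BUF | BLI_LOGGED | BLI_DIRTY);
	bip->blf_flags &= ~BLF_INODE_BUF;	/* contents no longer valid */
	bip->blf_flags |= BLF_CANCEL;		/* recovery: skip older copies */
	/* the item descriptor and transaction are then marked dirty so the
	   buffer stays locked and held until the transaction commits */
}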
@@ -686,7 +652,6 @@ xfs_trans_binval(
686 * If the buffer is already invalidated, then 652 * If the buffer is already invalidated, then
687 * just return. 653 * just return.
688 */ 654 */
689 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
690 ASSERT(XFS_BUF_ISSTALE(bp)); 655 ASSERT(XFS_BUF_ISSTALE(bp));
691 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 656 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
692 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); 657 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
@@ -696,27 +661,8 @@ xfs_trans_binval(
696 return; 661 return;
697 } 662 }
698 663
699 /*
700 * Clear the dirty bit in the buffer and set the STALE flag
701 * in the buf log item. The STALE flag will be used in
702 * xfs_buf_item_unpin() to determine if it should clean up
703 * when the last reference to the buf item is given up.
704 * We set the XFS_BLF_CANCEL flag in the buf log format structure
705 * and log the buf item. This will be used at recovery time
706 * to determine that copies of the buffer in the log before
707 * this should not be replayed.
708 * We mark the item descriptor and the transaction dirty so
709 * that we'll hold the buffer until after the commit.
710 *
711 * Since we're invalidating the buffer, we also clear the state
712 * about which parts of the buffer have been logged. We also
713 * clear the flag indicating that this is an inode buffer since
714 * the data in the buffer will no longer be valid.
715 *
716 * We set the stale bit in the buffer as well since we're getting
717 * rid of it.
718 */
719 xfs_buf_stale(bp); 664 xfs_buf_stale(bp);
665
720 bip->bli_flags |= XFS_BLI_STALE; 666 bip->bli_flags |= XFS_BLI_STALE;
721 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); 667 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
722 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; 668 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 279099717ed2..bcb60542fcf1 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,9 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h" 20#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 21#include "xfs_trans.h"
24#include "xfs_sb.h" 22#include "xfs_sb.h"
25#include "xfs_ag.h" 23#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f7590f5badea..8d71b16eccae 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -19,7 +19,6 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 7a7442c03f2b..d2eee20d5f5b 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -18,9 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_sb.h" 23#include "xfs_sb.h"
26#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 8ab2ced415f1..fb62377d1cbc 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -71,6 +71,8 @@ struct xfs_ail {
71 spinlock_t xa_lock; 71 spinlock_t xa_lock;
72 xfs_lsn_t xa_last_pushed_lsn; 72 xfs_lsn_t xa_last_pushed_lsn;
73 int xa_log_flush; 73 int xa_log_flush;
74 struct list_head xa_buf_list;
75 wait_queue_head_t xa_empty;
74}; 76};
75 77
76/* 78/*
@@ -90,18 +92,22 @@ xfs_trans_ail_update(
90} 92}
91 93
92void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, 94void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
93 struct xfs_log_item **log_items, int nr_items) 95 struct xfs_log_item **log_items, int nr_items,
96 int shutdown_type)
94 __releases(ailp->xa_lock); 97 __releases(ailp->xa_lock);
95static inline void 98static inline void
96xfs_trans_ail_delete( 99xfs_trans_ail_delete(
97 struct xfs_ail *ailp, 100 struct xfs_ail *ailp,
98 xfs_log_item_t *lip) __releases(ailp->xa_lock) 101 xfs_log_item_t *lip,
102 int shutdown_type) __releases(ailp->xa_lock)
99{ 103{
100 xfs_trans_ail_delete_bulk(ailp, &lip, 1); 104 xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
101} 105}
102 106
103void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); 107void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
104void xfs_ail_push_all(struct xfs_ail *); 108void xfs_ail_push_all(struct xfs_ail *);
109void xfs_ail_push_all_sync(struct xfs_ail *);
110struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp);
105xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); 111xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
106 112
107struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, 113struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 65584b55607d..398cf681d025 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -57,6 +57,7 @@ typedef __uint64_t __psunsigned_t;
57#endif /* __KERNEL__ */ 57#endif /* __KERNEL__ */
58 58
59typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ 59typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */
60typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */
60typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ 61typedef __uint32_t xfs_extlen_t; /* extent length in blocks */
61typedef __uint32_t xfs_agnumber_t; /* allocation group number */ 62typedef __uint32_t xfs_agnumber_t; /* allocation group number */
62typedef __int32_t xfs_extnum_t; /* # of extents in a file */ 63typedef __int32_t xfs_extnum_t; /* # of extents in a file */
@@ -101,6 +102,7 @@ typedef __uint64_t xfs_fileoff_t; /* block number in a file */
101typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ 102typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
102typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ 103typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
103 104
105
104/* 106/*
105 * Null values for the types. 107 * Null values for the types.
106 */ 108 */
@@ -120,6 +122,9 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
120 122
121#define NULLCOMMITLSN ((xfs_lsn_t)-1) 123#define NULLCOMMITLSN ((xfs_lsn_t)-1)
122 124
125#define NULLFSINO ((xfs_ino_t)-1)
126#define NULLAGINO ((xfs_agino_t)-1)
127
123/* 128/*
124 * Max values for extlen, extnum, aextnum. 129 * Max values for extlen, extnum, aextnum.
125 */ 130 */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 79c05ac85bfe..4e5b9ad5cb97 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -18,9 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h" 21#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 22#include "xfs_trans.h"
25#include "xfs_sb.h" 23#include "xfs_sb.h"
26#include "xfs_ag.h" 24#include "xfs_ag.h"
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 64981d7e7375..b6a82d817a82 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -21,7 +21,6 @@
21#include "xfs_types.h" 21#include "xfs_types.h"
22#include "xfs_bit.h" 22#include "xfs_bit.h"
23#include "xfs_log.h" 23#include "xfs_log.h"
24#include "xfs_inum.h"
25#include "xfs_trans.h" 24#include "xfs_trans.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h" 26#include "xfs_ag.h"
@@ -39,7 +38,6 @@
39#include "xfs_bmap.h" 38#include "xfs_bmap.h"
40#include "xfs_acl.h" 39#include "xfs_acl.h"
41#include "xfs_attr.h" 40#include "xfs_attr.h"
42#include "xfs_rw.h"
43#include "xfs_error.h" 41#include "xfs_error.h"
44#include "xfs_quota.h" 42#include "xfs_quota.h"
45#include "xfs_utils.h" 43#include "xfs_utils.h"
@@ -81,8 +79,7 @@ xfs_readlink_bmap(
81 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); 79 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
82 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 80 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
83 81
84 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 82 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
85 XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
86 if (!bp) 83 if (!bp)
87 return XFS_ERROR(ENOMEM); 84 return XFS_ERROR(ENOMEM);
88 error = bp->b_error; 85 error = bp->b_error;
@@ -1919,7 +1916,7 @@ xfs_alloc_file_space(
1919 1916
1920error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 1917error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1921 xfs_bmap_cancel(&free_list); 1918 xfs_bmap_cancel(&free_list);
1922 xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); 1919 xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1923 1920
1924error1: /* Just cancel transaction */ 1921error1: /* Just cancel transaction */
1925 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 1922 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
@@ -1966,7 +1963,7 @@ xfs_zero_remaining_bytes(
1966 1963
1967 bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? 1964 bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
1968 mp->m_rtdev_targp : mp->m_ddev_targp, 1965 mp->m_rtdev_targp : mp->m_ddev_targp,
1969 mp->m_sb.sb_blocksize, XBF_DONT_BLOCK); 1966 BTOBB(mp->m_sb.sb_blocksize), 0);
1970 if (!bp) 1967 if (!bp)
1971 return XFS_ERROR(ENOMEM); 1968 return XFS_ERROR(ENOMEM);
1972 1969
@@ -2315,17 +2312,33 @@ xfs_change_file_space(
2315 case XFS_IOC_ALLOCSP64: 2312 case XFS_IOC_ALLOCSP64:
2316 case XFS_IOC_FREESP: 2313 case XFS_IOC_FREESP:
2317 case XFS_IOC_FREESP64: 2314 case XFS_IOC_FREESP64:
2315 /*
2316 * These operations actually do IO when extending the file, but
2317 * the allocation is done separately to the zeroing that is
2318 * done. This set of operations needs to be serialised against
2319 * other IO operations, such as truncate and buffered IO. We
2320 * need to take the IOLOCK here to serialise the allocation and
2321 * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
2322 * truncate, direct IO) from racing against the transient
2323 * allocated but not written state we can have here.
2324 */
2325 xfs_ilock(ip, XFS_IOLOCK_EXCL);
2318 if (startoffset > fsize) { 2326 if (startoffset > fsize) {
2319 error = xfs_alloc_file_space(ip, fsize, 2327 error = xfs_alloc_file_space(ip, fsize,
2320 startoffset - fsize, 0, attr_flags); 2328 startoffset - fsize, 0,
2321 if (error) 2329 attr_flags | XFS_ATTR_NOLOCK);
2330 if (error) {
2331 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
2322 break; 2332 break;
2333 }
2323 } 2334 }
2324 2335
2325 iattr.ia_valid = ATTR_SIZE; 2336 iattr.ia_valid = ATTR_SIZE;
2326 iattr.ia_size = startoffset; 2337 iattr.ia_size = startoffset;
2327 2338
2328 error = xfs_setattr_size(ip, &iattr, attr_flags); 2339 error = xfs_setattr_size(ip, &iattr,
2340 attr_flags | XFS_ATTR_NOLOCK);
2341 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
2329 2342
2330 if (error) 2343 if (error)
2331 return error; 2344 return error;