Diffstat (limited to 'fs/xfs/linux-2.6/xfs_buf.c')
-rw-r--r--	fs/xfs/linux-2.6/xfs_buf.c	542
1 file changed, 333 insertions, 209 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 965df1227d64..44c2b0ef9a41 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
 #include "xfs.h"
 #include <linux/stddef.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
+#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/vmalloc.h>
@@ -33,12 +33,14 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
+#include <linux/list_sort.h>
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
 #include "xfs_ag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
+#include "xfs_trace.h"
 
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
@@ -53,34 +55,6 @@ static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
 struct workqueue_struct *xfsconvertd_workqueue;
 
-#ifdef XFS_BUF_TRACE
-void
-xfs_buf_trace(
-	xfs_buf_t	*bp,
-	char		*id,
-	void		*data,
-	void		*ra)
-{
-	ktrace_enter(xfs_buf_trace_buf,
-		bp, id,
-		(void *)(unsigned long)bp->b_flags,
-		(void *)(unsigned long)bp->b_hold.counter,
-		(void *)(unsigned long)bp->b_sema.count,
-		(void *)current,
-		data, ra,
-		(void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
-		(void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
-		(void *)(unsigned long)bp->b_buffer_length,
-		NULL, NULL, NULL, NULL, NULL);
-}
-ktrace_t *xfs_buf_trace_buf;
-#define XFS_BUF_TRACE_SIZE	4096
-#define XB_TRACE(bp, id, data)	\
-	xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
-#else
-#define XB_TRACE(bp, id, data)	do { } while (0)
-#endif
-
 #ifdef XFS_BUF_LOCK_TRACKING
 # define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
 # define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
@@ -103,6 +77,27 @@ ktrace_t *xfs_buf_trace_buf;
 #define xfs_buf_deallocate(bp) \
 	kmem_zone_free(xfs_buf_zone, (bp));
 
+static inline int
+xfs_buf_is_vmapped(
+	struct xfs_buf	*bp)
+{
+	/*
+	 * Return true if the buffer is vmapped.
+	 *
+	 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
+	 * code is clever enough to know it doesn't have to map a single page,
+	 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
+	 */
+	return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
+}
+
+static inline int
+xfs_buf_vmap_len(
+	struct xfs_buf	*bp)
+{
+	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+}
+
 /*
  * Page Region interfaces.
  *
@@ -149,7 +144,7 @@ page_region_mask(
 	return mask;
 }
 
-STATIC_INLINE void
+STATIC void
 set_page_region(
 	struct page	*page,
 	size_t		offset,
@@ -161,7 +156,7 @@ set_page_region(
 	SetPageUptodate(page);
 }
 
-STATIC_INLINE int
+STATIC int
 test_page_region(
 	struct page	*page,
 	size_t		offset,
@@ -173,75 +168,6 @@ test_page_region(
 }
 
 /*
- * Mapping of multi-page buffers into contiguous virtual space
- */
-
-typedef struct a_list {
-	void		*vm_addr;
-	struct a_list	*next;
-} a_list_t;
-
-static a_list_t		*as_free_head;
-static int		as_list_len;
-static DEFINE_SPINLOCK(as_lock);
-
-/*
- * Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
-	void		*addr)
-{
-	a_list_t	*aentry;
-
-#ifdef CONFIG_XEN
-	/*
-	 * Xen needs to be able to make sure it can get an exclusive
-	 * RO mapping of pages it wants to turn into a pagetable.  If
-	 * a newly allocated page is also still being vmap()ed by xfs,
-	 * it will cause pagetable construction to fail.  This is a
-	 * quick workaround to always eagerly unmap pages so that Xen
-	 * is happy.
-	 */
-	vunmap(addr);
-	return;
-#endif
-
-	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
-	if (likely(aentry)) {
-		spin_lock(&as_lock);
-		aentry->next = as_free_head;
-		aentry->vm_addr = addr;
-		as_free_head = aentry;
-		as_list_len++;
-		spin_unlock(&as_lock);
-	} else {
-		vunmap(addr);
-	}
-}
-
-STATIC void
-purge_addresses(void)
-{
-	a_list_t	*aentry, *old;
-
-	if (as_free_head == NULL)
-		return;
-
-	spin_lock(&as_lock);
-	aentry = as_free_head;
-	as_free_head = NULL;
-	as_list_len = 0;
-	spin_unlock(&as_lock);
-
-	while ((old = aentry) != NULL) {
-		vunmap(aentry->vm_addr);
-		aentry = aentry->next;
-		kfree(old);
-	}
-}
-
-/*
  * Internal xfs_buf_t object manipulation
  */
 
@@ -279,7 +205,8 @@ _xfs_buf_initialize(
 	init_waitqueue_head(&bp->b_waiters);
 
 	XFS_STATS_INC(xb_create);
-	XB_TRACE(bp, "initialize", target);
+
+	trace_xfs_buf_init(bp, _RET_IP_);
 }
 
 /*
@@ -318,6 +245,7 @@ _xfs_buf_free_pages(
 {
 	if (bp->b_pages != bp->b_page_array) {
 		kmem_free(bp->b_pages);
+		bp->b_pages = NULL;
 	}
 }
 
@@ -332,15 +260,16 @@ void
 xfs_buf_free(
 	xfs_buf_t		*bp)
 {
-	XB_TRACE(bp, "free", 0);
+	trace_xfs_buf_free(bp, _RET_IP_);
 
 	ASSERT(list_empty(&bp->b_hash_list));
 
 	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
 		uint		i;
 
-		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-			free_address(bp->b_addr - bp->b_offset);
+		if (xfs_buf_is_vmapped(bp))
+			vm_unmap_ram(bp->b_addr - bp->b_offset,
+					bp->b_page_count);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -349,9 +278,8 @@ xfs_buf_free(
 			ASSERT(!PagePrivate(page));
 			page_cache_release(page);
 		}
-		_xfs_buf_free_pages(bp);
 	}
-
+	_xfs_buf_free_pages(bp);
 	xfs_buf_deallocate(bp);
 }
 
@@ -445,7 +373,6 @@ _xfs_buf_lookup_pages(
 	if (page_count == bp->b_page_count)
 		bp->b_flags |= XBF_DONE;
 
-	XB_TRACE(bp, "lookup_pages", (long)page_count);
 	return error;
 }
 
@@ -462,10 +389,8 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-		if (as_list_len > 64)
-			purge_addresses();
-		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
-					VM_MAP, PAGE_KERNEL);
+		bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+					-1, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
 			return -ENOMEM;
 		bp->b_addr += bp->b_offset;
@@ -548,7 +473,6 @@ found:
 	if (down_trylock(&bp->b_sema)) {
 		if (!(flags & XBF_TRYLOCK)) {
 			/* wait for buffer ownership */
-			XB_TRACE(bp, "get_lock", 0);
 			xfs_buf_lock(bp);
 			XFS_STATS_INC(xb_get_locked_waited);
 		} else {
@@ -571,7 +495,8 @@ found:
 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
 		bp->b_flags &= XBF_MAPPED;
 	}
-	XB_TRACE(bp, "got_lock", 0);
+
+	trace_xfs_buf_find(bp, flags, _RET_IP_);
 	XFS_STATS_INC(xb_get_locked);
 	return bp;
 }
@@ -582,7 +507,7 @@ found:
  * although backing storage may not be.
  */
 xfs_buf_t *
-xfs_buf_get_flags(
+xfs_buf_get(
 	xfs_buftarg_t		*target,/* target for buffer */
 	xfs_off_t		ioff,	/* starting offset of range */
 	size_t			isize,	/* length of range */
@@ -627,7 +552,7 @@ xfs_buf_get_flags(
 	bp->b_bn = ioff;
 	bp->b_count_desired = bp->b_buffer_length;
 
-	XB_TRACE(bp, "get", (unsigned long)flags);
+	trace_xfs_buf_get(bp, flags, _RET_IP_);
 	return bp;
 
  no_buffer:
@@ -644,8 +569,6 @@ _xfs_buf_read(
 {
 	int			status;
 
-	XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
-
 	ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
 	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
 
@@ -661,7 +584,7 @@ _xfs_buf_read(
 }
 
 xfs_buf_t *
-xfs_buf_read_flags(
+xfs_buf_read(
 	xfs_buftarg_t		*target,
 	xfs_off_t		ioff,
 	size_t			isize,
@@ -671,21 +594,20 @@ xfs_buf_read_flags(
 
 	flags |= XBF_READ;
 
-	bp = xfs_buf_get_flags(target, ioff, isize, flags);
+	bp = xfs_buf_get(target, ioff, isize, flags);
 	if (bp) {
+		trace_xfs_buf_read(bp, flags, _RET_IP_);
+
 		if (!XFS_BUF_ISDONE(bp)) {
-			XB_TRACE(bp, "read", (unsigned long)flags);
 			XFS_STATS_INC(xb_get_read);
 			_xfs_buf_read(bp, flags);
 		} else if (flags & XBF_ASYNC) {
-			XB_TRACE(bp, "read_async", (unsigned long)flags);
 			/*
 			 * Read ahead call which is already satisfied,
 			 * drop the buffer
 			 */
 			goto no_buffer;
 		} else {
-			XB_TRACE(bp, "read_done", (unsigned long)flags);
 			/* We do not want read in the flags */
 			bp->b_flags &= ~XBF_READ;
 		}
@@ -718,7 +640,7 @@ xfs_buf_readahead(
 		return;
 
 	flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
-	xfs_buf_read_flags(target, ioff, isize, flags);
+	xfs_buf_read(target, ioff, isize, flags);
 }
 
 xfs_buf_t *
@@ -823,7 +745,7 @@ xfs_buf_get_noaddr(
 
 	xfs_buf_unlock(bp);
 
-	XB_TRACE(bp, "no_daddr", len);
+	trace_xfs_buf_get_noaddr(bp, _RET_IP_);
 	return bp;
 
  fail_free_mem:
@@ -845,8 +767,8 @@ void
 xfs_buf_hold(
 	xfs_buf_t		*bp)
 {
+	trace_xfs_buf_hold(bp, _RET_IP_);
 	atomic_inc(&bp->b_hold);
-	XB_TRACE(bp, "hold", 0);
 }
 
 /*
@@ -859,7 +781,7 @@ xfs_buf_rele(
 {
 	xfs_bufhash_t		*hash = bp->b_hash;
 
-	XB_TRACE(bp, "rele", bp->b_relse);
+	trace_xfs_buf_rele(bp, _RET_IP_);
 
 	if (unlikely(!hash)) {
 		ASSERT(!bp->b_relse);
@@ -909,21 +831,19 @@ xfs_buf_cond_lock(
 	int			locked;
 
 	locked = down_trylock(&bp->b_sema) == 0;
-	if (locked) {
+	if (locked)
 		XB_SET_OWNER(bp);
-	}
-	XB_TRACE(bp, "cond_lock", (long)locked);
+
+	trace_xfs_buf_cond_lock(bp, _RET_IP_);
 	return locked ? 0 : -EBUSY;
 }
 
-#if defined(DEBUG) || defined(XFS_BLI_TRACE)
 int
 xfs_buf_lock_value(
 	xfs_buf_t		*bp)
 {
 	return bp->b_sema.count;
 }
-#endif
 
 /*
  * Locks a buffer object.
@@ -935,12 +855,14 @@ void
 xfs_buf_lock(
 	xfs_buf_t		*bp)
 {
-	XB_TRACE(bp, "lock", 0);
+	trace_xfs_buf_lock(bp, _RET_IP_);
+
 	if (atomic_read(&bp->b_io_remaining))
 		blk_run_address_space(bp->b_target->bt_mapping);
 	down(&bp->b_sema);
 	XB_SET_OWNER(bp);
-	XB_TRACE(bp, "locked", 0);
+
+	trace_xfs_buf_lock_done(bp, _RET_IP_);
 }
 
 /*
@@ -962,7 +884,8 @@ xfs_buf_unlock(
 
 	XB_CLEAR_OWNER(bp);
 	up(&bp->b_sema);
-	XB_TRACE(bp, "unlock", 0);
+
+	trace_xfs_buf_unlock(bp, _RET_IP_);
 }
 
 
@@ -974,17 +897,18 @@ void
 xfs_buf_pin(
 	xfs_buf_t		*bp)
 {
+	trace_xfs_buf_pin(bp, _RET_IP_);
 	atomic_inc(&bp->b_pin_count);
-	XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
 }
 
 void
 xfs_buf_unpin(
 	xfs_buf_t		*bp)
 {
+	trace_xfs_buf_unpin(bp, _RET_IP_);
+
 	if (atomic_dec_and_test(&bp->b_pin_count))
 		wake_up_all(&bp->b_waiters);
-	XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
 }
 
 int
@@ -1035,7 +959,7 @@ xfs_buf_iodone_work(
 	 */
 	if ((bp->b_error == EOPNOTSUPP) &&
 	    (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
-		XB_TRACE(bp, "ordered_retry", bp->b_iodone);
+		trace_xfs_buf_ordered_retry(bp, _RET_IP_);
 		bp->b_flags &= ~XBF_ORDERED;
 		bp->b_flags |= _XFS_BARRIER_FAILED;
 		xfs_buf_iorequest(bp);
@@ -1050,12 +974,12 @@ xfs_buf_ioend(
 	xfs_buf_t		*bp,
 	int			schedule)
 {
+	trace_xfs_buf_iodone(bp, _RET_IP_);
+
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 	if (bp->b_error == 0)
 		bp->b_flags |= XBF_DONE;
 
-	XB_TRACE(bp, "iodone", bp->b_iodone);
-
 	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
 		if (schedule) {
 			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
@@ -1075,26 +999,34 @@ xfs_buf_ioerror(
 {
 	ASSERT(error >= 0 && error <= 0xffff);
 	bp->b_error = (unsigned short)error;
-	XB_TRACE(bp, "ioerror", (unsigned long)error);
+	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
 }
 
 int
-xfs_bawrite(
-	void			*mp,
+xfs_bwrite(
+	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
-	XB_TRACE(bp, "bawrite", 0);
+	int			iowait = (bp->b_flags & XBF_ASYNC) == 0;
+	int			error = 0;
 
-	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+	bp->b_strat = xfs_bdstrat_cb;
+	bp->b_mount = mp;
+	bp->b_flags |= XBF_WRITE;
+	if (!iowait)
+		bp->b_flags |= _XBF_RUN_QUEUES;
 
 	xfs_buf_delwri_dequeue(bp);
+	xfs_buf_iostrategy(bp);
 
-	bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
-	bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
+	if (iowait) {
+		error = xfs_buf_iowait(bp);
+		if (error)
+			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+		xfs_buf_relse(bp);
+	}
 
-	bp->b_mount = mp;
-	bp->b_strat = xfs_bdstrat_cb;
-	return xfs_bdstrat_cb(bp);
+	return error;
 }
 
 void
@@ -1102,7 +1034,7 @@ xfs_bdwrite(
 	void			*mp,
 	struct xfs_buf		*bp)
 {
-	XB_TRACE(bp, "bdwrite", 0);
+	trace_xfs_buf_bdwrite(bp, _RET_IP_);
 
 	bp->b_strat = xfs_bdstrat_cb;
 	bp->b_mount = mp;
@@ -1113,7 +1045,127 @@ xfs_bdwrite(
 	xfs_buf_delwri_queue(bp, 1);
 }
 
-STATIC_INLINE void
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call biodone
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+	xfs_buf_t *bp)
+{
+#ifdef XFSERRORDEBUG
+	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+	/*
+	 * No need to wait until the buffer is unpinned, we aren't flushing it.
+	 */
+	XFS_BUF_ERROR(bp, EIO);
+
+	/*
+	 * We're calling biodone, so delete XBF_DONE flag.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_UNDONE(bp);
+	XFS_BUF_STALE(bp);
+
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	xfs_biodone(bp);
+
+	return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the biodone call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+STATIC int
+xfs_bioerror_relse(
+	struct xfs_buf	*bp)
+{
+	int64_t		fl = XFS_BUF_BFLAGS(bp);
+	/*
+	 * No need to wait until the buffer is unpinned.
+	 * We aren't flushing it.
+	 *
+	 * chunkhold expects B_DONE to be set, whether
+	 * we actually finish the I/O or not. We don't want to
+	 * change that interface.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_STALE(bp);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	if (!(fl & XBF_ASYNC)) {
+		/*
+		 * Mark b_error and B_ERROR _both_.
+		 * Lot's of chunkcache code assumes that.
+		 * There's no reason to mark error for
+		 * ASYNC buffers.
+		 */
+		XFS_BUF_ERROR(bp, EIO);
+		XFS_BUF_FINISH_IOWAIT(bp);
+	} else {
+		xfs_buf_relse(bp);
+	}
+
+	return EIO;
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(
+	struct xfs_buf	*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		/*
+		 * Metadata write that didn't get logged but
+		 * written delayed anyway. These aren't associated
+		 * with a transaction, and can be ignored.
+		 */
+		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+			return xfs_bioerror_relse(bp);
+		else
+			return xfs_bioerror(bp);
+	}
+
+	xfs_buf_iorequest(bp);
+	return 0;
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem.  Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
+ */
+void
+xfsbdstrat(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		xfs_bioerror_relse(bp);
+		return;
+	}
+
+	xfs_buf_iorequest(bp);
+}
+
+STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t		*bp,
 	int			schedule)
@@ -1135,6 +1187,9 @@ xfs_buf_bio_end_io(
 
 	xfs_buf_ioerror(bp, -error);
 
+	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
+
 	do {
 		struct page	*page = bvec->bv_page;
 
@@ -1177,10 +1232,14 @@ _xfs_buf_ioapply(
 	if (bp->b_flags & XBF_ORDERED) {
 		ASSERT(!(bp->b_flags & XBF_READ));
 		rw = WRITE_BARRIER;
-	} else if (bp->b_flags & _XBF_RUN_QUEUES) {
+	} else if (bp->b_flags & XBF_LOG_BUFFER) {
 		ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
 		bp->b_flags &= ~_XBF_RUN_QUEUES;
 		rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
+	} else if (bp->b_flags & _XBF_RUN_QUEUES) {
+		ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
+		bp->b_flags &= ~_XBF_RUN_QUEUES;
+		rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META;
 	} else {
 		rw = (bp->b_flags & XBF_WRITE) ? WRITE :
 		     (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
@@ -1240,6 +1299,10 @@ next_chunk:
 
 submit_io:
 	if (likely(bio->bi_size)) {
+		if (xfs_buf_is_vmapped(bp)) {
+			flush_kernel_vmap_range(bp->b_addr,
+						xfs_buf_vmap_len(bp));
+		}
 		submit_bio(rw, bio);
 		if (size)
 			goto next_chunk;
@@ -1253,7 +1316,7 @@ int
 xfs_buf_iorequest(
 	xfs_buf_t		*bp)
 {
-	XB_TRACE(bp, "iorequest", 0);
+	trace_xfs_buf_iorequest(bp, _RET_IP_);
 
 	if (bp->b_flags & XBF_DELWRI) {
 		xfs_buf_delwri_queue(bp, 1);
@@ -1287,11 +1350,13 @@ int
 xfs_buf_iowait(
 	xfs_buf_t		*bp)
 {
-	XB_TRACE(bp, "iowait", 0);
+	trace_xfs_buf_iowait(bp, _RET_IP_);
+
 	if (atomic_read(&bp->b_io_remaining))
 		blk_run_address_space(bp->b_target->bt_mapping);
 	wait_for_completion(&bp->b_iowait);
-	XB_TRACE(bp, "iowaited", (long)bp->b_error);
+
+	trace_xfs_buf_iowait_done(bp, _RET_IP_);
 	return bp->b_error;
 }
 
@@ -1318,7 +1383,7 @@ xfs_buf_iomove(
 	xfs_buf_t		*bp,	/* buffer to process		*/
 	size_t			boff,	/* starting buffer offset	*/
 	size_t			bsize,	/* length to copy		*/
-	caddr_t			data,	/* data address			*/
+	void			*data,	/* data address			*/
 	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
 {
 	size_t			bend, cpoff, csize;
@@ -1400,8 +1465,8 @@ xfs_alloc_bufhash(
 
 	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
 	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
-	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
-					sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
+	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
+					 sizeof(xfs_bufhash_t));
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
 		spin_lock_init(&btp->bt_hash[i].bh_lock);
 		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1412,7 +1477,7 @@ STATIC void
 xfs_free_bufhash(
 	xfs_buftarg_t		*btp)
 {
-	kmem_free(btp->bt_hash);
+	kmem_free_large(btp->bt_hash);
 	btp->bt_hash = NULL;
 }
 
@@ -1604,7 +1669,8 @@ xfs_buf_delwri_queue(
 	struct list_head	*dwq = &bp->b_target->bt_delwrite_queue;
 	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
 
-	XB_TRACE(bp, "delwri_q", (long)unlock);
+	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
+
 	ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
 
 	spin_lock(dwlk);
@@ -1616,6 +1682,11 @@ xfs_buf_delwri_queue(
 		list_del(&bp->b_list);
 	}
 
+	if (list_empty(dwq)) {
+		/* start xfsbufd as it is about to have something to do */
+		wake_up_process(bp->b_target->bt_task);
+	}
+
 	bp->b_flags |= _XBF_DELWRI_Q;
 	list_add_tail(&bp->b_list, dwq);
 	bp->b_queuetime = jiffies;
@@ -1644,7 +1715,36 @@ xfs_buf_delwri_dequeue(
 	if (dequeued)
 		xfs_buf_rele(bp);
 
-	XB_TRACE(bp, "delwri_dq", (long)dequeued);
+	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
+}
+
+/*
+ * If a delwri buffer needs to be pushed before it has aged out, then promote
+ * it to the head of the delwri queue so that it will be flushed on the next
+ * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
+ * than the age currently needed to flush the buffer. Hence the next time the
+ * xfsbufd sees it is guaranteed to be considered old enough to flush.
+ */
+void
+xfs_buf_delwri_promote(
+	struct xfs_buf	*bp)
+{
+	struct xfs_buftarg *btp = bp->b_target;
+	long		age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
+
+	ASSERT(bp->b_flags & XBF_DELWRI);
+	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+	/*
+	 * Check the buffer age before locking the delayed write queue as we
+	 * don't need to promote buffers that are already past the flush age.
+	 */
+	if (bp->b_queuetime < jiffies - age)
+		return;
+	bp->b_queuetime = jiffies - age;
+	spin_lock(&btp->bt_delwrite_lock);
+	list_move(&bp->b_list, &btp->bt_delwrite_queue);
+	spin_unlock(&btp->bt_delwrite_lock);
 }
 
 STATIC void
@@ -1665,6 +1765,8 @@ xfsbufd_wakeup(
 	list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
 		if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
 			continue;
+		if (list_empty(&btp->bt_delwrite_queue))
+			continue;
 		set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
 		wake_up_process(btp->bt_task);
 	}
@@ -1692,7 +1794,7 @@ xfs_buf_delwri_split(
 	INIT_LIST_HEAD(list);
 	spin_lock(dwlk);
 	list_for_each_entry_safe(bp, n, dwq, b_list) {
-		XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
+		trace_xfs_buf_delwri_split(bp, _RET_IP_);
 		ASSERT(bp->b_flags & XBF_DELWRI);
 
 		if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1715,20 +1817,53 @@ xfs_buf_delwri_split(
 
 }
 
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+	void		*priv,
+	struct list_head *a,
+	struct list_head *b)
+{
+	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
+	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
+	xfs_daddr_t		diff;
+
+	diff = ap->b_bn - bp->b_bn;
+	if (diff < 0)
+		return -1;
+	if (diff > 0)
+		return 1;
+	return 0;
+}
+
+void
+xfs_buf_delwri_sort(
+	xfs_buftarg_t	*target,
+	struct list_head *list)
+{
+	list_sort(NULL, list, xfs_buf_cmp);
+}
+
 STATIC int
 xfsbufd(
 	void		*data)
 {
-	struct list_head tmp;
-	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
-	int		count;
-	xfs_buf_t	*bp;
+	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
 
 	current->flags |= PF_MEMALLOC;
 
 	set_freezable();
 
 	do {
+		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
+		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+		int	count = 0;
+		struct list_head tmp;
+
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 			refrigerator();
@@ -1736,24 +1871,20 @@ xfsbufd(
 			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 		}
 
-		schedule_timeout_interruptible(
-			xfs_buf_timer_centisecs * msecs_to_jiffies(10));
+		/* sleep for a long time if there is nothing to do. */
+		if (list_empty(&target->bt_delwrite_queue))
+			tout = MAX_SCHEDULE_TIMEOUT;
+		schedule_timeout_interruptible(tout);
 
-		xfs_buf_delwri_split(target, &tmp,
-				xfs_buf_age_centisecs * msecs_to_jiffies(10));
-
-		count = 0;
+		xfs_buf_delwri_split(target, &tmp, age);
+		list_sort(NULL, &tmp, xfs_buf_cmp);
 		while (!list_empty(&tmp)) {
-			bp = list_entry(tmp.next, xfs_buf_t, b_list);
-			ASSERT(target == bp->b_target);
-
+			struct xfs_buf *bp;
+			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
 			list_del_init(&bp->b_list);
 			xfs_buf_iostrategy(bp);
 			count++;
 		}
-
-		if (as_list_len > 0)
-			purge_addresses();
 		if (count)
 			blk_run_address_space(target->bt_mapping);
 
@@ -1772,42 +1903,45 @@ xfs_flush_buftarg(
 	xfs_buftarg_t	*target,
 	int		wait)
 {
-	struct list_head tmp;
-	xfs_buf_t	*bp, *n;
+	xfs_buf_t	*bp;
 	int		pincount = 0;
+	LIST_HEAD(tmp_list);
+	LIST_HEAD(wait_list);
 
 	xfs_buf_runall_queues(xfsconvertd_workqueue);
 	xfs_buf_runall_queues(xfsdatad_workqueue);
 	xfs_buf_runall_queues(xfslogd_workqueue);
 
 	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-	pincount = xfs_buf_delwri_split(target, &tmp, 0);
+	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
 
 	/*
-	 * Dropped the delayed write list lock, now walk the temporary list
+	 * Dropped the delayed write list lock, now walk the temporary list.
+	 * All I/O is issued async and then if we need to wait for completion
+	 * we do that after issuing all the IO.
 	 */
-	list_for_each_entry_safe(bp, n, &tmp, b_list) {
+	list_sort(NULL, &tmp_list, xfs_buf_cmp);
+	while (!list_empty(&tmp_list)) {
+		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
 		ASSERT(target == bp->b_target);
-		if (wait)
+		list_del_init(&bp->b_list);
+		if (wait) {
 			bp->b_flags &= ~XBF_ASYNC;
-		else
-			list_del_init(&bp->b_list);
-
+			list_add(&bp->b_list, &wait_list);
+		}
 		xfs_buf_iostrategy(bp);
 	}
 
-	if (wait)
+	if (wait) {
+		/* Expedite and wait for IO to complete. */
 		blk_run_address_space(target->bt_mapping);
+		while (!list_empty(&wait_list)) {
+			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
-	/*
-	 * Remaining list items must be flushed before returning
-	 */
-	while (!list_empty(&tmp)) {
-		bp = list_entry(tmp.next, xfs_buf_t, b_list);
-
-		list_del_init(&bp->b_list);
-		xfs_iowait(bp);
-		xfs_buf_relse(bp);
+			list_del_init(&bp->b_list);
+			xfs_iowait(bp);
+			xfs_buf_relse(bp);
+		}
 	}
 
 	return pincount;
@@ -1816,14 +1950,10 @@ xfs_flush_buftarg(
 int __init
 xfs_buf_init(void)
 {
-#ifdef XFS_BUF_TRACE
-	xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
-#endif
-
 	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
 						KM_ZONE_HWALIGN, NULL);
 	if (!xfs_buf_zone)
-		goto out_free_trace_buf;
+		goto out;
 
 	xfslogd_workqueue = create_workqueue("xfslogd");
 	if (!xfslogd_workqueue)
@@ -1846,10 +1976,7 @@ xfs_buf_init(void)
 	destroy_workqueue(xfslogd_workqueue);
  out_free_buf_zone:
 	kmem_zone_destroy(xfs_buf_zone);
- out_free_trace_buf:
-#ifdef XFS_BUF_TRACE
-	ktrace_free(xfs_buf_trace_buf);
-#endif
+ out:
 	return -ENOMEM;
 }
 
@@ -1861,9 +1988,6 @@ xfs_buf_terminate(void)
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);
 	kmem_zone_destroy(xfs_buf_zone);
-#ifdef XFS_BUF_TRACE
-	ktrace_free(xfs_buf_trace_buf);
-#endif
 }
 
 #ifdef CONFIG_KDB_MODULES