author    Dave Chinner <dchinner@redhat.com>  2011-03-25 18:16:45 -0400
committer Dave Chinner <david@fromorbit.com>  2011-03-25 18:16:45 -0400
commit    0e6e847ffe37436e331c132639f9f872febce82e (patch)
tree      eb440ef910af695eafef787e12badc64fac0f8fe  /fs
parent    704b2907c2d47ceb187c0e25a6bbc2174b198f2f (diff)
xfs: stop using the page cache to back the buffer cache
Now that the buffer cache has its own LRU, we do not need to use the page cache to provide persistent caching and reclaim infrastructure. Convert the buffer cache to use alloc_pages() instead of the page cache. This removes all the overhead of page cache management from the setup and teardown of buffers, as well as the need to mark pages accessed as we find buffers in the buffer cache.

By avoiding the page cache, we also remove the need to keep state in the page_private(page) field for persistent storage across buffer free/buffer rebuild, so all of that code can be removed. This also fixes the long-standing problem of not having enough bits in the page_private field to track all the state needed for a 512 byte sector/64k page setup.

It also removes the need for page locking during reads, as the pages are unique to the buffer and nobody else will be attempting to access them. Finally, it removes the buftarg address space lock as a point of global contention on workloads that allocate and free buffers quickly, such as when creating or removing large numbers of inodes in parallel.

This removes the 16TB limit on filesystem size on 32 bit machines, as the page index (32 bit) is no longer used for lookups of metadata buffers - the buffer cache is now solely indexed by disk address, which is stored in a 64 bit field in the buffer.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
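To make the new allocation policy concrete, here is a minimal userspace C sketch of the strategy described above - not the kernel code itself. malloc() and aligned_alloc() stand in for kmem_alloc() and alloc_page(), and struct demo_buf, the demo_* helpers and DEMO_PAGE_SIZE are invented names for the illustration. Buffers smaller than a page are backed straight by the heap (what the patch tags _XBF_KMEM); anything larger falls back to a loop of order-0, page-sized allocations (_XBF_PAGES), with no page-cache radix tree, no page locking and no page_private() state involved.

/*
 * Illustrative sketch only: malloc() stands in for kmem_alloc(), and
 * aligned_alloc() for alloc_page(); demo_buf and the demo_* helpers are
 * invented names, not the kernel's xfs_buf_t API.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_PAGE_SIZE 4096u

struct demo_buf {
        size_t  length;         /* requested buffer size in bytes */
        void    *heap_addr;     /* set when the buffer is heap backed */
        void    **pages;        /* order-0 "pages" for larger buffers */
        size_t  page_count;
};

static int demo_alloc_buf_memory(struct demo_buf *bp, size_t len)
{
        memset(bp, 0, sizeof(*bp));
        bp->length = len;

        /* Sub-page buffers come straight from the heap... */
        if (len < DEMO_PAGE_SIZE) {
                bp->heap_addr = malloc(len);
                if (bp->heap_addr)
                        return 0;
                /* ...falling through to page allocation on failure. */
        }

        /* Larger buffers: one order-0, page-sized allocation per page. */
        bp->page_count = (len + DEMO_PAGE_SIZE - 1) / DEMO_PAGE_SIZE;
        bp->pages = calloc(bp->page_count, sizeof(void *));
        if (!bp->pages)
                return -1;
        for (size_t i = 0; i < bp->page_count; i++) {
                bp->pages[i] = aligned_alloc(DEMO_PAGE_SIZE, DEMO_PAGE_SIZE);
                if (!bp->pages[i]) {
                        while (i--)
                                free(bp->pages[i]);
                        free(bp->pages);
                        bp->pages = NULL;
                        return -1;
                }
        }
        return 0;
}

static void demo_free_buf_memory(struct demo_buf *bp)
{
        if (bp->heap_addr) {
                free(bp->heap_addr);
        } else if (bp->pages) {
                for (size_t i = 0; i < bp->page_count; i++)
                        free(bp->pages[i]);
                free(bp->pages);
        }
        memset(bp, 0, sizeof(*bp));
}

int main(void)
{
        struct demo_buf small, large;

        demo_alloc_buf_memory(&small, 512);     /* heap backed, like _XBF_KMEM */
        demo_alloc_buf_memory(&large, 16384);   /* page backed, like _XBF_PAGES */
        printf("small: heap=%p page_count=%zu\n", small.heap_addr, small.page_count);
        printf("large: heap=%p page_count=%zu\n", large.heap_addr, large.page_count);
        demo_free_buf_memory(&small);
        demo_free_buf_memory(&large);
        return 0;
}

The kernel version additionally falls back to page allocation when a heap allocation happens to straddle a page boundary, as the xfs_buf_allocate_memory() hunk in the diff below shows.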
Diffstat (limited to 'fs')
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c  341
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h   40
2 files changed, 84 insertions(+), 297 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index a5a260fab824..d45b2cdee6c4 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -94,75 +94,6 @@ xfs_buf_vmap_len(
94} 94}
95 95
96/* 96/*
97 * Page Region interfaces.
98 *
99 * For pages in filesystems where the blocksize is smaller than the
100 * pagesize, we use the page->private field (long) to hold a bitmap
101 * of uptodate regions within the page.
102 *
103 * Each such region is "bytes per page / bits per long" bytes long.
104 *
105 * NBPPR == number-of-bytes-per-page-region
106 * BTOPR == bytes-to-page-region (rounded up)
107 * BTOPRT == bytes-to-page-region-truncated (rounded down)
108 */
109#if (BITS_PER_LONG == 32)
110#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
111#elif (BITS_PER_LONG == 64)
112#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
113#else
114#error BITS_PER_LONG must be 32 or 64
115#endif
116#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
117#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
118#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
119
120STATIC unsigned long
121page_region_mask(
122 size_t offset,
123 size_t length)
124{
125 unsigned long mask;
126 int first, final;
127
128 first = BTOPR(offset);
129 final = BTOPRT(offset + length - 1);
130 first = min(first, final);
131
132 mask = ~0UL;
133 mask <<= BITS_PER_LONG - (final - first);
134 mask >>= BITS_PER_LONG - (final);
135
136 ASSERT(offset + length <= PAGE_CACHE_SIZE);
137 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
138
139 return mask;
140}
141
142STATIC void
143set_page_region(
144 struct page *page,
145 size_t offset,
146 size_t length)
147{
148 set_page_private(page,
149 page_private(page) | page_region_mask(offset, length));
150 if (page_private(page) == ~0UL)
151 SetPageUptodate(page);
152}
153
154STATIC int
155test_page_region(
156 struct page *page,
157 size_t offset,
158 size_t length)
159{
160 unsigned long mask = page_region_mask(offset, length);
161
162 return (mask && (page_private(page) & mask) == mask);
163}
164
165/*
166 * xfs_buf_lru_add - add a buffer to the LRU. 97 * xfs_buf_lru_add - add a buffer to the LRU.
167 * 98 *
168 * The LRU takes a new reference to the buffer so that it will only be freed 99 * The LRU takes a new reference to the buffer so that it will only be freed
@@ -332,7 +263,7 @@ xfs_buf_free(
332 263
333 ASSERT(list_empty(&bp->b_lru)); 264 ASSERT(list_empty(&bp->b_lru));
334 265
335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 266 if (bp->b_flags & _XBF_PAGES) {
336 uint i; 267 uint i;
337 268
338 if (xfs_buf_is_vmapped(bp)) 269 if (xfs_buf_is_vmapped(bp))
@@ -342,25 +273,22 @@ xfs_buf_free(
342 for (i = 0; i < bp->b_page_count; i++) { 273 for (i = 0; i < bp->b_page_count; i++) {
343 struct page *page = bp->b_pages[i]; 274 struct page *page = bp->b_pages[i];
344 275
345 if (bp->b_flags & _XBF_PAGE_CACHE) 276 __free_page(page);
346 ASSERT(!PagePrivate(page));
347 page_cache_release(page);
348 } 277 }
349 } 278 } else if (bp->b_flags & _XBF_KMEM)
279 kmem_free(bp->b_addr);
350 _xfs_buf_free_pages(bp); 280 _xfs_buf_free_pages(bp);
351 xfs_buf_deallocate(bp); 281 xfs_buf_deallocate(bp);
352} 282}
353 283
354/* 284/*
355 * Finds all pages for buffer in question and builds it's page list. 285 * Allocates all the pages for buffer in question and builds it's page list.
356 */ 286 */
357STATIC int 287STATIC int
358_xfs_buf_lookup_pages( 288xfs_buf_allocate_memory(
359 xfs_buf_t *bp, 289 xfs_buf_t *bp,
360 uint flags) 290 uint flags)
361{ 291{
362 struct address_space *mapping = bp->b_target->bt_mapping;
363 size_t blocksize = bp->b_target->bt_bsize;
364 size_t size = bp->b_count_desired; 292 size_t size = bp->b_count_desired;
365 size_t nbytes, offset; 293 size_t nbytes, offset;
366 gfp_t gfp_mask = xb_to_gfp(flags); 294 gfp_t gfp_mask = xb_to_gfp(flags);
@@ -369,29 +297,55 @@ _xfs_buf_lookup_pages(
369 xfs_off_t end; 297 xfs_off_t end;
370 int error; 298 int error;
371 299
300 /*
301 * for buffers that are contained within a single page, just allocate
302 * the memory from the heap - there's no need for the complexity of
303 * page arrays to keep allocation down to order 0.
304 */
305 if (bp->b_buffer_length < PAGE_SIZE) {
306 bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
307 if (!bp->b_addr) {
308 /* low memory - use alloc_page loop instead */
309 goto use_alloc_page;
310 }
311
312 if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
313 PAGE_MASK) !=
314 ((unsigned long)bp->b_addr & PAGE_MASK)) {
315 /* b_addr spans two pages - use alloc_page instead */
316 kmem_free(bp->b_addr);
317 bp->b_addr = NULL;
318 goto use_alloc_page;
319 }
320 bp->b_offset = offset_in_page(bp->b_addr);
321 bp->b_pages = bp->b_page_array;
322 bp->b_pages[0] = virt_to_page(bp->b_addr);
323 bp->b_page_count = 1;
324 bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
325 return 0;
326 }
327
328use_alloc_page:
372 end = bp->b_file_offset + bp->b_buffer_length; 329 end = bp->b_file_offset + bp->b_buffer_length;
373 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 330 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
374
375 error = _xfs_buf_get_pages(bp, page_count, flags); 331 error = _xfs_buf_get_pages(bp, page_count, flags);
376 if (unlikely(error)) 332 if (unlikely(error))
377 return error; 333 return error;
378 bp->b_flags |= _XBF_PAGE_CACHE;
379 334
380 offset = bp->b_offset; 335 offset = bp->b_offset;
381 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 336 first = bp->b_file_offset >> PAGE_SHIFT;
337 bp->b_flags |= _XBF_PAGES;
382 338
383 for (i = 0; i < bp->b_page_count; i++) { 339 for (i = 0; i < bp->b_page_count; i++) {
384 struct page *page; 340 struct page *page;
385 uint retries = 0; 341 uint retries = 0;
386 342retry:
387 retry: 343 page = alloc_page(gfp_mask);
388 page = find_or_create_page(mapping, first + i, gfp_mask);
389 if (unlikely(page == NULL)) { 344 if (unlikely(page == NULL)) {
390 if (flags & XBF_READ_AHEAD) { 345 if (flags & XBF_READ_AHEAD) {
391 bp->b_page_count = i; 346 bp->b_page_count = i;
392 for (i = 0; i < bp->b_page_count; i++) 347 error = ENOMEM;
393 unlock_page(bp->b_pages[i]); 348 goto out_free_pages;
394 return -ENOMEM;
395 } 349 }
396 350
397 /* 351 /*
@@ -412,33 +366,16 @@ _xfs_buf_lookup_pages(
412 366
413 XFS_STATS_INC(xb_page_found); 367 XFS_STATS_INC(xb_page_found);
414 368
415 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 369 nbytes = min_t(size_t, size, PAGE_SIZE - offset);
416 size -= nbytes; 370 size -= nbytes;
417
418 ASSERT(!PagePrivate(page));
419 if (!PageUptodate(page)) {
420 page_count--;
421 if (blocksize >= PAGE_CACHE_SIZE) {
422 if (flags & XBF_READ)
423 bp->b_flags |= _XBF_PAGE_LOCKED;
424 } else if (!PagePrivate(page)) {
425 if (test_page_region(page, offset, nbytes))
426 page_count++;
427 }
428 }
429
430 bp->b_pages[i] = page; 371 bp->b_pages[i] = page;
431 offset = 0; 372 offset = 0;
432 } 373 }
374 return 0;
433 375
434 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { 376out_free_pages:
435 for (i = 0; i < bp->b_page_count; i++) 377 for (i = 0; i < bp->b_page_count; i++)
436 unlock_page(bp->b_pages[i]); 378 __free_page(bp->b_pages[i]);
437 }
438
439 if (page_count == bp->b_page_count)
440 bp->b_flags |= XBF_DONE;
441
442 return error; 379 return error;
443} 380}
444 381
@@ -450,8 +387,9 @@ _xfs_buf_map_pages(
450 xfs_buf_t *bp, 387 xfs_buf_t *bp,
451 uint flags) 388 uint flags)
452{ 389{
453 /* A single page buffer is always mappable */ 390 ASSERT(bp->b_flags & _XBF_PAGES);
454 if (bp->b_page_count == 1) { 391 if (bp->b_page_count == 1) {
392 /* A single page buffer is always mappable */
455 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 393 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
456 bp->b_flags |= XBF_MAPPED; 394 bp->b_flags |= XBF_MAPPED;
457 } else if (flags & XBF_MAPPED) { 395 } else if (flags & XBF_MAPPED) {
@@ -576,9 +514,14 @@ found:
576 } 514 }
577 } 515 }
578 516
517 /*
518 * if the buffer is stale, clear all the external state associated with
519 * it. We need to keep flags such as how we allocated the buffer memory
520 * intact here.
521 */
579 if (bp->b_flags & XBF_STALE) { 522 if (bp->b_flags & XBF_STALE) {
580 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 523 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
581 bp->b_flags &= XBF_MAPPED; 524 bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
582 } 525 }
583 526
584 trace_xfs_buf_find(bp, flags, _RET_IP_); 527 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -599,7 +542,7 @@ xfs_buf_get(
599 xfs_buf_flags_t flags) 542 xfs_buf_flags_t flags)
600{ 543{
601 xfs_buf_t *bp, *new_bp; 544 xfs_buf_t *bp, *new_bp;
602 int error = 0, i; 545 int error = 0;
603 546
604 new_bp = xfs_buf_allocate(flags); 547 new_bp = xfs_buf_allocate(flags);
605 if (unlikely(!new_bp)) 548 if (unlikely(!new_bp))
@@ -607,7 +550,7 @@ xfs_buf_get(
607 550
608 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 551 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
609 if (bp == new_bp) { 552 if (bp == new_bp) {
610 error = _xfs_buf_lookup_pages(bp, flags); 553 error = xfs_buf_allocate_memory(bp, flags);
611 if (error) 554 if (error)
612 goto no_buffer; 555 goto no_buffer;
613 } else { 556 } else {
@@ -616,9 +559,6 @@ xfs_buf_get(
616 return NULL; 559 return NULL;
617 } 560 }
618 561
619 for (i = 0; i < bp->b_page_count; i++)
620 mark_page_accessed(bp->b_pages[i]);
621
622 if (!(bp->b_flags & XBF_MAPPED)) { 562 if (!(bp->b_flags & XBF_MAPPED)) {
623 error = _xfs_buf_map_pages(bp, flags); 563 error = _xfs_buf_map_pages(bp, flags);
624 if (unlikely(error)) { 564 if (unlikely(error)) {
@@ -719,8 +659,7 @@ xfs_buf_readahead(
719{ 659{
720 struct backing_dev_info *bdi; 660 struct backing_dev_info *bdi;
721 661
722 bdi = target->bt_mapping->backing_dev_info; 662 if (bdi_read_congested(target->bt_bdi))
723 if (bdi_read_congested(bdi))
724 return; 663 return;
725 664
726 xfs_buf_read(target, ioff, isize, 665 xfs_buf_read(target, ioff, isize,
@@ -798,10 +737,10 @@ xfs_buf_associate_memory(
798 size_t buflen; 737 size_t buflen;
799 int page_count; 738 int page_count;
800 739
801 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; 740 pageaddr = (unsigned long)mem & PAGE_MASK;
802 offset = (unsigned long)mem - pageaddr; 741 offset = (unsigned long)mem - pageaddr;
803 buflen = PAGE_CACHE_ALIGN(len + offset); 742 buflen = PAGE_ALIGN(len + offset);
804 page_count = buflen >> PAGE_CACHE_SHIFT; 743 page_count = buflen >> PAGE_SHIFT;
805 744
806 /* Free any previous set of page pointers */ 745 /* Free any previous set of page pointers */
807 if (bp->b_pages) 746 if (bp->b_pages)
@@ -818,13 +757,12 @@ xfs_buf_associate_memory(
818 757
819 for (i = 0; i < bp->b_page_count; i++) { 758 for (i = 0; i < bp->b_page_count; i++) {
820 bp->b_pages[i] = mem_to_page((void *)pageaddr); 759 bp->b_pages[i] = mem_to_page((void *)pageaddr);
821 pageaddr += PAGE_CACHE_SIZE; 760 pageaddr += PAGE_SIZE;
822 } 761 }
823 762
824 bp->b_count_desired = len; 763 bp->b_count_desired = len;
825 bp->b_buffer_length = buflen; 764 bp->b_buffer_length = buflen;
826 bp->b_flags |= XBF_MAPPED; 765 bp->b_flags |= XBF_MAPPED;
827 bp->b_flags &= ~_XBF_PAGE_LOCKED;
828 766
829 return 0; 767 return 0;
830} 768}
@@ -931,20 +869,7 @@ xfs_buf_rele(
931 869
932 870
933/* 871/*
934 * Mutual exclusion on buffers. Locking model: 872 * Lock a buffer object, if it is not already locked.
935 *
936 * Buffers associated with inodes for which buffer locking
937 * is not enabled are not protected by semaphores, and are
938 * assumed to be exclusively owned by the caller. There is a
939 * spinlock in the buffer, used by the caller when concurrent
940 * access is possible.
941 */
942
943/*
944 * Locks a buffer object, if it is not already locked. Note that this in
945 * no way locks the underlying pages, so it is only useful for
946 * synchronizing concurrent use of buffer objects, not for synchronizing
947 * independent access to the underlying pages.
948 * 873 *
949 * If we come across a stale, pinned, locked buffer, we know that we are 874 * If we come across a stale, pinned, locked buffer, we know that we are
950 * being asked to lock a buffer that has been reallocated. Because it is 875 * being asked to lock a buffer that has been reallocated. Because it is
@@ -978,10 +903,7 @@ xfs_buf_lock_value(
978} 903}
979 904
980/* 905/*
981 * Locks a buffer object. 906 * Lock a buffer object.
982 * Note that this in no way locks the underlying pages, so it is only
983 * useful for synchronizing concurrent use of buffer objects, not for
984 * synchronizing independent access to the underlying pages.
985 * 907 *
986 * If we come across a stale, pinned, locked buffer, we know that we 908 * If we come across a stale, pinned, locked buffer, we know that we
987 * are being asked to lock a buffer that has been reallocated. Because 909 * are being asked to lock a buffer that has been reallocated. Because
@@ -998,7 +920,7 @@ xfs_buf_lock(
998 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 920 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
999 xfs_log_force(bp->b_target->bt_mount, 0); 921 xfs_log_force(bp->b_target->bt_mount, 0);
1000 if (atomic_read(&bp->b_io_remaining)) 922 if (atomic_read(&bp->b_io_remaining))
1001 blk_run_address_space(bp->b_target->bt_mapping); 923 blk_run_backing_dev(bp->b_target->bt_bdi, NULL);
1002 down(&bp->b_sema); 924 down(&bp->b_sema);
1003 XB_SET_OWNER(bp); 925 XB_SET_OWNER(bp);
1004 926
@@ -1043,7 +965,7 @@ xfs_buf_wait_unpin(
1043 if (atomic_read(&bp->b_pin_count) == 0) 965 if (atomic_read(&bp->b_pin_count) == 0)
1044 break; 966 break;
1045 if (atomic_read(&bp->b_io_remaining)) 967 if (atomic_read(&bp->b_io_remaining))
1046 blk_run_address_space(bp->b_target->bt_mapping); 968 blk_run_backing_dev(bp->b_target->bt_bdi, NULL);
1047 schedule(); 969 schedule();
1048 } 970 }
1049 remove_wait_queue(&bp->b_waiters, &wait); 971 remove_wait_queue(&bp->b_waiters, &wait);
@@ -1256,10 +1178,8 @@ _xfs_buf_ioend(
1256 xfs_buf_t *bp, 1178 xfs_buf_t *bp,
1257 int schedule) 1179 int schedule)
1258{ 1180{
1259 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1181 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1260 bp->b_flags &= ~_XBF_PAGE_LOCKED;
1261 xfs_buf_ioend(bp, schedule); 1182 xfs_buf_ioend(bp, schedule);
1262 }
1263} 1183}
1264 1184
1265STATIC void 1185STATIC void
@@ -1268,35 +1188,12 @@ xfs_buf_bio_end_io(
1268 int error) 1188 int error)
1269{ 1189{
1270 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1190 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1271 unsigned int blocksize = bp->b_target->bt_bsize;
1272 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1273 1191
1274 xfs_buf_ioerror(bp, -error); 1192 xfs_buf_ioerror(bp, -error);
1275 1193
1276 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1194 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1277 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1195 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1278 1196
1279 do {
1280 struct page *page = bvec->bv_page;
1281
1282 ASSERT(!PagePrivate(page));
1283 if (unlikely(bp->b_error)) {
1284 if (bp->b_flags & XBF_READ)
1285 ClearPageUptodate(page);
1286 } else if (blocksize >= PAGE_CACHE_SIZE) {
1287 SetPageUptodate(page);
1288 } else if (!PagePrivate(page) &&
1289 (bp->b_flags & _XBF_PAGE_CACHE)) {
1290 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1291 }
1292
1293 if (--bvec >= bio->bi_io_vec)
1294 prefetchw(&bvec->bv_page->flags);
1295
1296 if (bp->b_flags & _XBF_PAGE_LOCKED)
1297 unlock_page(page);
1298 } while (bvec >= bio->bi_io_vec);
1299
1300 _xfs_buf_ioend(bp, 1); 1197 _xfs_buf_ioend(bp, 1);
1301 bio_put(bio); 1198 bio_put(bio);
1302} 1199}
@@ -1310,7 +1207,6 @@ _xfs_buf_ioapply(
1310 int offset = bp->b_offset; 1207 int offset = bp->b_offset;
1311 int size = bp->b_count_desired; 1208 int size = bp->b_count_desired;
1312 sector_t sector = bp->b_bn; 1209 sector_t sector = bp->b_bn;
1313 unsigned int blocksize = bp->b_target->bt_bsize;
1314 1210
1315 total_nr_pages = bp->b_page_count; 1211 total_nr_pages = bp->b_page_count;
1316 map_i = 0; 1212 map_i = 0;
@@ -1331,29 +1227,6 @@ _xfs_buf_ioapply(
1331 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1227 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1332 } 1228 }
1333 1229
1334 /* Special code path for reading a sub page size buffer in --
1335 * we populate up the whole page, and hence the other metadata
1336 * in the same page. This optimization is only valid when the
1337 * filesystem block size is not smaller than the page size.
1338 */
1339 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1340 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1341 (XBF_READ|_XBF_PAGE_LOCKED)) &&
1342 (blocksize >= PAGE_CACHE_SIZE)) {
1343 bio = bio_alloc(GFP_NOIO, 1);
1344
1345 bio->bi_bdev = bp->b_target->bt_bdev;
1346 bio->bi_sector = sector - (offset >> BBSHIFT);
1347 bio->bi_end_io = xfs_buf_bio_end_io;
1348 bio->bi_private = bp;
1349
1350 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1351 size = 0;
1352
1353 atomic_inc(&bp->b_io_remaining);
1354
1355 goto submit_io;
1356 }
1357 1230
1358next_chunk: 1231next_chunk:
1359 atomic_inc(&bp->b_io_remaining); 1232 atomic_inc(&bp->b_io_remaining);
@@ -1367,8 +1240,9 @@ next_chunk:
1367 bio->bi_end_io = xfs_buf_bio_end_io; 1240 bio->bi_end_io = xfs_buf_bio_end_io;
1368 bio->bi_private = bp; 1241 bio->bi_private = bp;
1369 1242
1243
1370 for (; size && nr_pages; nr_pages--, map_i++) { 1244 for (; size && nr_pages; nr_pages--, map_i++) {
1371 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1245 int rbytes, nbytes = PAGE_SIZE - offset;
1372 1246
1373 if (nbytes > size) 1247 if (nbytes > size)
1374 nbytes = size; 1248 nbytes = size;
@@ -1383,7 +1257,6 @@ next_chunk:
1383 total_nr_pages--; 1257 total_nr_pages--;
1384 } 1258 }
1385 1259
1386submit_io:
1387 if (likely(bio->bi_size)) { 1260 if (likely(bio->bi_size)) {
1388 if (xfs_buf_is_vmapped(bp)) { 1261 if (xfs_buf_is_vmapped(bp)) {
1389 flush_kernel_vmap_range(bp->b_addr, 1262 flush_kernel_vmap_range(bp->b_addr,
@@ -1393,18 +1266,7 @@ submit_io:
1393 if (size) 1266 if (size)
1394 goto next_chunk; 1267 goto next_chunk;
1395 } else { 1268 } else {
1396 /*
1397 * if we get here, no pages were added to the bio. However,
1398 * we can't just error out here - if the pages are locked then
1399 * we have to unlock them otherwise we can hang on a later
1400 * access to the page.
1401 */
1402 xfs_buf_ioerror(bp, EIO); 1269 xfs_buf_ioerror(bp, EIO);
1403 if (bp->b_flags & _XBF_PAGE_LOCKED) {
1404 int i;
1405 for (i = 0; i < bp->b_page_count; i++)
1406 unlock_page(bp->b_pages[i]);
1407 }
1408 bio_put(bio); 1270 bio_put(bio);
1409 } 1271 }
1410} 1272}
@@ -1450,7 +1312,7 @@ xfs_buf_iowait(
1450 trace_xfs_buf_iowait(bp, _RET_IP_); 1312 trace_xfs_buf_iowait(bp, _RET_IP_);
1451 1313
1452 if (atomic_read(&bp->b_io_remaining)) 1314 if (atomic_read(&bp->b_io_remaining))
1453 blk_run_address_space(bp->b_target->bt_mapping); 1315 blk_run_backing_dev(bp->b_target->bt_bdi, NULL);
1454 wait_for_completion(&bp->b_iowait); 1316 wait_for_completion(&bp->b_iowait);
1455 1317
1456 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1318 trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1468,8 +1330,8 @@ xfs_buf_offset(
1468 return XFS_BUF_PTR(bp) + offset; 1330 return XFS_BUF_PTR(bp) + offset;
1469 1331
1470 offset += bp->b_offset; 1332 offset += bp->b_offset;
1471 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1333 page = bp->b_pages[offset >> PAGE_SHIFT];
1472 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1334 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1473} 1335}
1474 1336
1475/* 1337/*
@@ -1491,9 +1353,9 @@ xfs_buf_iomove(
1491 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1353 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1492 cpoff = xfs_buf_poff(boff + bp->b_offset); 1354 cpoff = xfs_buf_poff(boff + bp->b_offset);
1493 csize = min_t(size_t, 1355 csize = min_t(size_t,
1494 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1356 PAGE_SIZE-cpoff, bp->b_count_desired-boff);
1495 1357
1496 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1358 ASSERT(((csize + cpoff) <= PAGE_SIZE));
1497 1359
1498 switch (mode) { 1360 switch (mode) {
1499 case XBRW_ZERO: 1361 case XBRW_ZERO:
@@ -1606,7 +1468,6 @@ xfs_free_buftarg(
1606 xfs_flush_buftarg(btp, 1); 1468 xfs_flush_buftarg(btp, 1);
1607 if (mp->m_flags & XFS_MOUNT_BARRIER) 1469 if (mp->m_flags & XFS_MOUNT_BARRIER)
1608 xfs_blkdev_issue_flush(btp); 1470 xfs_blkdev_issue_flush(btp);
1609 iput(btp->bt_mapping->host);
1610 1471
1611 kthread_stop(btp->bt_task); 1472 kthread_stop(btp->bt_task);
1612 kmem_free(btp); 1473 kmem_free(btp);
@@ -1630,15 +1491,6 @@ xfs_setsize_buftarg_flags(
1630 return EINVAL; 1491 return EINVAL;
1631 } 1492 }
1632 1493
1633 if (verbose &&
1634 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1635 printk(KERN_WARNING
1636 "XFS: %u byte sectors in use on device %s. "
1637 "This is suboptimal; %u or greater is ideal.\n",
1638 sectorsize, XFS_BUFTARG_NAME(btp),
1639 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1640 }
1641
1642 return 0; 1494 return 0;
1643} 1495}
1644 1496
@@ -1653,7 +1505,7 @@ xfs_setsize_buftarg_early(
1653 struct block_device *bdev) 1505 struct block_device *bdev)
1654{ 1506{
1655 return xfs_setsize_buftarg_flags(btp, 1507 return xfs_setsize_buftarg_flags(btp,
1656 PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); 1508 PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1657} 1509}
1658 1510
1659int 1511int
@@ -1666,41 +1518,6 @@ xfs_setsize_buftarg(
1666} 1518}
1667 1519
1668STATIC int 1520STATIC int
1669xfs_mapping_buftarg(
1670 xfs_buftarg_t *btp,
1671 struct block_device *bdev)
1672{
1673 struct backing_dev_info *bdi;
1674 struct inode *inode;
1675 struct address_space *mapping;
1676 static const struct address_space_operations mapping_aops = {
1677 .sync_page = block_sync_page,
1678 .migratepage = fail_migrate_page,
1679 };
1680
1681 inode = new_inode(bdev->bd_inode->i_sb);
1682 if (!inode) {
1683 printk(KERN_WARNING
1684 "XFS: Cannot allocate mapping inode for device %s\n",
1685 XFS_BUFTARG_NAME(btp));
1686 return ENOMEM;
1687 }
1688 inode->i_ino = get_next_ino();
1689 inode->i_mode = S_IFBLK;
1690 inode->i_bdev = bdev;
1691 inode->i_rdev = bdev->bd_dev;
1692 bdi = blk_get_backing_dev_info(bdev);
1693 if (!bdi)
1694 bdi = &default_backing_dev_info;
1695 mapping = &inode->i_data;
1696 mapping->a_ops = &mapping_aops;
1697 mapping->backing_dev_info = bdi;
1698 mapping_set_gfp_mask(mapping, GFP_NOFS);
1699 btp->bt_mapping = mapping;
1700 return 0;
1701}
1702
1703STATIC int
1704xfs_alloc_delwrite_queue( 1521xfs_alloc_delwrite_queue(
1705 xfs_buftarg_t *btp, 1522 xfs_buftarg_t *btp,
1706 const char *fsname) 1523 const char *fsname)
@@ -1728,12 +1545,14 @@ xfs_alloc_buftarg(
1728 btp->bt_mount = mp; 1545 btp->bt_mount = mp;
1729 btp->bt_dev = bdev->bd_dev; 1546 btp->bt_dev = bdev->bd_dev;
1730 btp->bt_bdev = bdev; 1547 btp->bt_bdev = bdev;
1548 btp->bt_bdi = blk_get_backing_dev_info(bdev);
1549 if (!btp->bt_bdi)
1550 goto error;
1551
1731 INIT_LIST_HEAD(&btp->bt_lru); 1552 INIT_LIST_HEAD(&btp->bt_lru);
1732 spin_lock_init(&btp->bt_lru_lock); 1553 spin_lock_init(&btp->bt_lru_lock);
1733 if (xfs_setsize_buftarg_early(btp, bdev)) 1554 if (xfs_setsize_buftarg_early(btp, bdev))
1734 goto error; 1555 goto error;
1735 if (xfs_mapping_buftarg(btp, bdev))
1736 goto error;
1737 if (xfs_alloc_delwrite_queue(btp, fsname)) 1556 if (xfs_alloc_delwrite_queue(btp, fsname))
1738 goto error; 1557 goto error;
1739 btp->bt_shrinker.shrink = xfs_buftarg_shrink; 1558 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
@@ -1955,7 +1774,7 @@ xfsbufd(
1955 count++; 1774 count++;
1956 } 1775 }
1957 if (count) 1776 if (count)
1958 blk_run_address_space(target->bt_mapping); 1777 blk_run_backing_dev(target->bt_bdi, NULL);
1959 1778
1960 } while (!kthread_should_stop()); 1779 } while (!kthread_should_stop());
1961 1780
@@ -2003,7 +1822,7 @@ xfs_flush_buftarg(
2003 1822
2004 if (wait) { 1823 if (wait) {
2005 /* Expedite and wait for IO to complete. */ 1824 /* Expedite and wait for IO to complete. */
2006 blk_run_address_space(target->bt_mapping); 1825 blk_run_backing_dev(target->bt_bdi, NULL);
2007 while (!list_empty(&wait_list)) { 1826 while (!list_empty(&wait_list)) {
2008 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1827 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
2009 1828
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index cbe65950e524..a9a1c4512645 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -61,30 +61,11 @@ typedef enum {
61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ 61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
62 62
63/* flags used only internally */ 63/* flags used only internally */
64#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
65#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ 64#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
66#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ 65#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
66#define _XBF_KMEM (1 << 20)/* backed by heap memory */
67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ 67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
68 68
69/*
70 * Special flag for supporting metadata blocks smaller than a FSB.
71 *
72 * In this case we can have multiple xfs_buf_t on a single page and
73 * need to lock out concurrent xfs_buf_t readers as they only
74 * serialise access to the buffer.
75 *
76 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
77 * between reads of the page. Hence we can have one thread read the
78 * page and modify it, but then race with another thread that thinks
79 * the page is not up-to-date and hence reads it again.
80 *
81 * The result is that the first modifcation to the page is lost.
82 * This sort of AGF/AGI reading race can happen when unlinking inodes
83 * that require truncation and results in the AGI unlinked list
84 * modifications being lost.
85 */
86#define _XBF_PAGE_LOCKED (1 << 22)
87
88typedef unsigned int xfs_buf_flags_t; 69typedef unsigned int xfs_buf_flags_t;
89 70
90#define XFS_BUF_FLAGS \ 71#define XFS_BUF_FLAGS \
@@ -100,12 +81,10 @@ typedef unsigned int xfs_buf_flags_t;
100 { XBF_LOCK, "LOCK" }, /* should never be set */\ 81 { XBF_LOCK, "LOCK" }, /* should never be set */\
101 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ 82 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
102 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ 83 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
103 { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
104 { _XBF_PAGES, "PAGES" }, \ 84 { _XBF_PAGES, "PAGES" }, \
105 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 85 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
106 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 86 { _XBF_KMEM, "KMEM" }, \
107 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" } 87 { _XBF_DELWRI_Q, "DELWRI_Q" }
108
109 88
110typedef enum { 89typedef enum {
111 XBT_FORCE_SLEEP = 0, 90 XBT_FORCE_SLEEP = 0,
@@ -120,7 +99,7 @@ typedef struct xfs_bufhash {
120typedef struct xfs_buftarg { 99typedef struct xfs_buftarg {
121 dev_t bt_dev; 100 dev_t bt_dev;
122 struct block_device *bt_bdev; 101 struct block_device *bt_bdev;
123 struct address_space *bt_mapping; 102 struct backing_dev_info *bt_bdi;
124 struct xfs_mount *bt_mount; 103 struct xfs_mount *bt_mount;
125 unsigned int bt_bsize; 104 unsigned int bt_bsize;
126 unsigned int bt_sshift; 105 unsigned int bt_sshift;
@@ -139,17 +118,6 @@ typedef struct xfs_buftarg {
139 unsigned int bt_lru_nr; 118 unsigned int bt_lru_nr;
140} xfs_buftarg_t; 119} xfs_buftarg_t;
141 120
142/*
143 * xfs_buf_t: Buffer structure for pagecache-based buffers
144 *
145 * This buffer structure is used by the pagecache buffer management routines
146 * to refer to an assembly of pages forming a logical buffer.
147 *
148 * The buffer structure is used on a temporary basis only, and discarded when
149 * released. The real data storage is recorded in the pagecache. Buffers are
150 * hashed to the block device on which the file system resides.
151 */
152
153struct xfs_buf; 121struct xfs_buf;
154typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 122typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
155 123