author		Linus Torvalds <torvalds@linux-foundation.org>	2016-10-06 11:18:10 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-10-06 11:18:10 -0400
commit		8d370595811e13378243832006f8c52bbc9cca5e (patch)
tree		8cab6785c7fedd8d648b51db0ec420f610b2cd2a
parent		d230ec72c4efed7d0f414a80a756c54d4c422a6e (diff)
parent		155cd433b516506df065866f3d974661f6473572 (diff)
Merge tag 'xfs-for-linus-4.9-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs
Pull xfs and iomap updates from Dave Chinner:
 "The main things in this update are the iomap-based DAX infrastructure,
  an XFS delalloc rework, and a chunk of fixes to how log recovery
  schedules writeback to prevent spurious corruption detections when
  recovery of certain items was not required.

  The other main chunk of code is some preparation for the upcoming
  reflink functionality. Most of it is generic and cleanups that stand
  alone, but they were ready and reviewed so are in this pull request.

  Speaking of reflink, I'm currently planning to send you another pull
  request next week containing all the new reflink functionality. I'm
  working through a similar process to the last cycle, where I sent the
  reverse mapping code in a separate request because of how large it
  was. The reflink code merge is even bigger than reverse mapping, so
  I'll be doing the same thing again....

  Summary for this update:

   - change of XFS mailing list to linux-xfs@vger.kernel.org

   - iomap-based DAX infrastructure w/ XFS and ext2 support

   - small iomap fixes and additions

   - more efficient XFS delayed allocation infrastructure based on iomap

   - a rework of log recovery writeback scheduling to ensure we don't
     fail recovery when trying to replay items that are already on disk

   - some preparation patches for upcoming reflink support

   - configurable error handling fixes and documentation

   - aio access time update race fixes for XFS and
     generic_file_read_iter"

* tag 'xfs-for-linus-4.9-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (40 commits)
  fs: update atime before I/O in generic_file_read_iter
  xfs: update atime before I/O in xfs_file_dio_aio_read
  ext2: fix possible integer truncation in ext2_iomap_begin
  xfs: log recovery tracepoints to track current lsn and buffer submission
  xfs: update metadata LSN in buffers during log recovery
  xfs: don't warn on buffers not being recovered due to LSN
  xfs: pass current lsn to log recovery buffer validation
  xfs: rework log recovery to submit buffers on LSN boundaries
  xfs: quiesce the filesystem after recovery on readonly mount
  xfs: remote attribute blocks aren't really userdata
  ext2: use iomap to implement DAX
  ext2: stop passing buffer_head to ext2_get_blocks
  xfs: use iomap to implement DAX
  xfs: refactor xfs_setfilesize
  xfs: take the ilock shared if possible in xfs_file_iomap_begin
  xfs: fix locking for DAX writes
  dax: provide an iomap based fault handler
  dax: provide an iomap based dax read/write path
  dax: don't pass buffer_head to copy_user_dax
  dax: don't pass buffer_head to dax_insert_mapping
  ...
-rw-r--r--	Documentation/filesystems/xfs.txt	123
-rw-r--r--	MAINTAINERS	7
-rw-r--r--	fs/dax.c	252
-rw-r--r--	fs/ext2/Kconfig	1
-rw-r--r--	fs/ext2/ext2.h	1
-rw-r--r--	fs/ext2/file.c	76
-rw-r--r--	fs/ext2/inode.c	100
-rw-r--r--	fs/internal.h	11
-rw-r--r--	fs/iomap.c	89
-rw-r--r--	fs/xfs/Makefile	1
-rw-r--r--	fs/xfs/libxfs/xfs_ag_resv.c	325
-rw-r--r--	fs/xfs/libxfs/xfs_ag_resv.h	35
-rw-r--r--	fs/xfs/libxfs/xfs_alloc.c	135
-rw-r--r--	fs/xfs/libxfs/xfs_alloc.h	25
-rw-r--r--	fs/xfs/libxfs/xfs_bmap.c	136
-rw-r--r--	fs/xfs/libxfs/xfs_bmap.h	12
-rw-r--r--	fs/xfs/libxfs/xfs_btree.c	59
-rw-r--r--	fs/xfs/libxfs/xfs_btree.h	28
-rw-r--r--	fs/xfs/libxfs/xfs_defer.c	79
-rw-r--r--	fs/xfs/libxfs/xfs_ialloc_btree.c	2
-rw-r--r--	fs/xfs/libxfs/xfs_log_format.h	10
-rw-r--r--	fs/xfs/xfs_aops.c	31
-rw-r--r--	fs/xfs/xfs_aops.h	1
-rw-r--r--	fs/xfs/xfs_bmap_util.c	2
-rw-r--r--	fs/xfs/xfs_buf_item.c	9
-rw-r--r--	fs/xfs/xfs_extent_busy.c	2
-rw-r--r--	fs/xfs/xfs_file.c	82
-rw-r--r--	fs/xfs/xfs_filestream.c	13
-rw-r--r--	fs/xfs/xfs_fsops.c	2
-rw-r--r--	fs/xfs/xfs_icache.c	14
-rw-r--r--	fs/xfs/xfs_inode.h	1
-rw-r--r--	fs/xfs/xfs_iomap.c	494
-rw-r--r--	fs/xfs/xfs_iomap.h	2
-rw-r--r--	fs/xfs/xfs_log_priv.h	3
-rw-r--r--	fs/xfs/xfs_log_recover.c	191
-rw-r--r--	fs/xfs/xfs_mount.c	14
-rw-r--r--	fs/xfs/xfs_mount.h	44
-rw-r--r--	fs/xfs/xfs_rmap_item.c	36
-rw-r--r--	fs/xfs/xfs_rmap_item.h	8
-rw-r--r--	fs/xfs/xfs_super.c	7
-rw-r--r--	fs/xfs/xfs_super.h	1
-rw-r--r--	fs/xfs/xfs_sysfs.c	47
-rw-r--r--	fs/xfs/xfs_trace.h	114
-rw-r--r--	fs/xfs/xfs_trans.c	3
-rw-r--r--	fs/xfs/xfs_trans_extfree.c	3
-rw-r--r--	fs/xfs/xfs_xattr.c	1
-rw-r--r--	include/linux/dax.h	6
-rw-r--r--	include/linux/iomap.h	4
-rw-r--r--	mm/filemap.c	14
49 files changed, 1942 insertions, 714 deletions
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 8146e9fd5ffc..c2d44e6e117b 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -348,3 +348,126 @@ Removed Sysctls
 ----				-------
 fs.xfs.xfsbufd_centisec	v4.0
 fs.xfs.age_buffer_centisecs	v4.0
+
+
+Error handling
+==============
+
+XFS can act differently according to the type of error found during its
+operation. The implementation introduces the following concepts to the error
+handler:
+
+ -failure speed:
+	Defines how fast XFS should propagate an error upwards when a specific
+	error is found during the filesystem operation. It can propagate
+	immediately, after a defined number of retries, after a set time period,
+	or simply retry forever.
+
+ -error classes:
+	Specifies the subsystem the error configuration will apply to, such as
+	metadata IO or memory allocation. Different subsystems will have
+	different error handlers for which behaviour can be configured.
+
+ -error handlers:
+	Defines the behaviour for a specific error.
+
+The filesystem behaviour during an error can be set via sysfs files. Each
+error handler works independently - the first condition met by an error handler
+for a specific class will cause the error to be propagated rather than reset and
+retried.
+
+The action taken by the filesystem when the error is propagated is context
+dependent - it may cause a shut down in the case of an unrecoverable error,
+it may be reported back to userspace, or it may even be ignored because
+there's nothing useful we can do with the error or anyone we can report it to (e.g.
+during unmount).
+
+The configuration files are organized into the following hierarchy for each
+mounted filesystem:
+
+  /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+Where:
+  <dev>
+	The short device name of the mounted filesystem. This is the same device
+	name that shows up in XFS kernel error messages as "XFS(<dev>): ..."
+
+  <class>
+	The subsystem the error configuration belongs to. As of 4.9, the defined
+	classes are:
+
+		- "metadata": applies to metadata buffer write IO
+
+  <error>
+	The individual error handler configurations.
+
+
+Each filesystem has "global" error configuration options defined in their top
+level directory:
+
+  /sys/fs/xfs/<dev>/error/
+
+  fail_at_unmount		(Min:  0  Default:  1  Max: 1)
+	Defines the filesystem error behaviour at unmount time.
+
+	If set to a value of 1, XFS will override all other error configurations
+	during unmount and replace them with "immediate fail" characteristics,
+	i.e. no retries, no retry timeout. This will always allow unmount to
+	succeed when there are persistent errors present.
+
+	If set to 0, the configured retry behaviour will continue until all
+	retries and/or timeouts have been exhausted. This will delay unmount
+	completion when there are persistent errors, and it may prevent the
+	filesystem from ever unmounting fully in the case of "retry forever"
+	handler configurations.
+
+	Note: there is no guarantee that fail_at_unmount can be set whilst an
+	unmount is in progress. It is possible that the sysfs entries are
+	removed by the unmounting filesystem before a "retry forever" error
+	handler configuration causes unmount to hang, and hence the filesystem
+	must be configured appropriately before unmount begins to prevent
+	unmount hangs.
+
+Each filesystem has specific error class handlers that define the error
+propagation behaviour for specific errors. There is also a "default" error
+handler defined, which defines the behaviour for all errors that don't have
+specific handlers defined. Where multiple retry constraints are configured for
+a single error, the first retry configuration that expires will cause the error
+to be propagated. The handler configurations are found in the directory:
+
+  /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+  max_retries			(Min: -1  Default: Varies  Max: INTMAX)
+	Defines the allowed number of retries of a specific error before
+	the filesystem will propagate the error. The retry count for a given
+	error context (e.g. a specific metadata buffer) is reset every time
+	there is a successful completion of the operation.
+
+	Setting the value to "-1" will cause XFS to retry forever for this
+	specific error.
+
+	Setting the value to "0" will cause XFS to fail immediately when the
+	specific error is reported.
+
+	Setting the value to "N" (where 0 < N < Max) will make XFS retry the
+	operation "N" times before propagating the error.
+
+  retry_timeout_seconds	(Min: -1  Default: Varies  Max: 1 day)
+	Defines the amount of time (in seconds) that the filesystem is
+	allowed to retry its operations when the specific error is
+	found.
+
+	Setting the value to "-1" will allow XFS to retry forever for this
+	specific error.
+
+	Setting the value to "0" will cause XFS to fail immediately when the
+	specific error is reported.
+
+	Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
+	operation for up to "N" seconds before propagating the error.
+
+Note: The default behaviour for a specific error handler is dependent on both
+the class and error context. For example, the default values for
+"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
+to "fail immediately" behaviour. This is done because ENODEV is a fatal,
+unrecoverable error no matter how many times the metadata IO is retried.
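
As a worked illustration of the interface documented above (not part of the
patch itself): the sketch below switches the metadata EIO handler of a
filesystem mounted from sda1 to "fail immediately" behaviour. The device name
and the presence of an "EIO" handler directory are assumptions for the example.

	/* Hypothetical sketch: write "0" (fail immediately) into the
	 * max_retries handler file described above. */
	#include <stdio.h>

	int main(void)
	{
		const char *path =
			"/sys/fs/xfs/sda1/error/metadata/EIO/max_retries";
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			return 1;
		}
		fprintf(f, "0\n");	/* 0 = fail immediately, -1 = retry forever */
		return fclose(f) ? 1 : 0;
	}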
diff --git a/MAINTAINERS b/MAINTAINERS
index 841ffa3833ff..ef4f7c419376 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13099,11 +13099,10 @@ F: arch/x86/xen/*swiotlb*
 F:	drivers/xen/*swiotlb*
 
 XFS FILESYSTEM
-P:	Silicon Graphics Inc
 M:	Dave Chinner <david@fromorbit.com>
-M:	xfs@oss.sgi.com
-L:	xfs@oss.sgi.com
-W:	http://oss.sgi.com/projects/xfs
+M:	linux-xfs@vger.kernel.org
+L:	linux-xfs@vger.kernel.org
+W:	http://xfs.org/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git
 S:	Supported
 F:	Documentation/filesystems/xfs.txt
diff --git a/fs/dax.c b/fs/dax.c
index 993dc6fe0416..cc025f82ef07 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,8 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
 
 /*
  * We use lowest available bit in exceptional entry for locking, other two
@@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	return VM_FAULT_LOCKED;
 }
 
-static int copy_user_bh(struct page *to, struct inode *inode,
-		struct buffer_head *bh, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
+		struct page *to, unsigned long vaddr)
 {
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
-	struct block_device *bdev = bh->b_bdev;
 	void *vto;
 
 	if (dax_map_atomic(bdev, &dax) < 0)
@@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-		struct buffer_head *bh, void **entryp,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+		struct block_device *bdev, sector_t sector, size_t size,
+		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct block_device *bdev = bh->b_bdev;
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, mapping->host),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
 	void *ret;
 	void *entry = *entryp;
@@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (vmf->cow_page) {
 		struct page *new_page = vmf->cow_page;
 		if (buffer_written(&bh))
-			error = copy_user_bh(new_page, inode, &bh, vaddr);
+			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+					bh.b_size, new_page, vaddr);
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
@@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	/* Filesystem should not return unwritten buffers to us! */
 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+			bh.b_size, &entry, vma, vmf);
 unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
@@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 	return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iov_iter *iter = data;
+	loff_t end = pos + length, done = 0;
+	ssize_t ret = 0;
+
+	if (iov_iter_rw(iter) == READ) {
+		end = min(end, i_size_read(inode));
+		if (pos >= end)
+			return 0;
+
+		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+			return iov_iter_zero(min(length, end - pos), iter);
+	}
+
+	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+		return -EIO;
+
+	while (pos < end) {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+		struct blk_dax_ctl dax = { 0 };
+		ssize_t map_len;
+
+		dax.sector = iomap->blkno +
+			(((pos & PAGE_MASK) - iomap->offset) >> 9);
+		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+		map_len = dax_map_atomic(iomap->bdev, &dax);
+		if (map_len < 0) {
+			ret = map_len;
+			break;
+		}
+
+		dax.addr += offset;
+		map_len -= offset;
+		if (map_len > end - pos)
+			map_len = end - pos;
+
+		if (iov_iter_rw(iter) == WRITE)
+			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+		else
+			map_len = copy_to_iter(dax.addr, map_len, iter);
+		dax_unmap_atomic(iomap->bdev, &dax);
+		if (map_len <= 0) {
+			ret = map_len ? map_len : -EFAULT;
+			break;
+		}
+
+		pos += map_len;
+		length -= map_len;
+		done += map_len;
+	}
+
+	return done ? done : ret;
+}
+
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb:	The control block for this I/O
+ * @iter:	The addresses to do I/O from or to
+ * @ops:	iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory.  The caller needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+	unsigned flags = 0;
+
+	if (iov_iter_rw(iter) == WRITE)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
+	 *
+	 * XXX: This is racy against mmap, and there's nothing we can do about
+	 * it. We'll eventually need to shift this down even further so that
+	 * we can check if we allocated blocks over a hole first.
+	 */
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_SHIFT,
+				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+	}
+
+	while (iov_iter_count(iter)) {
+		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
+				iter, iomap_dax_actor);
+		if (ret <= 0)
+			break;
+		pos += ret;
+		done += ret;
+	}
+
+	iocb->ki_pos += done;
+	return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
+
+/**
+ * iomap_dax_fault - handle a page fault on a DAX file
+ * @vma:	The virtual memory area where the fault occurred
+ * @vmf:	The description of the fault
+ * @ops:	iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in their fault
+ * or mkwrite handler for DAX files. Assumes the caller has done all the
+ * necessary locking for the page fault to proceed successfully.
+ */
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+	sector_t sector;
+	struct iomap iomap = { 0 };
+	unsigned flags = 0;
+	int error, major = 0;
+	void *entry;
+
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is supposed
+	 * to hold locks serializing us with truncate / punch hole so this is
+	 * a reliable test.
+	 */
+	if (pos >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	if (IS_ERR(entry)) {
+		error = PTR_ERR(entry);
+		goto out;
+	}
+
+	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Note that we don't bother to use iomap_apply here: DAX requires
+	 * the file system block size to be equal to the page size, which means
+	 * that we never have to deal with more than a single extent here.
+	 */
+	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+	if (error)
+		goto unlock_entry;
+	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+		error = -EIO;		/* fs corruption? */
+		goto unlock_entry;
+	}
+
+	sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+
+	if (vmf->cow_page) {
+		switch (iomap.type) {
+		case IOMAP_HOLE:
+		case IOMAP_UNWRITTEN:
+			clear_user_highpage(vmf->cow_page, vaddr);
+			break;
+		case IOMAP_MAPPED:
+			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
+					vmf->cow_page, vaddr);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			error = -EIO;
+			break;
+		}
+
+		if (error)
+			goto unlock_entry;
+		if (!radix_tree_exceptional_entry(entry)) {
+			vmf->page = entry;
+			return VM_FAULT_LOCKED;
+		}
+		vmf->entry = entry;
+		return VM_FAULT_DAX_LOCKED;
+	}
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		if (iomap.flags & IOMAP_F_NEW) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+		}
+		error = dax_insert_mapping(mapping, iomap.bdev, sector,
+				PAGE_SIZE, &entry, vma, vmf);
+		break;
+	case IOMAP_UNWRITTEN:
+	case IOMAP_HOLE:
+		if (!(vmf->flags & FAULT_FLAG_WRITE))
+			return dax_load_hole(mapping, entry, vmf);
+		/*FALLTHRU*/
+	default:
+		WARN_ON_ONCE(1);
+		error = -EIO;
+		break;
+	}
+
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM | major;
+	/* -EBUSY is fine, somebody else faulted on the same PTE */
+	if (error < 0 && error != -EBUSY)
+		return VM_FAULT_SIGBUS | major;
+	return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_fault);
+#endif /* CONFIG_FS_IOMAP */
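
For reference, the pos-to-sector arithmetic used by iomap_dax_actor() and
iomap_dax_fault() above can be exercised in isolation. This is a standalone
sketch with fixed-width types; the mapping values in main() are illustrative
assumptions, not taken from the patch.

	#include <stdint.h>
	#include <stdio.h>

	#define EX_PAGE_SIZE	4096ULL
	#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))

	/* page-align pos, take the byte distance into the mapping, then
	 * convert bytes to 512-byte sectors, as the code above does */
	static uint64_t dax_sector(uint64_t blkno, uint64_t offset, uint64_t pos)
	{
		return blkno + (((pos & EX_PAGE_MASK) - offset) >> 9);
	}

	int main(void)
	{
		/* mapping starts at file offset 1 MiB, device sector 2048 */
		uint64_t sec = dax_sector(2048, 1 << 20, (1 << 20) + 5000);

		/* 5000 bytes in = second page = 8 sectors further on */
		printf("sector %llu\n", (unsigned long long)sec); /* 2056 */
		return 0;
	}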
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index c634874e12d9..36bea5adcaba 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,5 +1,6 @@
 config EXT2_FS
 	tristate "Second extended fs support"
+	select FS_IOMAP if FS_DAX
 	help
 	  Ext2 is a standard Linux file system for hard disks.
 
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 06af2f92226c..37e2be784ac7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations;
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
+extern struct iomap_ops ext2_iomap_ops;
 
 /* namei.c */
 extern const struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5efeefe17abb..423cc01c9d41 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -22,11 +22,59 @@
 #include <linux/pagemap.h>
 #include <linux/dax.h>
 #include <linux/quotaops.h>
+#include <linux/iomap.h>
+#include <linux/uio.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
 
 #ifdef CONFIG_FS_DAX
+static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	ssize_t ret;
+
+	if (!iov_iter_count(to))
+		return 0; /* skip atime */
+
+	inode_lock_shared(inode);
+	ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+	inode_unlock_shared(inode);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+
+	inode_lock(inode);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out_unlock;
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out_unlock;
+	ret = file_update_time(file);
+	if (ret)
+		goto out_unlock;
+
+	ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
+		i_size_write(inode, iocb->ki_pos);
+		mark_inode_dirty(inode);
+	}
+
+out_unlock:
+	inode_unlock(inode);
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
+}
+
 /*
  * The lock ordering for ext2 DAX fault paths is:
  *
@@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_fault(vma, vmf, ext2_get_block);
+	ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
@@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	return ret;
 }
 
-/*
- * We have mostly NULL's here: the current defaults are ok for
- * the ext2 filesystem.
- */
+static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return ext2_dax_read_iter(iocb, to);
+#endif
+	return generic_file_read_iter(iocb, to);
+}
+
+static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(iocb->ki_filp->f_mapping->host))
+		return ext2_dax_write_iter(iocb, from);
+#endif
+	return generic_file_write_iter(iocb, from);
+}
+
 const struct file_operations ext2_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read_iter	= generic_file_read_iter,
-	.write_iter	= generic_file_write_iter,
+	.read_iter	= ext2_file_read_iter,
+	.write_iter	= ext2_file_write_iter,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 303ae2bb269a..1e72d425fd3b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
+#include <linux/iomap.h>
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include "ext2.h"
@@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode,
  */
 static int ext2_get_blocks(struct inode *inode,
 			   sector_t iblock, unsigned long maxblocks,
-			   struct buffer_head *bh_result,
+			   u32 *bno, bool *new, bool *boundary,
 			   int create)
 {
 	int err = -EIO;
@@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode,
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
-		clear_buffer_new(bh_result); /* What's this do? */
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
@@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			if (err)
 				goto cleanup;
-			clear_buffer_new(bh_result);
 			goto got_it;
 		}
 	}
@@ -755,15 +754,16 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
 		}
-	} else
-		set_buffer_new(bh_result);
+	} else {
+		*new = true;
+	}
 
 	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
 	mutex_unlock(&ei->truncate_mutex);
 got_it:
-	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	*bno = le32_to_cpu(chain[depth-1].key);
 	if (count > blocks_to_boundary)
-		set_buffer_boundary(bh_result);
+		*boundary = true;
 	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1; /* the whole chain */
@@ -775,19 +775,82 @@ cleanup:
 	return err;
 }
 
-int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+int ext2_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_result, int create)
 {
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	int ret = ext2_get_blocks(inode, iblock, max_blocks,
-			      bh_result, create);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
-	}
-	return ret;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary,
+			create);
+	if (ret <= 0)
+		return ret;
+
+	map_bh(bh_result, inode->i_sb, bno);
+	bh_result->b_size = (ret << inode->i_blkbits);
+	if (new)
+		set_buffer_new(bh_result);
+	if (boundary)
+		set_buffer_boundary(bh_result);
+	return 0;
 
 }
 
+#ifdef CONFIG_FS_DAX
+static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned flags, struct iomap *iomap)
+{
+	unsigned int blkbits = inode->i_blkbits;
+	unsigned long first_block = offset >> blkbits;
+	unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = ext2_get_blocks(inode, first_block, max_blocks,
+			&bno, &new, &boundary, flags & IOMAP_WRITE);
+	if (ret < 0)
+		return ret;
+
+	iomap->flags = 0;
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = (u64)first_block << blkbits;
+
+	if (ret == 0) {
+		iomap->type = IOMAP_HOLE;
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->length = 1 << blkbits;
+	} else {
+		iomap->type = IOMAP_MAPPED;
+		iomap->blkno = (sector_t)bno << (blkbits - 9);
+		iomap->length = (u64)ret << blkbits;
+		iomap->flags |= IOMAP_F_MERGED;
+	}
+
+	if (new)
+		iomap->flags |= IOMAP_F_NEW;
+	return 0;
+}
+
+static int
+ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+		ssize_t written, unsigned flags, struct iomap *iomap)
+{
+	if (iomap->type == IOMAP_MAPPED &&
+	    written < length &&
+	    (flags & IOMAP_WRITE))
+		ext2_write_failed(inode->i_mapping, offset + length);
+	return 0;
+}
+
+struct iomap_ops ext2_iomap_ops = {
+	.iomap_begin		= ext2_iomap_begin,
+	.iomap_end		= ext2_iomap_end,
+};
+#endif /* CONFIG_FS_DAX */
+
 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		u64 start, u64 len)
 {
@@ -873,11 +936,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-	if (IS_DAX(inode))
-		ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
-				DIO_LOCKING);
-	else
-		ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
+	if (WARN_ON_ONCE(IS_DAX(inode)))
+		return -EIO;
+
+	ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);
 	return ret;
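
A small standalone sketch of the iomap field arithmetic ext2_iomap_begin()
performs for a mapped extent; the block size (4096 bytes, so blkbits = 12)
and the block numbers below are illustrative assumptions.

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned blkbits = 12;		/* 4096-byte filesystem blocks */
		uint64_t first_block = 100;	/* requested offset >> blkbits */
		uint32_t bno = 5000;		/* first mapped block on disk */
		int ret = 3;			/* ext2_get_blocks mapped 3 blocks */

		uint64_t offset = first_block << blkbits;	 /* 409600 bytes */
		uint64_t blkno = (uint64_t)bno << (blkbits - 9); /* 40000 sectors */
		uint64_t length = (uint64_t)ret << blkbits;	 /* 12288 bytes */

		printf("offset=%llu blkno=%llu length=%llu\n",
		       (unsigned long long)offset,
		       (unsigned long long)blkno,
		       (unsigned long long)length);
		return 0;
	}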
diff --git a/fs/internal.h b/fs/internal.h
index ba0737649d4a..859178692ce4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
 struct super_block;
 struct file_system_type;
 struct iomap;
+struct iomap_ops;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -164,3 +165,13 @@ extern struct dentry_operations ns_dentry_operations;
 extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
 extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/*
+ * iomap support:
+ */
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+		void *data, struct iomap *iomap);
+
+loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+		unsigned flags, struct iomap_ops *ops, void *data,
+		iomap_actor_t actor);
diff --git a/fs/iomap.c b/fs/iomap.c
index 706270f21b35..013d1d36fbbf 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -27,9 +27,6 @@
 #include <linux/dax.h>
 #include "internal.h"
 
-typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
-		void *data, struct iomap *iomap);
-
 /*
  * Execute a iomap write on a segment of the mapping that spans a
  * contiguous range of pages that have identical block mapping state.
@@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
  * resources they require in the iomap_begin call, and release them in the
  * iomap_end call.
  */
-static loff_t
+loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 		struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
@@ -252,6 +249,88 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+static struct page *
+__iomap_read_page(struct inode *inode, loff_t offset)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+
+	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
+	if (IS_ERR(page))
+		return page;
+	if (!PageUptodate(page)) {
+		put_page(page);
+		return ERR_PTR(-EIO);
+	}
+	return page;
+}
+
+static loff_t
+iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	long status = 0;
+	ssize_t written = 0;
+
+	do {
+		struct page *page, *rpage;
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+
+		offset = (pos & (PAGE_SIZE - 1));
+		bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
+
+		rpage = __iomap_read_page(inode, pos);
+		if (IS_ERR(rpage))
+			return PTR_ERR(rpage);
+
+		status = iomap_write_begin(inode, pos, bytes,
+				AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
+				&page, iomap);
+		put_page(rpage);
+		if (unlikely(status))
+			return status;
+
+		WARN_ON_ONCE(!PageUptodate(page));
+
+		status = iomap_write_end(inode, pos, bytes, bytes, page);
+		if (unlikely(status <= 0)) {
+			if (WARN_ON_ONCE(status == 0))
+				return -EIO;
+			return status;
+		}
+
+		cond_resched();
+
+		pos += status;
+		written += status;
+		length -= status;
+
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+	} while (length);
+
+	return written;
+}
+
+int
+iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+		struct iomap_ops *ops)
+{
+	loff_t ret;
+
+	while (len) {
+		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
+				iomap_dirty_actor);
+		if (ret <= 0)
+			return ret;
+		pos += ret;
+		len -= ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_file_dirty);
+
 static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
 		unsigned bytes, struct iomap *iomap)
 {
@@ -430,6 +509,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
 
 	if (iomap->flags & IOMAP_F_MERGED)
 		flags |= FIEMAP_EXTENT_MERGED;
+	if (iomap->flags & IOMAP_F_SHARED)
+		flags |= FIEMAP_EXTENT_SHARED;
 
 	return fiemap_fill_next_extent(fi, iomap->offset,
 			iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9 : 0,
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index fc593c869493..584e87e11cb6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -52,6 +52,7 @@ xfs-y += $(addprefix libxfs/, \
 				   xfs_inode_fork.o \
 				   xfs_inode_buf.o \
 				   xfs_log_rlimit.o \
+				   xfs_ag_resv.o \
 				   xfs_rmap.o \
 				   xfs_rmap_btree.o \
 				   xfs_sb.o \
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
new file mode 100644
index 000000000000..e3ae0f2b4294
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_btree.h"
+
+/*
+ * Per-AG Block Reservations
+ *
+ * For some kinds of allocation group metadata structures, it is advantageous
+ * to reserve a small number of blocks in each AG so that future expansions of
+ * that data structure do not encounter ENOSPC, because errors during a btree
+ * split cause the filesystem to go offline.
+ *
+ * Prior to the introduction of reflink, this wasn't an issue because the free
+ * space btrees maintain a reserve of space (the AGFL) to handle any expansion
+ * that may be necessary; and allocations of other metadata (inodes, BMBT,
+ * dir/attr) aren't restricted to a single AG. However, with reflink it is
+ * possible to allocate all the space in an AG, have subsequent reflink/CoW
+ * activity expand the refcount btree, and discover that there's no space left
+ * to handle that expansion. Since we can calculate the maximum size of the
+ * refcount btree, we can reserve space for it and avoid ENOSPC.
+ *
+ * Handling per-AG reservations consists of four changes to the allocator's
+ * behavior:  First, because these reservations are always needed, we decrease
+ * the ag_max_usable counter to reflect the size of the AG after the reserved
+ * blocks are taken. Second, the reservations must be reflected in the
+ * fdblocks count to maintain proper accounting. Third, each AG must maintain
+ * its own reserved block counter so that we can calculate the amount of space
+ * that must remain free to maintain the reservations. Fourth, the "remaining
+ * reserved blocks" count must be used when calculating the length of the
+ * longest free extent in an AG and to clamp maxlen in the per-AG allocation
+ * functions. In other words, we maintain a virtual allocation via in-core
+ * accounting tricks so that we don't have to clean up after a crash. :)
+ *
+ * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
+ * values via struct xfs_alloc_arg or directly to the xfs_free_extent
+ * function. It might seem a little funny to maintain a reservoir of blocks
+ * to feed another reservoir, but the AGFL only holds enough blocks to get
+ * through the next transaction. The per-AG reservation is to ensure (we
+ * hope) that each AG never runs out of blocks. Each data structure wanting
+ * to use the reservation system should update ask/used in xfs_ag_resv_init.
+ */
+
+/*
+ * Are we critically low on blocks?  For now we'll define that as the number
+ * of blocks we can get our hands on being less than 10% of what we reserved
+ * or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_ag_resv_critical(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type)
+{
+	xfs_extlen_t			avail;
+	xfs_extlen_t			orig;
+
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+		avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved;
+		orig = pag->pag_meta_resv.ar_asked;
+		break;
+	case XFS_AG_RESV_AGFL:
+		avail = pag->pagf_freeblks + pag->pagf_flcount -
+			pag->pag_meta_resv.ar_reserved;
+		orig = pag->pag_agfl_resv.ar_asked;
+		break;
+	default:
+		ASSERT(0);
+		return false;
+	}
+
+	trace_xfs_ag_resv_critical(pag, type, avail);
+
+	/* Critically low if less than 10% or max btree height remains. */
+	return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS;
+}
+
+/*
+ * How many blocks are reserved but not used, and therefore must not be
+ * allocated away?
+ */
+xfs_extlen_t
+xfs_ag_resv_needed(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type)
+{
+	xfs_extlen_t			len;
+
+	len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved;
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+	case XFS_AG_RESV_AGFL:
+		len -= xfs_perag_resv(pag, type)->ar_reserved;
+		break;
+	case XFS_AG_RESV_NONE:
+		/* empty */
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	trace_xfs_ag_resv_needed(pag, type, len);
+
+	return len;
+}
+
+/* Clean out a reservation */
+static int
+__xfs_ag_resv_free(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type)
+{
+	struct xfs_ag_resv		*resv;
+	xfs_extlen_t			oldresv;
+	int				error;
+
+	trace_xfs_ag_resv_free(pag, type, 0);
+
+	resv = xfs_perag_resv(pag, type);
+	pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+	/*
+	 * AGFL blocks are always considered "free", so whatever
+	 * was reserved at mount time must be given back at umount.
+	 */
+	if (type == XFS_AG_RESV_AGFL)
+		oldresv = resv->ar_orig_reserved;
+	else
+		oldresv = resv->ar_reserved;
+	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
+	resv->ar_reserved = 0;
+	resv->ar_asked = 0;
+
+	if (error)
+		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
+				error, _RET_IP_);
+	return error;
+}
+
+/* Free a per-AG reservation. */
+int
+xfs_ag_resv_free(
+	struct xfs_perag		*pag)
+{
+	int				error;
+	int				err2;
+
+	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL);
+	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
+	if (err2 && !error)
+		error = err2;
+	return error;
+}
+
+static int
+__xfs_ag_resv_init(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type,
+	xfs_extlen_t			ask,
+	xfs_extlen_t			used)
+{
+	struct xfs_mount		*mp = pag->pag_mount;
+	struct xfs_ag_resv		*resv;
+	int				error;
+
+	resv = xfs_perag_resv(pag, type);
+	if (used > ask)
+		ask = used;
+	resv->ar_asked = ask;
+	resv->ar_reserved = resv->ar_orig_reserved = ask - used;
+	mp->m_ag_max_usable -= ask;
+
+	trace_xfs_ag_resv_init(pag, type, ask);
+
+	error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
+	if (error)
+		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
+				error, _RET_IP_);
+
+	return error;
+}
+
+/* Create a per-AG block reservation. */
+int
+xfs_ag_resv_init(
+	struct xfs_perag		*pag)
+{
+	xfs_extlen_t			ask;
+	xfs_extlen_t			used;
+	int				error = 0;
+
+	/* Create the metadata reservation. */
+	if (pag->pag_meta_resv.ar_asked == 0) {
+		ask = used = 0;
+
+		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+				ask, used);
+		if (error)
+			goto out;
+	}
+
+	/* Create the AGFL metadata reservation */
+	if (pag->pag_agfl_resv.ar_asked == 0) {
+		ask = used = 0;
+
+		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
+		if (error)
+			goto out;
+	}
+
+out:
+	return error;
+}
+
+/* Allocate a block from the reservation. */
+void
+xfs_ag_resv_alloc_extent(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type,
+	struct xfs_alloc_arg		*args)
+{
+	struct xfs_ag_resv		*resv;
+	xfs_extlen_t			len;
+	uint				field;
+
+	trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
+
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+	case XFS_AG_RESV_AGFL:
+		resv = xfs_perag_resv(pag, type);
+		break;
+	default:
+		ASSERT(0);
+		/* fall through */
+	case XFS_AG_RESV_NONE:
+		field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+				       XFS_TRANS_SB_FDBLOCKS;
+		xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
+		return;
+	}
+
+	len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
+	resv->ar_reserved -= len;
+	if (type == XFS_AG_RESV_AGFL)
+		return;
+	/* Allocations of reserved blocks only need on-disk sb updates... */
+	xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
+	/* ...but non-reserved blocks need in-core and on-disk updates. */
+	if (args->len > len)
+		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
+				-((int64_t)args->len - len));
+}
+
+/* Free a block to the reservation. */
+void
+xfs_ag_resv_free_extent(
+	struct xfs_perag		*pag,
+	enum xfs_ag_resv_type		type,
+	struct xfs_trans		*tp,
+	xfs_extlen_t			len)
+{
+	xfs_extlen_t			leftover;
+	struct xfs_ag_resv		*resv;
+
+	trace_xfs_ag_resv_free_extent(pag, type, len);
+
+	switch (type) {
+	case XFS_AG_RESV_METADATA:
+	case XFS_AG_RESV_AGFL:
+		resv = xfs_perag_resv(pag, type);
+		break;
+	default:
+		ASSERT(0);
+		/* fall through */
+	case XFS_AG_RESV_NONE:
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+		return;
+	}
+
+	leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
+	resv->ar_reserved += leftover;
+	if (type == XFS_AG_RESV_AGFL)
+		return;
+	/* Freeing into the reserved pool only requires on-disk update... */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
+	/* ...but freeing beyond that requires in-core and on-disk update. */
+	if (len > leftover)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
+}
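
The split accounting implemented above (blocks taken from the reservation need
only the on-disk superblock update, while any unreserved remainder also hits
the in-core free-block count) can be modelled in isolation. This sketch uses
plain integers and simplified stand-ins for the xfs_ag_resv fields; the
numbers are illustrative assumptions.

	#include <stdint.h>
	#include <stdio.h>

	struct ag_resv {
		uint32_t ar_asked;	/* blocks requested at init */
		uint32_t ar_reserved;	/* blocks still held back */
	};

	static void resv_alloc(struct ag_resv *resv, uint32_t alloc_len,
			       int64_t *ondisk_fdblocks, int64_t *incore_fdblocks)
	{
		/* take as much as possible from the reservation */
		uint32_t from_resv = alloc_len < resv->ar_reserved ?
				     alloc_len : resv->ar_reserved;

		resv->ar_reserved -= from_resv;
		/* reserved blocks: on-disk accounting only */
		*ondisk_fdblocks -= from_resv;
		/* unreserved remainder: on-disk and in-core */
		*ondisk_fdblocks -= alloc_len - from_resv;
		*incore_fdblocks -= alloc_len - from_resv;
	}

	int main(void)
	{
		struct ag_resv resv = { .ar_asked = 16, .ar_reserved = 4 };
		int64_t ondisk = 1000, incore = 996; /* in-core excludes the reserve */

		resv_alloc(&resv, 10, &ondisk, &incore);
		/* 4 blocks come from the reserve, 6 from general free space */
		printf("reserved=%u ondisk=%lld incore=%lld\n", resv.ar_reserved,
		       (long long)ondisk, (long long)incore);
		/* prints: reserved=0 ondisk=990 incore=990 */
		return 0;
	}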
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
new file mode 100644
index 000000000000..8d6c687deef3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_AG_RESV_H__
+#define __XFS_AG_RESV_H__
+
+int xfs_ag_resv_free(struct xfs_perag *pag);
+int xfs_ag_resv_init(struct xfs_perag *pag);
+
+bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag,
+		enum xfs_ag_resv_type type);
+
+void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+		struct xfs_alloc_arg *args);
+void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
+		struct xfs_trans *tp, xfs_extlen_t len);
+
+#endif	/* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 05b5243d89f6..ca75dc90ebe0 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -37,6 +37,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
+#include "xfs_ag_resv.h"
 
 struct workqueue_struct *xfs_alloc_wq;
 
@@ -74,14 +75,8 @@ xfs_prealloc_blocks(
  * extents need to be actually allocated. To get around this, we explicitly set
  * aside a few blocks which will not be reserved in delayed allocation.
  *
- * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
- * and 4 more to handle a potential split of the file's bmap btree.
- *
- * When rmap is enabled, we must also be able to handle two rmap btree inserts
- * to record both the file data extent and a new bmbt block. The bmbt block
- * might not be in the same AG as the file data extent. In the worst case
- * the bmap btree splits multiple levels and all the new blocks come from
- * different AGs, so set aside enough to handle rmap btree splits in all AGs.
+ * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
+ * potential split of the file's bmap btree.
  */
 unsigned int
 xfs_alloc_set_aside(
@@ -90,8 +85,6 @@ xfs_alloc_set_aside(
 	unsigned int	blocks;
 
 	blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
-		blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
 	return blocks;
 }
 
@@ -265,7 +258,7 @@ xfs_alloc_compute_diff(
 	xfs_agblock_t	wantbno,	/* target starting block */
 	xfs_extlen_t	wantlen,	/* target length */
 	xfs_extlen_t	alignment,	/* target alignment */
-	char		userdata,	/* are we allocating data? */
+	int		datatype,	/* are we allocating data? */
 	xfs_agblock_t	freebno,	/* freespace's starting block */
 	xfs_extlen_t	freelen,	/* freespace's length */
 	xfs_agblock_t	*newbnop)	/* result: best start block from free */
@@ -276,6 +269,7 @@ xfs_alloc_compute_diff(
 	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
 	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
 	xfs_agblock_t	wantend;	/* end of target extent */
+	bool		userdata = xfs_alloc_is_userdata(datatype);
 
 	ASSERT(freelen >= wantlen);
 	freeend = freebno + freelen;
@@ -680,12 +674,29 @@ xfs_alloc_ag_vextent(
 	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
 {
 	int		error=0;
+	xfs_extlen_t	reservation;
+	xfs_extlen_t	oldmax;
 
 	ASSERT(args->minlen > 0);
 	ASSERT(args->maxlen > 0);
 	ASSERT(args->minlen <= args->maxlen);
 	ASSERT(args->mod < args->prod);
 	ASSERT(args->alignment > 0);
+
+	/*
+	 * Clamp maxlen to the amount of free space minus any reservations
+	 * that have been made.
+	 */
+	oldmax = args->maxlen;
+	reservation = xfs_ag_resv_needed(args->pag, args->resv);
+	if (args->maxlen > args->pag->pagf_freeblks - reservation)
+		args->maxlen = args->pag->pagf_freeblks - reservation;
+	if (args->maxlen == 0) {
+		args->agbno = NULLAGBLOCK;
+		args->maxlen = oldmax;
+		return 0;
+	}
+
 	/*
 	 * Branch to correct routine based on the type.
 	 */
@@ -705,12 +716,14 @@ xfs_alloc_ag_vextent(
 		/* NOTREACHED */
 	}
 
+	args->maxlen = oldmax;
+
 	if (error || args->agbno == NULLAGBLOCK)
 		return error;
 
 	ASSERT(args->len >= args->minlen);
 	ASSERT(args->len <= args->maxlen);
-	ASSERT(!args->wasfromfl || !args->isfl);
+	ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
 	ASSERT(args->agbno % args->alignment == 0);
 
 	/* if not file data, insert new block into the reverse map btree */
@@ -732,12 +745,7 @@ xfs_alloc_ag_vextent(
 				args->agbno, args->len));
 	}
 
-	if (!args->isfl) {
-		xfs_trans_mod_sb(args->tp, args->wasdel ?
-				 XFS_TRANS_SB_RES_FDBLOCKS :
-				 XFS_TRANS_SB_FDBLOCKS,
-				 -((long)(args->len)));
-	}
+	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
 
 	XFS_STATS_INC(args->mp, xs_allocx);
 	XFS_STATS_ADD(args->mp, xs_allocb, args->len);
@@ -917,7 +925,7 @@ xfs_alloc_find_best_extent(
 
 		sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
 					       args->alignment,
-					       args->userdata, *sbnoa,
+					       args->datatype, *sbnoa,
 					       *slena, &new);
 
 		/*
@@ -1101,7 +1109,7 @@ restart:
 			if (args->len < blen)
 				continue;
 			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, args->userdata, ltbnoa,
+				args->alignment, args->datatype, ltbnoa,
 				ltlena, &ltnew);
 			if (ltnew != NULLAGBLOCK &&
 			    (args->len > blen || ltdiff < bdiff)) {
@@ -1254,7 +1262,7 @@ restart:
1254 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1262 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1255 xfs_alloc_fix_len(args); 1263 xfs_alloc_fix_len(args);
1256 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1264 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1257 args->alignment, args->userdata, ltbnoa, 1265 args->alignment, args->datatype, ltbnoa,
1258 ltlena, &ltnew); 1266 ltlena, &ltnew);
1259 1267
1260 error = xfs_alloc_find_best_extent(args, 1268 error = xfs_alloc_find_best_extent(args,
@@ -1271,7 +1279,7 @@ restart:
1271 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1279 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1272 xfs_alloc_fix_len(args); 1280 xfs_alloc_fix_len(args);
1273 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1281 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1274 args->alignment, args->userdata, gtbnoa, 1282 args->alignment, args->datatype, gtbnoa,
1275 gtlena, &gtnew); 1283 gtlena, &gtnew);
1276 1284
1277 error = xfs_alloc_find_best_extent(args, 1285 error = xfs_alloc_find_best_extent(args,
@@ -1331,7 +1339,7 @@ restart:
1331 } 1339 }
1332 rlen = args->len; 1340 rlen = args->len;
1333 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, 1341 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
1334 args->userdata, ltbnoa, ltlena, &ltnew); 1342 args->datatype, ltbnoa, ltlena, &ltnew);
1335 ASSERT(ltnew >= ltbno); 1343 ASSERT(ltnew >= ltbno);
1336 ASSERT(ltnew + rlen <= ltbnoa + ltlena); 1344 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1337 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1345 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
@@ -1583,6 +1591,7 @@ xfs_alloc_ag_vextent_small(
1583 int *stat) /* status: 0-freelist, 1-normal/none */ 1591 int *stat) /* status: 0-freelist, 1-normal/none */
1584{ 1592{
1585 struct xfs_owner_info oinfo; 1593 struct xfs_owner_info oinfo;
1594 struct xfs_perag *pag;
1586 int error; 1595 int error;
1587 xfs_agblock_t fbno; 1596 xfs_agblock_t fbno;
1588 xfs_extlen_t flen; 1597 xfs_extlen_t flen;
@@ -1600,7 +1609,8 @@ xfs_alloc_ag_vextent_small(
1600 * to respect minleft even when pulling from the 1609 * to respect minleft even when pulling from the
1601 * freelist. 1610 * freelist.
1602 */ 1611 */
1603 else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && 1612 else if (args->minlen == 1 && args->alignment == 1 &&
1613 args->resv != XFS_AG_RESV_AGFL &&
1604 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) 1614 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
1605 > args->minleft)) { 1615 > args->minleft)) {
1606 error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); 1616 error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
@@ -1608,9 +1618,9 @@ xfs_alloc_ag_vextent_small(
1608 goto error0; 1618 goto error0;
1609 if (fbno != NULLAGBLOCK) { 1619 if (fbno != NULLAGBLOCK) {
1610 xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, 1620 xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
1611 args->userdata); 1621 xfs_alloc_allow_busy_reuse(args->datatype));
1612 1622
1613 if (args->userdata) { 1623 if (xfs_alloc_is_userdata(args->datatype)) {
1614 xfs_buf_t *bp; 1624 xfs_buf_t *bp;
1615 1625
1616 bp = xfs_btree_get_bufs(args->mp, args->tp, 1626 bp = xfs_btree_get_bufs(args->mp, args->tp,
@@ -1629,13 +1639,18 @@ xfs_alloc_ag_vextent_small(
1629 /* 1639 /*
1630 * If we're feeding an AGFL block to something that 1640 * If we're feeding an AGFL block to something that
1631 * doesn't live in the free space, we need to clear 1641 * doesn't live in the free space, we need to clear
1632 * out the OWN_AG rmap. 1642 * out the OWN_AG rmap and add the block back to
1643 * the AGFL per-AG reservation.
1633 */ 1644 */
1634 xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); 1645 xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
1635 error = xfs_rmap_free(args->tp, args->agbp, args->agno, 1646 error = xfs_rmap_free(args->tp, args->agbp, args->agno,
1636 fbno, 1, &oinfo); 1647 fbno, 1, &oinfo);
1637 if (error) 1648 if (error)
1638 goto error0; 1649 goto error0;
1650 pag = xfs_perag_get(args->mp, args->agno);
1651 xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL,
1652 args->tp, 1);
1653 xfs_perag_put(pag);
1639 1654
1640 *stat = 0; 1655 *stat = 0;
1641 return 0; 1656 return 0;
@@ -1683,7 +1698,7 @@ xfs_free_ag_extent(
1683 xfs_agblock_t bno, 1698 xfs_agblock_t bno,
1684 xfs_extlen_t len, 1699 xfs_extlen_t len,
1685 struct xfs_owner_info *oinfo, 1700 struct xfs_owner_info *oinfo,
1686 int isfl) 1701 enum xfs_ag_resv_type type)
1687{ 1702{
1688 xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ 1703 xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
1689 xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ 1704 xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
@@ -1911,21 +1926,22 @@ xfs_free_ag_extent(
1911 */ 1926 */
1912 pag = xfs_perag_get(mp, agno); 1927 pag = xfs_perag_get(mp, agno);
1913 error = xfs_alloc_update_counters(tp, pag, agbp, len); 1928 error = xfs_alloc_update_counters(tp, pag, agbp, len);
1929 xfs_ag_resv_free_extent(pag, type, tp, len);
1914 xfs_perag_put(pag); 1930 xfs_perag_put(pag);
1915 if (error) 1931 if (error)
1916 goto error0; 1932 goto error0;
1917 1933
1918 if (!isfl)
1919 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1920 XFS_STATS_INC(mp, xs_freex); 1934 XFS_STATS_INC(mp, xs_freex);
1921 XFS_STATS_ADD(mp, xs_freeb, len); 1935 XFS_STATS_ADD(mp, xs_freeb, len);
1922 1936
1923 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1937 trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
1938 haveleft, haveright);
1924 1939
1925 return 0; 1940 return 0;
1926 1941
1927 error0: 1942 error0:
1928 trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); 1943 trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL,
1944 -1, -1);
1929 if (bno_cur) 1945 if (bno_cur)
1930 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1946 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1931 if (cnt_cur) 1947 if (cnt_cur)
@@ -1950,21 +1966,43 @@ xfs_alloc_compute_maxlevels(
1950} 1966}
1951 1967
1952/* 1968/*
1953 * Find the length of the longest extent in an AG. 1969 * Find the length of the longest extent in an AG. The 'need' parameter
1970 * specifies how much space we're going to need for the AGFL and the
1971 * 'reserved' parameter tells us how many blocks in this AG are reserved for
1972 * other callers.
1954 */ 1973 */
1955xfs_extlen_t 1974xfs_extlen_t
1956xfs_alloc_longest_free_extent( 1975xfs_alloc_longest_free_extent(
1957 struct xfs_mount *mp, 1976 struct xfs_mount *mp,
1958 struct xfs_perag *pag, 1977 struct xfs_perag *pag,
1959 xfs_extlen_t need) 1978 xfs_extlen_t need,
1979 xfs_extlen_t reserved)
1960{ 1980{
1961 xfs_extlen_t delta = 0; 1981 xfs_extlen_t delta = 0;
1962 1982
1983 /*
1984 * If the AGFL needs a recharge, we'll have to subtract that from the
1985 * longest extent.
1986 */
1963 if (need > pag->pagf_flcount) 1987 if (need > pag->pagf_flcount)
1964 delta = need - pag->pagf_flcount; 1988 delta = need - pag->pagf_flcount;
1965 1989
1990 /*
1991 * If we cannot maintain others' reservations with space from the
1992 * not-longest freesp extents, we'll have to subtract /that/ from
1993 * the longest extent too.
1994 */
1995 if (pag->pagf_freeblks - pag->pagf_longest < reserved)
1996 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
1997
1998 /*
1999 * If the longest extent is long enough to satisfy all the
2000 * reservations and AGFL rules in place, we can return this extent.
2001 */
1966 if (pag->pagf_longest > delta) 2002 if (pag->pagf_longest > delta)
1967 return pag->pagf_longest - delta; 2003 return pag->pagf_longest - delta;
2004
2005 /* Otherwise, let the caller try for 1 block if there's space. */
1968 return pag->pagf_flcount > 0 || pag->pagf_longest > 0; 2006 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
1969} 2007}
1970 2008
@@ -2004,20 +2042,24 @@ xfs_alloc_space_available(
2004{ 2042{
2005 struct xfs_perag *pag = args->pag; 2043 struct xfs_perag *pag = args->pag;
2006 xfs_extlen_t longest; 2044 xfs_extlen_t longest;
2045 xfs_extlen_t reservation; /* blocks that are still reserved */
2007 int available; 2046 int available;
2008 2047
2009 if (flags & XFS_ALLOC_FLAG_FREEING) 2048 if (flags & XFS_ALLOC_FLAG_FREEING)
2010 return true; 2049 return true;
2011 2050
2051 reservation = xfs_ag_resv_needed(pag, args->resv);
2052
2012 /* do we have enough contiguous free space for the allocation? */ 2053 /* do we have enough contiguous free space for the allocation? */
2013 longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); 2054 longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
2055 reservation);
2014 if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) 2056 if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
2015 return false; 2057 return false;
2016 2058
2017 /* do have enough free space remaining for the allocation? */ 2059 /* do we have enough free space remaining for the allocation? */
2018 available = (int)(pag->pagf_freeblks + pag->pagf_flcount - 2060 available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
2019 min_free - args->total); 2061 reservation - min_free - args->total);
2020 if (available < (int)args->minleft) 2062 if (available < (int)args->minleft || available <= 0)
2021 return false; 2063 return false;
2022 2064
2023 return true; 2065 return true;
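
As a worked example of the updated check: an AG with pagf_freeblks = 100, pagf_flcount = 4, a 20-block reservation, min_free = 6 and args->total = 10 yields available = 100 + 4 - 20 - 6 - 10 = 68, so the AG qualifies whenever args->minleft <= 68 and the contiguous-space test above also passes. Before this change the 20 reserved blocks would have counted as available.
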
@@ -2058,7 +2100,7 @@ xfs_alloc_fix_freelist(
2058 * somewhere else if we are not being asked to try harder at this 2100 * somewhere else if we are not being asked to try harder at this
2059 * point 2101 * point
2060 */ 2102 */
2061 if (pag->pagf_metadata && args->userdata && 2103 if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) &&
2062 (flags & XFS_ALLOC_FLAG_TRYLOCK)) { 2104 (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2063 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); 2105 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2064 goto out_agbp_relse; 2106 goto out_agbp_relse;
@@ -2124,7 +2166,7 @@ xfs_alloc_fix_freelist(
2124 if (error) 2166 if (error)
2125 goto out_agbp_relse; 2167 goto out_agbp_relse;
2126 error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 2168 error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
2127 &targs.oinfo, 1); 2169 &targs.oinfo, XFS_AG_RESV_AGFL);
2128 if (error) 2170 if (error)
2129 goto out_agbp_relse; 2171 goto out_agbp_relse;
2130 bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); 2172 bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
@@ -2135,7 +2177,7 @@ xfs_alloc_fix_freelist(
2135 targs.mp = mp; 2177 targs.mp = mp;
2136 targs.agbp = agbp; 2178 targs.agbp = agbp;
2137 targs.agno = args->agno; 2179 targs.agno = args->agno;
2138 targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; 2180 targs.alignment = targs.minlen = targs.prod = 1;
2139 targs.type = XFS_ALLOCTYPE_THIS_AG; 2181 targs.type = XFS_ALLOCTYPE_THIS_AG;
2140 targs.pag = pag; 2182 targs.pag = pag;
2141 error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); 2183 error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
@@ -2146,6 +2188,7 @@ xfs_alloc_fix_freelist(
2146 while (pag->pagf_flcount < need) { 2188 while (pag->pagf_flcount < need) {
2147 targs.agbno = 0; 2189 targs.agbno = 0;
2148 targs.maxlen = need - pag->pagf_flcount; 2190 targs.maxlen = need - pag->pagf_flcount;
2191 targs.resv = XFS_AG_RESV_AGFL;
2149 2192
2150 /* Allocate as many blocks as possible at once. */ 2193 /* Allocate as many blocks as possible at once. */
2151 error = xfs_alloc_ag_vextent(&targs); 2194 error = xfs_alloc_ag_vextent(&targs);
@@ -2633,7 +2676,7 @@ xfs_alloc_vextent(
2633 * Try near allocation first, then anywhere-in-ag after 2676 * Try near allocation first, then anywhere-in-ag after
2634 * the first a.g. fails. 2677 * the first a.g. fails.
2635 */ 2678 */
2636 if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) && 2679 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
2637 (mp->m_flags & XFS_MOUNT_32BITINODES)) { 2680 (mp->m_flags & XFS_MOUNT_32BITINODES)) {
2638 args->fsbno = XFS_AGB_TO_FSB(mp, 2681 args->fsbno = XFS_AGB_TO_FSB(mp,
2639 ((mp->m_agfrotor / rotorstep) % 2682 ((mp->m_agfrotor / rotorstep) %
@@ -2766,7 +2809,7 @@ xfs_alloc_vextent(
2766#endif 2809#endif
2767 2810
2768 /* Zero the extent if we were asked to do so */ 2811 /* Zero the extent if we were asked to do so */
2769 if (args->userdata & XFS_ALLOC_USERDATA_ZERO) { 2812 if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {
2770 error = xfs_zero_extent(args->ip, args->fsbno, args->len); 2813 error = xfs_zero_extent(args->ip, args->fsbno, args->len);
2771 if (error) 2814 if (error)
2772 goto error0; 2815 goto error0;
@@ -2825,7 +2868,8 @@ xfs_free_extent(
2825 struct xfs_trans *tp, /* transaction pointer */ 2868 struct xfs_trans *tp, /* transaction pointer */
2826 xfs_fsblock_t bno, /* starting block number of extent */ 2869 xfs_fsblock_t bno, /* starting block number of extent */
2827 xfs_extlen_t len, /* length of extent */ 2870 xfs_extlen_t len, /* length of extent */
2828 struct xfs_owner_info *oinfo) /* extent owner */ 2871 struct xfs_owner_info *oinfo, /* extent owner */
2872 enum xfs_ag_resv_type type) /* block reservation type */
2829{ 2873{
2830 struct xfs_mount *mp = tp->t_mountp; 2874 struct xfs_mount *mp = tp->t_mountp;
2831 struct xfs_buf *agbp; 2875 struct xfs_buf *agbp;
@@ -2834,6 +2878,7 @@ xfs_free_extent(
2834 int error; 2878 int error;
2835 2879
2836 ASSERT(len != 0); 2880 ASSERT(len != 0);
2881 ASSERT(type != XFS_AG_RESV_AGFL);
2837 2882
2838 if (XFS_TEST_ERROR(false, mp, 2883 if (XFS_TEST_ERROR(false, mp,
2839 XFS_ERRTAG_FREE_EXTENT, 2884 XFS_ERRTAG_FREE_EXTENT,
@@ -2851,7 +2896,7 @@ xfs_free_extent(
2851 agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), 2896 agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
2852 err); 2897 err);
2853 2898
2854 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0); 2899 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
2855 if (error) 2900 if (error)
2856 goto err; 2901 goto err;
2857 2902
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6fe2d6b7cfe9..7c404a6b0ae3 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -85,20 +85,33 @@ typedef struct xfs_alloc_arg {
85 xfs_extlen_t len; /* output: actual size of extent */ 85 xfs_extlen_t len; /* output: actual size of extent */
86 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ 86 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
87 xfs_alloctype_t otype; /* original allocation type */ 87 xfs_alloctype_t otype; /* original allocation type */
88 int datatype; /* mask defining data type treatment */
88 char wasdel; /* set if allocation was prev delayed */ 89 char wasdel; /* set if allocation was prev delayed */
89 char wasfromfl; /* set if allocation is from freelist */ 90 char wasfromfl; /* set if allocation is from freelist */
90 char isfl; /* set if is freelist blocks - !acctg */
91 char userdata; /* mask defining userdata treatment */
92 xfs_fsblock_t firstblock; /* io first block allocated */ 91 xfs_fsblock_t firstblock; /* io first block allocated */
93 struct xfs_owner_info oinfo; /* owner of blocks being allocated */ 92 struct xfs_owner_info oinfo; /* owner of blocks being allocated */
93 enum xfs_ag_resv_type resv; /* block reservation to use */
94} xfs_alloc_arg_t; 94} xfs_alloc_arg_t;
95 95
96/* 96/*
97 * Defines for userdata 97 * Defines for datatype
98 */ 98 */
99#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ 99#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
100#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ 100#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
101#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ 101#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
102#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */
103
104static inline bool
105xfs_alloc_is_userdata(int datatype)
106{
107 return (datatype & ~XFS_ALLOC_NOBUSY) != 0;
108}
109
110static inline bool
111xfs_alloc_allow_busy_reuse(int datatype)
112{
113 return (datatype & XFS_ALLOC_NOBUSY) == 0;
114}
102 115
103/* freespace limit calculations */ 116/* freespace limit calculations */
104#define XFS_ALLOC_AGFL_RESERVE 4 117#define XFS_ALLOC_AGFL_RESERVE 4
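
The two helpers encode an asymmetry worth spelling out: XFS_ALLOC_NOBUSY marks allocations (such as remote attribute blocks) that must avoid busy extents without being user data. A small self-checking sketch of the truth table, with the defines copied from this hunk:

#include <assert.h>
#include <stdbool.h>

#define XFS_ALLOC_USERDATA		(1 << 0)
#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 1)
#define XFS_ALLOC_USERDATA_ZERO		(1 << 2)
#define XFS_ALLOC_NOBUSY		(1 << 3)

static bool is_userdata(int datatype)
{
	/* Any bit other than NOBUSY marks a user-data allocation. */
	return (datatype & ~XFS_ALLOC_NOBUSY) != 0;
}

static bool allow_busy_reuse(int datatype)
{
	/* Only the NOBUSY bit forbids reusing busy extents. */
	return (datatype & XFS_ALLOC_NOBUSY) == 0;
}

int main(void)
{
	/* Metadata: no flags; not userdata, busy reuse allowed. */
	assert(!is_userdata(0) && allow_busy_reuse(0));

	/* Remote attr blocks: NOBUSY only; still not userdata. */
	assert(!is_userdata(XFS_ALLOC_NOBUSY));
	assert(!allow_busy_reuse(XFS_ALLOC_NOBUSY));

	/* Ordinary file data, set up as in xfs_bmapi_allocate(). */
	assert(is_userdata(XFS_ALLOC_NOBUSY | XFS_ALLOC_USERDATA));
	return 0;
}
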
@@ -106,7 +119,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
106unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); 119unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
107 120
108xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, 121xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
109 struct xfs_perag *pag, xfs_extlen_t need); 122 struct xfs_perag *pag, xfs_extlen_t need,
123 xfs_extlen_t reserved);
110unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, 124unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
111 struct xfs_perag *pag); 125 struct xfs_perag *pag);
112 126
@@ -184,7 +198,8 @@ xfs_free_extent(
184 struct xfs_trans *tp, /* transaction pointer */ 198 struct xfs_trans *tp, /* transaction pointer */
185 xfs_fsblock_t bno, /* starting block number of extent */ 199 xfs_fsblock_t bno, /* starting block number of extent */
186 xfs_extlen_t len, /* length of extent */ 200 xfs_extlen_t len, /* length of extent */
187 struct xfs_owner_info *oinfo);/* extent owner */ 201 struct xfs_owner_info *oinfo, /* extent owner */
202 enum xfs_ag_resv_type type); /* block reservation type */
188 203
189int /* error */ 204int /* error */
190xfs_alloc_lookup_ge( 205xfs_alloc_lookup_ge(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b060bca93402..9d7f61d36645 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -47,6 +47,7 @@
47#include "xfs_attr_leaf.h" 47#include "xfs_attr_leaf.h"
48#include "xfs_filestream.h" 48#include "xfs_filestream.h"
49#include "xfs_rmap.h" 49#include "xfs_rmap.h"
50#include "xfs_ag_resv.h"
50 51
51 52
52kmem_zone_t *xfs_bmap_free_item_zone; 53kmem_zone_t *xfs_bmap_free_item_zone;
@@ -1388,7 +1389,7 @@ xfs_bmap_search_multi_extents(
1388 * Else, *lastxp will be set to the index of the found 1389 * Else, *lastxp will be set to the index of the found
1389 * entry; *gotp will contain the entry. 1390 * entry; *gotp will contain the entry.
1390 */ 1391 */
1391STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ 1392xfs_bmbt_rec_host_t * /* pointer to found extent entry */
1392xfs_bmap_search_extents( 1393xfs_bmap_search_extents(
1393 xfs_inode_t *ip, /* incore inode pointer */ 1394 xfs_inode_t *ip, /* incore inode pointer */
1394 xfs_fileoff_t bno, /* block number searched for */ 1395 xfs_fileoff_t bno, /* block number searched for */
@@ -3347,7 +3348,8 @@ xfs_bmap_adjacent(
3347 3348
3348 mp = ap->ip->i_mount; 3349 mp = ap->ip->i_mount;
3349 nullfb = *ap->firstblock == NULLFSBLOCK; 3350 nullfb = *ap->firstblock == NULLFSBLOCK;
3350 rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; 3351 rt = XFS_IS_REALTIME_INODE(ap->ip) &&
3352 xfs_alloc_is_userdata(ap->datatype);
3351 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); 3353 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
3352 /* 3354 /*
3353 * If allocating at eof, and there's a previous real block, 3355 * If allocating at eof, and there's a previous real block,
@@ -3501,7 +3503,8 @@ xfs_bmap_longest_free_extent(
3501 } 3503 }
3502 3504
3503 longest = xfs_alloc_longest_free_extent(mp, pag, 3505 longest = xfs_alloc_longest_free_extent(mp, pag,
3504 xfs_alloc_min_freelist(mp, pag)); 3506 xfs_alloc_min_freelist(mp, pag),
3507 xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
3505 if (*blen < longest) 3508 if (*blen < longest)
3506 *blen = longest; 3509 *blen = longest;
3507 3510
@@ -3622,7 +3625,7 @@ xfs_bmap_btalloc(
3622{ 3625{
3623 xfs_mount_t *mp; /* mount point structure */ 3626 xfs_mount_t *mp; /* mount point structure */
3624 xfs_alloctype_t atype = 0; /* type for allocation routines */ 3627 xfs_alloctype_t atype = 0; /* type for allocation routines */
3625 xfs_extlen_t align; /* minimum allocation alignment */ 3628 xfs_extlen_t align = 0; /* minimum allocation alignment */
3626 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ 3629 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
3627 xfs_agnumber_t ag; 3630 xfs_agnumber_t ag;
3628 xfs_alloc_arg_t args; 3631 xfs_alloc_arg_t args;
@@ -3645,7 +3648,8 @@ xfs_bmap_btalloc(
3645 else if (mp->m_dalign) 3648 else if (mp->m_dalign)
3646 stripe_align = mp->m_dalign; 3649 stripe_align = mp->m_dalign;
3647 3650
3648 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; 3651 if (xfs_alloc_is_userdata(ap->datatype))
3652 align = xfs_get_extsz_hint(ap->ip);
3649 if (unlikely(align)) { 3653 if (unlikely(align)) {
3650 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, 3654 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
3651 align, 0, ap->eof, 0, ap->conv, 3655 align, 0, ap->eof, 0, ap->conv,
@@ -3658,7 +3662,8 @@ xfs_bmap_btalloc(
3658 nullfb = *ap->firstblock == NULLFSBLOCK; 3662 nullfb = *ap->firstblock == NULLFSBLOCK;
3659 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); 3663 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
3660 if (nullfb) { 3664 if (nullfb) {
3661 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { 3665 if (xfs_alloc_is_userdata(ap->datatype) &&
3666 xfs_inode_is_filestream(ap->ip)) {
3662 ag = xfs_filestream_lookup_ag(ap->ip); 3667 ag = xfs_filestream_lookup_ag(ap->ip);
3663 ag = (ag != NULLAGNUMBER) ? ag : 0; 3668 ag = (ag != NULLAGNUMBER) ? ag : 0;
3664 ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); 3669 ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
@@ -3698,7 +3703,8 @@ xfs_bmap_btalloc(
3698 * enough for the request. If one isn't found, then adjust 3703 * enough for the request. If one isn't found, then adjust
3699 * the minimum allocation size to the largest space found. 3704 * the minimum allocation size to the largest space found.
3700 */ 3705 */
3701 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) 3706 if (xfs_alloc_is_userdata(ap->datatype) &&
3707 xfs_inode_is_filestream(ap->ip))
3702 error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); 3708 error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
3703 else 3709 else
3704 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); 3710 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
@@ -3781,9 +3787,9 @@ xfs_bmap_btalloc(
3781 } 3787 }
3782 args.minleft = ap->minleft; 3788 args.minleft = ap->minleft;
3783 args.wasdel = ap->wasdel; 3789 args.wasdel = ap->wasdel;
3784 args.isfl = 0; 3790 args.resv = XFS_AG_RESV_NONE;
3785 args.userdata = ap->userdata; 3791 args.datatype = ap->datatype;
3786 if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) 3792 if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
3787 args.ip = ap->ip; 3793 args.ip = ap->ip;
3788 3794
3789 error = xfs_alloc_vextent(&args); 3795 error = xfs_alloc_vextent(&args);
@@ -3877,7 +3883,8 @@ STATIC int
3877xfs_bmap_alloc( 3883xfs_bmap_alloc(
3878 struct xfs_bmalloca *ap) /* bmap alloc argument struct */ 3884 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
3879{ 3885{
3880 if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) 3886 if (XFS_IS_REALTIME_INODE(ap->ip) &&
3887 xfs_alloc_is_userdata(ap->datatype))
3881 return xfs_bmap_rtalloc(ap); 3888 return xfs_bmap_rtalloc(ap);
3882 return xfs_bmap_btalloc(ap); 3889 return xfs_bmap_btalloc(ap);
3883} 3890}
@@ -4074,7 +4081,7 @@ xfs_bmapi_read(
4074 return 0; 4081 return 0;
4075} 4082}
4076 4083
4077STATIC int 4084int
4078xfs_bmapi_reserve_delalloc( 4085xfs_bmapi_reserve_delalloc(
4079 struct xfs_inode *ip, 4086 struct xfs_inode *ip,
4080 xfs_fileoff_t aoff, 4087 xfs_fileoff_t aoff,
@@ -4170,91 +4177,6 @@ out_unreserve_quota:
4170 return error; 4177 return error;
4171} 4178}
4172 4179
4173/*
4174 * Map file blocks to filesystem blocks, adding delayed allocations as needed.
4175 */
4176int
4177xfs_bmapi_delay(
4178 struct xfs_inode *ip, /* incore inode */
4179 xfs_fileoff_t bno, /* starting file offs. mapped */
4180 xfs_filblks_t len, /* length to map in file */
4181 struct xfs_bmbt_irec *mval, /* output: map values */
4182 int *nmap, /* i/o: mval size/count */
4183 int flags) /* XFS_BMAPI_... */
4184{
4185 struct xfs_mount *mp = ip->i_mount;
4186 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
4187 struct xfs_bmbt_irec got; /* current file extent record */
4188 struct xfs_bmbt_irec prev; /* previous file extent record */
4189 xfs_fileoff_t obno; /* old block number (offset) */
4190 xfs_fileoff_t end; /* end of mapped file region */
4191 xfs_extnum_t lastx; /* last useful extent number */
4192 int eof; /* we've hit the end of extents */
4193 int n = 0; /* current extent index */
4194 int error = 0;
4195
4196 ASSERT(*nmap >= 1);
4197 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4198 ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
4199 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
4200
4201 if (unlikely(XFS_TEST_ERROR(
4202 (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
4203 XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
4204 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4205 XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
4206 return -EFSCORRUPTED;
4207 }
4208
4209 if (XFS_FORCED_SHUTDOWN(mp))
4210 return -EIO;
4211
4212 XFS_STATS_INC(mp, xs_blk_mapw);
4213
4214 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4215 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
4216 if (error)
4217 return error;
4218 }
4219
4220 xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev);
4221 end = bno + len;
4222 obno = bno;
4223
4224 while (bno < end && n < *nmap) {
4225 if (eof || got.br_startoff > bno) {
4226 error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got,
4227 &prev, &lastx, eof);
4228 if (error) {
4229 if (n == 0) {
4230 *nmap = 0;
4231 return error;
4232 }
4233 break;
4234 }
4235 }
4236
4237 /* set up the extent map to return. */
4238 xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags);
4239 xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags);
4240
4241 /* If we're done, stop now. */
4242 if (bno >= end || n >= *nmap)
4243 break;
4244
4245 /* Else go on to the next record. */
4246 prev = got;
4247 if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
4248 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
4249 else
4250 eof = 1;
4251 }
4252
4253 *nmap = n;
4254 return 0;
4255}
4256
4257
4258static int 4180static int
4259xfs_bmapi_allocate( 4181xfs_bmapi_allocate(
4260 struct xfs_bmalloca *bma) 4182 struct xfs_bmalloca *bma)
@@ -4287,15 +4209,21 @@ xfs_bmapi_allocate(
4287 } 4209 }
4288 4210
4289 /* 4211 /*
4290 * Indicate if this is the first user data in the file, or just any 4212 * Set the data type being allocated. For the data fork, the first data
4291 * user data. And if it is userdata, indicate whether it needs to 4213 * in the file is treated differently to all other allocations. For the
4292 * be initialised to zero during allocation. 4214 * attribute fork, we only need to ensure the allocated range is not on
4215 * the busy list.
4293 */ 4216 */
4294 if (!(bma->flags & XFS_BMAPI_METADATA)) { 4217 if (!(bma->flags & XFS_BMAPI_METADATA)) {
4295 bma->userdata = (bma->offset == 0) ? 4218 bma->datatype = XFS_ALLOC_NOBUSY;
4296 XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; 4219 if (whichfork == XFS_DATA_FORK) {
4220 if (bma->offset == 0)
4221 bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
4222 else
4223 bma->datatype |= XFS_ALLOC_USERDATA;
4224 }
4297 if (bma->flags & XFS_BMAPI_ZERO) 4225 if (bma->flags & XFS_BMAPI_ZERO)
4298 bma->userdata |= XFS_ALLOC_USERDATA_ZERO; 4226 bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
4299 } 4227 }
4300 4228
4301 bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; 4229 bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4565,7 +4493,7 @@ xfs_bmapi_write(
4565 bma.tp = tp; 4493 bma.tp = tp;
4566 bma.ip = ip; 4494 bma.ip = ip;
4567 bma.total = total; 4495 bma.total = total;
4568 bma.userdata = 0; 4496 bma.datatype = 0;
4569 bma.dfops = dfops; 4497 bma.dfops = dfops;
4570 bma.firstblock = firstblock; 4498 bma.firstblock = firstblock;
4571 4499
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 254034f96941..8395f6e8cf7d 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -54,7 +54,7 @@ struct xfs_bmalloca {
54 bool wasdel; /* replacing a delayed allocation */ 54 bool wasdel; /* replacing a delayed allocation */
55 bool aeof; /* allocated space at eof */ 55 bool aeof; /* allocated space at eof */
56 bool conv; /* overwriting unwritten extents */ 56 bool conv; /* overwriting unwritten extents */
57 char userdata;/* userdata mask */ 57 int datatype;/* data type being allocated */
58 int flags; 58 int flags;
59}; 59};
60 60
@@ -181,9 +181,6 @@ int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
181int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, 181int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
182 xfs_filblks_t len, struct xfs_bmbt_irec *mval, 182 xfs_filblks_t len, struct xfs_bmbt_irec *mval,
183 int *nmap, int flags); 183 int *nmap, int flags);
184int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
185 xfs_filblks_t len, struct xfs_bmbt_irec *mval,
186 int *nmap, int flags);
187int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, 184int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
188 xfs_fileoff_t bno, xfs_filblks_t len, int flags, 185 xfs_fileoff_t bno, xfs_filblks_t len, int flags,
189 xfs_fsblock_t *firstblock, xfs_extlen_t total, 186 xfs_fsblock_t *firstblock, xfs_extlen_t total,
@@ -202,5 +199,12 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
202 struct xfs_defer_ops *dfops, enum shift_direction direction, 199 struct xfs_defer_ops *dfops, enum shift_direction direction,
203 int num_exts); 200 int num_exts);
204int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); 201int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
202struct xfs_bmbt_rec_host *
203 xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
204 int fork, int *eofp, xfs_extnum_t *lastxp,
205 struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
206int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff,
207 xfs_filblks_t len, struct xfs_bmbt_irec *got,
208 struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof);
205 209
206#endif /* __XFS_BMAP_H__ */ 210#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 08569792fe20..aa1752f918b8 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2070,7 +2070,7 @@ __xfs_btree_updkeys(
2070 struct xfs_buf *bp0, 2070 struct xfs_buf *bp0,
2071 bool force_all) 2071 bool force_all)
2072{ 2072{
2073 union xfs_btree_bigkey key; /* keys from current level */ 2073 union xfs_btree_key key; /* keys from current level */
2074 union xfs_btree_key *lkey; /* keys from the next level up */ 2074 union xfs_btree_key *lkey; /* keys from the next level up */
2075 union xfs_btree_key *hkey; 2075 union xfs_btree_key *hkey;
2076 union xfs_btree_key *nlkey; /* keys from the next level up */ 2076 union xfs_btree_key *nlkey; /* keys from the next level up */
@@ -2086,7 +2086,7 @@ __xfs_btree_updkeys(
2086 2086
2087 trace_xfs_btree_updkeys(cur, level, bp0); 2087 trace_xfs_btree_updkeys(cur, level, bp0);
2088 2088
2089 lkey = (union xfs_btree_key *)&key; 2089 lkey = &key;
2090 hkey = xfs_btree_high_key_from_key(cur, lkey); 2090 hkey = xfs_btree_high_key_from_key(cur, lkey);
2091 xfs_btree_get_keys(cur, block, lkey); 2091 xfs_btree_get_keys(cur, block, lkey);
2092 for (level++; level < cur->bc_nlevels; level++) { 2092 for (level++; level < cur->bc_nlevels; level++) {
@@ -3226,7 +3226,7 @@ xfs_btree_insrec(
3226 struct xfs_buf *bp; /* buffer for block */ 3226 struct xfs_buf *bp; /* buffer for block */
3227 union xfs_btree_ptr nptr; /* new block ptr */ 3227 union xfs_btree_ptr nptr; /* new block ptr */
3228 struct xfs_btree_cur *ncur; /* new btree cursor */ 3228 struct xfs_btree_cur *ncur; /* new btree cursor */
3229 union xfs_btree_bigkey nkey; /* new block key */ 3229 union xfs_btree_key nkey; /* new block key */
3230 union xfs_btree_key *lkey; 3230 union xfs_btree_key *lkey;
3231 int optr; /* old key/record index */ 3231 int optr; /* old key/record index */
3232 int ptr; /* key/record index */ 3232 int ptr; /* key/record index */
@@ -3241,7 +3241,7 @@ xfs_btree_insrec(
3241 XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec); 3241 XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
3242 3242
3243 ncur = NULL; 3243 ncur = NULL;
3244 lkey = (union xfs_btree_key *)&nkey; 3244 lkey = &nkey;
3245 3245
3246 /* 3246 /*
3247 * If we have an external root pointer, and we've made it to the 3247 * If we have an external root pointer, and we've made it to the
@@ -3444,14 +3444,14 @@ xfs_btree_insert(
3444 union xfs_btree_ptr nptr; /* new block number (split result) */ 3444 union xfs_btree_ptr nptr; /* new block number (split result) */
3445 struct xfs_btree_cur *ncur; /* new cursor (split result) */ 3445 struct xfs_btree_cur *ncur; /* new cursor (split result) */
3446 struct xfs_btree_cur *pcur; /* previous level's cursor */ 3446 struct xfs_btree_cur *pcur; /* previous level's cursor */
3447 union xfs_btree_bigkey bkey; /* key of block to insert */ 3447 union xfs_btree_key bkey; /* key of block to insert */
3448 union xfs_btree_key *key; 3448 union xfs_btree_key *key;
3449 union xfs_btree_rec rec; /* record to insert */ 3449 union xfs_btree_rec rec; /* record to insert */
3450 3450
3451 level = 0; 3451 level = 0;
3452 ncur = NULL; 3452 ncur = NULL;
3453 pcur = cur; 3453 pcur = cur;
3454 key = (union xfs_btree_key *)&bkey; 3454 key = &bkey;
3455 3455
3456 xfs_btree_set_ptr_null(cur, &nptr); 3456 xfs_btree_set_ptr_null(cur, &nptr);
3457 3457
@@ -4797,3 +4797,50 @@ xfs_btree_query_range(
4797 return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, 4797 return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4798 fn, priv); 4798 fn, priv);
4799} 4799}
4800
4801/*
4802 * Calculate the number of blocks needed to store a given number of records
4803 * in a short-format (per-AG metadata) btree.
4804 */
4805xfs_extlen_t
4806xfs_btree_calc_size(
4807 struct xfs_mount *mp,
4808 uint *limits,
4809 unsigned long long len)
4810{
4811 int level;
4812 int maxrecs;
4813 xfs_extlen_t rval;
4814
4815 maxrecs = limits[0];
4816 for (level = 0, rval = 0; len > 1; level++) {
4817 len += maxrecs - 1;
4818 do_div(len, maxrecs);
4819 maxrecs = limits[1];
4820 rval += len;
4821 }
4822 return rval;
4823}
4824
4825int
4826xfs_btree_count_blocks_helper(
4827 struct xfs_btree_cur *cur,
4828 int level,
4829 void *data)
4830{
4831 xfs_extlen_t *blocks = data;
4832 (*blocks)++;
4833
4834 return 0;
4835}
4836
4837/* Count the blocks in a btree and return the result in *blocks. */
4838int
4839xfs_btree_count_blocks(
4840 struct xfs_btree_cur *cur,
4841 xfs_extlen_t *blocks)
4842{
4843 *blocks = 0;
4844 return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4845 blocks);
4846}
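
xfs_btree_calc_size() is a per-level ceiling division: records become leaves, leaves become interior nodes, and so on until a single root block remains. A userspace restatement (demo_* names are stand-ins; the fanouts in main are made-up example values):

#include <assert.h>

typedef unsigned int xfs_extlen_t;

/*
 * Ceil-divide the record count by each level's fanout and sum the
 * per-level block counts, as the kernel version does with do_div.
 * limits[0] is the leaf fanout, limits[1] the node fanout.
 */
static xfs_extlen_t demo_btree_calc_size(const unsigned int *limits,
					 unsigned long long len)
{
	unsigned int maxrecs = limits[0];
	xfs_extlen_t rval = 0;

	while (len > 1) {
		len = (len + maxrecs - 1) / maxrecs; /* blocks this level */
		rval += (xfs_extlen_t)len;
		maxrecs = limits[1];
	}
	return rval;
}

int main(void)
{
	unsigned int limits[2] = { 336, 142 };	/* made-up fanouts */

	/* 100000 records: 298 leaves + 3 nodes + 1 root = 302 blocks. */
	assert(demo_btree_calc_size(limits, 100000) == 302);
	return 0;
}
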
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 04d0865e5e6d..3f8556a5c2ad 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -37,30 +37,18 @@ union xfs_btree_ptr {
37 __be64 l; /* long form ptr */ 37 __be64 l; /* long form ptr */
38}; 38};
39 39
40union xfs_btree_key {
41 struct xfs_bmbt_key bmbt;
42 xfs_bmdr_key_t bmbr; /* bmbt root block */
43 xfs_alloc_key_t alloc;
44 struct xfs_inobt_key inobt;
45 struct xfs_rmap_key rmap;
46};
47
48/* 40/*
49 * In-core key that holds both low and high keys for overlapped btrees. 41 * The in-core btree key. Overlapping btrees actually store two keys
50 * The two keys are packed next to each other on disk, so do the same 42 * per pointer, so we reserve enough memory to hold both. The __*bigkey
51 * in memory. Preserve the existing xfs_btree_key as a single key to 43 * items should never be accessed directly.
52 * avoid the mental model breakage that would happen if we passed a
53 * bigkey into a function that operates on a single key.
54 */ 44 */
55union xfs_btree_bigkey { 45union xfs_btree_key {
56 struct xfs_bmbt_key bmbt; 46 struct xfs_bmbt_key bmbt;
57 xfs_bmdr_key_t bmbr; /* bmbt root block */ 47 xfs_bmdr_key_t bmbr; /* bmbt root block */
58 xfs_alloc_key_t alloc; 48 xfs_alloc_key_t alloc;
59 struct xfs_inobt_key inobt; 49 struct xfs_inobt_key inobt;
60 struct { 50 struct xfs_rmap_key rmap;
61 struct xfs_rmap_key rmap; 51 struct xfs_rmap_key __rmap_bigkey[2];
62 struct xfs_rmap_key rmap_hi;
63 };
64}; 52};
65 53
66union xfs_btree_rec { 54union xfs_btree_rec {
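
The replacement union relies on a sizing trick: the two-element __rmap_bigkey array forces the union to be large enough for the low/high key pair that overlapped btrees store contiguously, while ordinary callers keep naming the single-key members. A compact illustration with stand-in key structs:

#include <assert.h>
#include <stdint.h>

/* Stand-in key layouts; the real ones are the on-disk btree keys. */
struct rmap_key { uint32_t rm_startblock; uint64_t rm_owner, rm_offset; };
struct alloc_key { uint32_t ar_startblock, ar_blockcount; };

/*
 * One union now serves both cases; the array member exists only for
 * sizing and, as the comment above says, is never accessed directly.
 */
union btree_key {
	struct alloc_key	alloc;
	struct rmap_key		rmap;
	struct rmap_key		__rmap_bigkey[2];
};

int main(void)
{
	assert(sizeof(union btree_key) >= 2 * sizeof(struct rmap_key));
	return 0;
}
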
@@ -513,6 +501,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
513bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); 501bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
514uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, 502uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
515 unsigned long len); 503 unsigned long len);
504xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits,
505 unsigned long long len);
516 506
517/* return codes */ 507/* return codes */
518#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ 508#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
@@ -529,4 +519,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
529int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, 519int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
530 xfs_btree_visit_blocks_fn fn, void *data); 520 xfs_btree_visit_blocks_fn fn, void *data);
531 521
522int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
523
532#endif /* __XFS_BTREE_H__ */ 524#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index c221d0ecd52e..613c5cf19436 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -81,6 +81,10 @@
81 * - For each work item attached to the log intent item, 81 * - For each work item attached to the log intent item,
82 * * Perform the described action. 82 * * Perform the described action.
83 * * Attach the work item to the log done item. 83 * * Attach the work item to the log done item.
84 * * If the result of doing the work was -EAGAIN, ->finish work
85 * wants a new transaction. See the "Requesting a Fresh
86 * Transaction while Finishing Deferred Work" section below for
87 * details.
84 * 88 *
85 * The key here is that we must log an intent item for all pending 89 * The key here is that we must log an intent item for all pending
86 * work items every time we roll the transaction, and that we must log 90 * work items every time we roll the transaction, and that we must log
@@ -88,6 +92,34 @@
88 * we can perform complex remapping operations, chaining intent items 92 * we can perform complex remapping operations, chaining intent items
89 * as needed. 93 * as needed.
90 * 94 *
95 * Requesting a Fresh Transaction while Finishing Deferred Work
96 *
97 * If ->finish_item decides that it needs a fresh transaction to
98 * finish the work, it must ask its caller (xfs_defer_finish) for a
 99 * continuation. The most likely cause of this circumstance is the
100 * refcount adjust functions deciding that they've logged enough items
101 * to be at risk of exceeding the transaction reservation.
102 *
103 * To get a fresh transaction, we want to log the existing log done
104 * item to prevent the log intent item from replaying, immediately log
105 * a new log intent item with the unfinished work items, roll the
106 * transaction, and re-call ->finish_item wherever it left off. The
107 * log done item and the new log intent item must be in the same
108 * transaction or atomicity cannot be guaranteed; defer_finish ensures
109 * that this happens.
110 *
111 * This requires some coordination between ->finish_item and
112 * defer_finish. Upon deciding to request a new transaction,
113 * ->finish_item should update the current work item to reflect the
114 * unfinished work. Next, it should reset the log done item's list
115 * count to the number of items finished, and return -EAGAIN.
116 * defer_finish sees the -EAGAIN, logs the new log intent item
117 * with the remaining work items, and leaves the xfs_defer_pending
118 * item at the head of the dop_work queue. Then it rolls the
119 * transaction and picks up processing where it left off. It is
 120 * required that ->finish_item be careful to leave enough
121 * transaction reservation to fit the new log intent item.
122 *
91 * This is an example of remapping the extent (E, E+B) into file X at 123 * This is an example of remapping the extent (E, E+B) into file X at
92 * offset A and dealing with the extent (C, C+B) already being mapped 124 * offset A and dealing with the extent (C, C+B) already being mapped
93 * there: 125 * there:
@@ -104,21 +136,26 @@
104 * | Intent to add rmap (X, E, A, B) | 136 * | Intent to add rmap (X, E, A, B) |
105 * +-------------------------------------------------+ 137 * +-------------------------------------------------+
106 * | Reduce refcount for extent (C, B) | t2 138 * | Reduce refcount for extent (C, B) | t2
107 * | Done reducing refcount for extent (C, B) | 139 * | Done reducing refcount for extent (C, 9) |
140 * | Intent to reduce refcount for extent (C+9, B-9) |
141 * | (ran out of space after 9 refcount updates) |
142 * +-------------------------------------------------+
 143 * | Reduce refcount for extent (C+9, B-9) | t3
144 * | Done reducing refcount for extent (C+9, B-9) |
108 * | Increase refcount for extent (E, B) | 145 * | Increase refcount for extent (E, B) |
109 * | Done increasing refcount for extent (E, B) | 146 * | Done increasing refcount for extent (E, B) |
110 * | Intent to free extent (C, B) | 147 * | Intent to free extent (C, B) |
111 * | Intent to free extent (F, 1) (refcountbt block) | 148 * | Intent to free extent (F, 1) (refcountbt block) |
112 * | Intent to remove rmap (F, 1, REFC) | 149 * | Intent to remove rmap (F, 1, REFC) |
113 * +-------------------------------------------------+ 150 * +-------------------------------------------------+
114 * | Remove rmap (X, C, A, B) | t3 151 * | Remove rmap (X, C, A, B) | t4
115 * | Done removing rmap (X, C, A, B) | 152 * | Done removing rmap (X, C, A, B) |
116 * | Add rmap (X, E, A, B) | 153 * | Add rmap (X, E, A, B) |
117 * | Done adding rmap (X, E, A, B) | 154 * | Done adding rmap (X, E, A, B) |
118 * | Remove rmap (F, 1, REFC) | 155 * | Remove rmap (F, 1, REFC) |
119 * | Done removing rmap (F, 1, REFC) | 156 * | Done removing rmap (F, 1, REFC) |
120 * +-------------------------------------------------+ 157 * +-------------------------------------------------+
121 * | Free extent (C, B) | t4 158 * | Free extent (C, B) | t5
122 * | Done freeing extent (C, B) | 159 * | Done freeing extent (C, B) |
123 * | Free extent (D, 1) | 160 * | Free extent (D, 1) |
124 * | Done freeing extent (D, 1) | 161 * | Done freeing extent (D, 1) |
@@ -141,6 +178,9 @@
141 * - Intent to free extent (C, B) 178 * - Intent to free extent (C, B)
142 * - Intent to free extent (F, 1) (refcountbt block) 179 * - Intent to free extent (F, 1) (refcountbt block)
143 * - Intent to remove rmap (F, 1, REFC) 180 * - Intent to remove rmap (F, 1, REFC)
181 *
182 * Note that the continuation requested between t2 and t3 is likely to
183 * reoccur.
144 */ 184 */
145 185
146static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; 186static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
@@ -323,7 +363,16 @@ xfs_defer_finish(
323 dfp->dfp_count--; 363 dfp->dfp_count--;
324 error = dfp->dfp_type->finish_item(*tp, dop, li, 364 error = dfp->dfp_type->finish_item(*tp, dop, li,
325 dfp->dfp_done, &state); 365 dfp->dfp_done, &state);
326 if (error) { 366 if (error == -EAGAIN) {
367 /*
368 * Caller wants a fresh transaction;
369 * put the work item back on the list
370 * and jump out.
371 */
372 list_add(li, &dfp->dfp_work);
373 dfp->dfp_count++;
374 break;
375 } else if (error) {
327 /* 376 /*
328 * Clean up after ourselves and jump out. 377 * Clean up after ourselves and jump out.
329 * xfs_defer_cancel will take care of freeing 378 * xfs_defer_cancel will take care of freeing
@@ -335,9 +384,25 @@ xfs_defer_finish(
335 goto out; 384 goto out;
336 } 385 }
337 } 386 }
338 /* Done with the dfp, free it. */ 387 if (error == -EAGAIN) {
339 list_del(&dfp->dfp_list); 388 /*
340 kmem_free(dfp); 389 * Caller wants a fresh transaction, so log a
390 * new log intent item to replace the old one
391 * and roll the transaction. See "Requesting
392 * a Fresh Transaction while Finishing
393 * Deferred Work" above.
394 */
395 dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
396 dfp->dfp_count);
397 dfp->dfp_done = NULL;
398 list_for_each(li, &dfp->dfp_work)
399 dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
400 li);
401 } else {
402 /* Done with the dfp, free it. */
403 list_del(&dfp->dfp_list);
404 kmem_free(dfp);
405 }
341 406
342 if (cleanup_fn) 407 if (cleanup_fn)
343 cleanup_fn(*tp, state, error); 408 cleanup_fn(*tp, state, error);
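
The control flow added here is easier to see in miniature. Below is a toy model of the -EAGAIN continuation (hypothetical, not the kernel API): each item needs several units of work, a fake per-transaction budget stands in for log reservation, and exhausting the budget requests a roll and then resumes the same item:

#include <errno.h>
#include <stdio.h>

static int reservation;		/* fake per-transaction budget */

static int finish_item(int *units_left)
{
	while (*units_left > 0) {
		if (reservation == 0)
			return -EAGAIN;	/* item updated; request a roll */
		reservation--;
		(*units_left)--;
	}
	return 0;
}

int main(void)
{
	int work[] = { 3, 5, 2 };	/* units of work per item */
	int i = 0, rolls = 0;

	reservation = 4;		/* "transaction" budget */
	while (i < 3) {
		if (finish_item(&work[i]) == -EAGAIN) {
			/* log a fresh intent for items i..2, then roll */
			rolls++;
			reservation = 4;
			continue;	/* resume the same item */
		}
		i++;			/* item done; free its dfp */
	}
	printf("done after %d rolls\n", rolls);	/* prints 2 */
	return 0;
}
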
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 31ca2208c03d..eab68ae2e011 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -132,7 +132,7 @@ xfs_inobt_free_block(
132 xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); 132 xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
133 return xfs_free_extent(cur->bc_tp, 133 return xfs_free_extent(cur->bc_tp,
134 XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, 134 XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
135 &oinfo); 135 &oinfo, XFS_AG_RESV_NONE);
136} 136}
137 137
138STATIC int 138STATIC int
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index a6eed43fa7cd..fc5eef85d61e 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -647,9 +647,17 @@ struct xfs_rui_log_format {
647 __uint16_t rui_size; /* size of this item */ 647 __uint16_t rui_size; /* size of this item */
648 __uint32_t rui_nextents; /* # extents to free */ 648 __uint32_t rui_nextents; /* # extents to free */
649 __uint64_t rui_id; /* rui identifier */ 649 __uint64_t rui_id; /* rui identifier */
650 struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */ 650 struct xfs_map_extent rui_extents[]; /* array of extents to rmap */
651}; 651};
652 652
653static inline size_t
654xfs_rui_log_format_sizeof(
655 unsigned int nr)
656{
657 return sizeof(struct xfs_rui_log_format) +
658 nr * sizeof(struct xfs_map_extent);
659}
660
653/* 661/*
654 * This is the structure used to lay out an rud log item in the 662 * This is the structure used to lay out an rud log item in the
655 * log. The rud_extents array is a variable size array whose 663 * log. The rud_extents array is a variable size array whose
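
Switching rui_extents from [1] to a C99 flexible array member changes what sizeof() covers, which is exactly why the sizeof helper is introduced. A simplified illustration (struct fields trimmed down, names are stand-ins):

#include <stdint.h>
#include <stdlib.h>

struct map_extent {
	uint64_t	me_owner;
	uint64_t	me_startblock;
};

struct rui_format {
	uint16_t		rui_size;
	uint32_t		rui_nextents;
	uint64_t		rui_id;
	struct map_extent	rui_extents[];	/* flexible array */
};

/* Mirror of xfs_rui_log_format_sizeof(): header plus nr extents. */
static size_t rui_sizeof(unsigned int nr)
{
	return sizeof(struct rui_format) + nr * sizeof(struct map_extent);
}

int main(void)
{
	/*
	 * With the old rui_extents[1] declaration, sizeof() silently
	 * included one extent, so every size calculation had to
	 * subtract it back out; with [], sizeof() is the header alone.
	 */
	struct rui_format *rui = malloc(rui_sizeof(4));

	if (rui) {
		rui->rui_nextents = 4;
		free(rui);
	}
	return 0;
}
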
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 7575cfc3ad15..4a28fa91e3b1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -200,7 +200,7 @@ xfs_setfilesize_trans_alloc(
200 * Update on-disk file size now that data has been written to disk. 200 * Update on-disk file size now that data has been written to disk.
201 */ 201 */
202STATIC int 202STATIC int
203xfs_setfilesize( 203__xfs_setfilesize(
204 struct xfs_inode *ip, 204 struct xfs_inode *ip,
205 struct xfs_trans *tp, 205 struct xfs_trans *tp,
206 xfs_off_t offset, 206 xfs_off_t offset,
@@ -225,6 +225,23 @@ xfs_setfilesize(
225 return xfs_trans_commit(tp); 225 return xfs_trans_commit(tp);
226} 226}
227 227
228int
229xfs_setfilesize(
230 struct xfs_inode *ip,
231 xfs_off_t offset,
232 size_t size)
233{
234 struct xfs_mount *mp = ip->i_mount;
235 struct xfs_trans *tp;
236 int error;
237
238 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
239 if (error)
240 return error;
241
242 return __xfs_setfilesize(ip, tp, offset, size);
243}
244
228STATIC int 245STATIC int
229xfs_setfilesize_ioend( 246xfs_setfilesize_ioend(
230 struct xfs_ioend *ioend, 247 struct xfs_ioend *ioend,
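
The split above is a common two-layer refactor: a double-underscore helper that works within a caller-supplied transaction, plus a public entry point that allocates its own, so transaction-less callers (the new DAX write path later in this series) get a one-call interface. A skeletal sketch with hypothetical types:

#include <stdlib.h>

struct demo_trans { int id; };

static int __demo_setfilesize(struct demo_trans *tp, long long offset,
			      long long size)
{
	/* ... update the on-disk size, then commit (here: free) tp ... */
	free(tp);
	return 0;
}

static int demo_setfilesize(long long offset, long long size)
{
	struct demo_trans *tp = malloc(sizeof(*tp));

	if (!tp)
		return -1;	/* stand-in for a transaction alloc error */
	return __demo_setfilesize(tp, offset, size);
}

int main(void)
{
	return demo_setfilesize(4096, 512);
}
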
@@ -247,7 +264,7 @@ xfs_setfilesize_ioend(
247 return error; 264 return error;
248 } 265 }
249 266
250 return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); 267 return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
251} 268}
252 269
253/* 270/*
@@ -1336,13 +1353,12 @@ xfs_end_io_direct_write(
1336{ 1353{
1337 struct inode *inode = file_inode(iocb->ki_filp); 1354 struct inode *inode = file_inode(iocb->ki_filp);
1338 struct xfs_inode *ip = XFS_I(inode); 1355 struct xfs_inode *ip = XFS_I(inode);
1339 struct xfs_mount *mp = ip->i_mount;
1340 uintptr_t flags = (uintptr_t)private; 1356 uintptr_t flags = (uintptr_t)private;
1341 int error = 0; 1357 int error = 0;
1342 1358
1343 trace_xfs_end_io_direct_write(ip, offset, size); 1359 trace_xfs_end_io_direct_write(ip, offset, size);
1344 1360
1345 if (XFS_FORCED_SHUTDOWN(mp)) 1361 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1346 return -EIO; 1362 return -EIO;
1347 1363
1348 if (size <= 0) 1364 if (size <= 0)
@@ -1380,14 +1396,9 @@ xfs_end_io_direct_write(
1380 1396
1381 error = xfs_iomap_write_unwritten(ip, offset, size); 1397 error = xfs_iomap_write_unwritten(ip, offset, size);
1382 } else if (flags & XFS_DIO_FLAG_APPEND) { 1398 } else if (flags & XFS_DIO_FLAG_APPEND) {
1383 struct xfs_trans *tp;
1384
1385 trace_xfs_end_io_direct_write_append(ip, offset, size); 1399 trace_xfs_end_io_direct_write_append(ip, offset, size);
1386 1400
1387 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, 1401 error = xfs_setfilesize(ip, offset, size);
1388 &tp);
1389 if (!error)
1390 error = xfs_setfilesize(ip, tp, offset, size);
1391 } 1402 }
1392 1403
1393 return error; 1404 return error;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index bf2d9a141a73..1950e3bca2ac 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -62,6 +62,7 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
62 62
63int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, 63int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
64 ssize_t size, void *private); 64 ssize_t size, void *private);
65int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
65 66
66extern void xfs_count_page_state(struct page *, int *, int *); 67extern void xfs_count_page_state(struct page *, int *, int *);
67extern struct block_device *xfs_find_bdev_for_inode(struct inode *); 68extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 4ece4f2ffc72..e827d657c314 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -182,7 +182,7 @@ xfs_bmap_rtalloc(
182 XFS_TRANS_DQ_RTBCOUNT, (long) ralen); 182 XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
183 183
184 /* Zero the extent if we were asked to do so */ 184 /* Zero the extent if we were asked to do so */
185 if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) { 185 if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) {
186 error = xfs_zero_extent(ap->ip, ap->blkno, ap->length); 186 error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
187 if (error) 187 if (error)
188 return error; 188 return error;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index e455f9098d49..2975cb2319f4 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -865,7 +865,7 @@ xfs_buf_item_log_segment(
865 */ 865 */
866 if (bit) { 866 if (bit) {
867 end_bit = MIN(bit + bits_to_set, (uint)NBWORD); 867 end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
868 mask = ((1 << (end_bit - bit)) - 1) << bit; 868 mask = ((1U << (end_bit - bit)) - 1) << bit;
869 *wordp |= mask; 869 *wordp |= mask;
870 wordp++; 870 wordp++;
871 bits_set = end_bit - bit; 871 bits_set = end_bit - bit;
@@ -888,7 +888,7 @@ xfs_buf_item_log_segment(
888 */ 888 */
889 end_bit = bits_to_set - bits_set; 889 end_bit = bits_to_set - bits_set;
890 if (end_bit) { 890 if (end_bit) {
891 mask = (1 << end_bit) - 1; 891 mask = (1U << end_bit) - 1;
892 *wordp |= mask; 892 *wordp |= mask;
893 } 893 }
894} 894}
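
The 1 -> 1U change is a strict correctness fix: with a signed int literal, a shift count of 31 (possible here, since bit >= 1 caps end_bit - bit at 31) makes 1 << 31 overflow signed int, which is undefined behaviour in C, before the -1 is ever applied. A small demonstration of the well-defined unsigned form:

#include <assert.h>
#include <stdint.h>

/* Build a mask of 'count' set bits starting at 'bit', as in the hunk. */
static uint32_t bit_mask(unsigned int bit, unsigned int count)
{
	/* 1U keeps every count up to 31 well-defined. */
	return ((1U << count) - 1) << bit;
}

int main(void)
{
	assert(bit_mask(1, 31) == 0xfffffffeU);
	assert(bit_mask(4, 3) == 0x70);
	return 0;
}
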
@@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error(
1095 bp->b_last_error != bp->b_error) { 1095 bp->b_last_error != bp->b_error) {
1096 bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); 1096 bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
1097 bp->b_last_error = bp->b_error; 1097 bp->b_last_error = bp->b_error;
1098 if (cfg->retry_timeout && !bp->b_first_retry_time) 1098 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
1099 !bp->b_first_retry_time)
1099 bp->b_first_retry_time = jiffies; 1100 bp->b_first_retry_time = jiffies;
1100 1101
1101 xfs_buf_ioerror(bp, 0); 1102 xfs_buf_ioerror(bp, 0);
@@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error(
1111 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && 1112 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
1112 ++bp->b_retries > cfg->max_retries) 1113 ++bp->b_retries > cfg->max_retries)
1113 goto permanent_error; 1114 goto permanent_error;
1114 if (cfg->retry_timeout && 1115 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
1115 time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) 1116 time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
1116 goto permanent_error; 1117 goto permanent_error;
1117 1118
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index c263e079273e..162dc186cf04 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -384,7 +384,7 @@ restart:
384 * If this is a metadata allocation, try to reuse the busy 384 * If this is a metadata allocation, try to reuse the busy
385 * extent instead of trimming the allocation. 385 * extent instead of trimming the allocation.
386 */ 386 */
387 if (!args->userdata && 387 if (!xfs_alloc_is_userdata(args->datatype) &&
388 !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { 388 !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
389 if (!xfs_extent_busy_update_extent(args->mp, args->pag, 389 if (!xfs_extent_busy_update_extent(args->mp, args->pag,
390 busyp, fbno, flen, 390 busyp, fbno, flen,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e612a0233710..c68517b0f248 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -269,6 +269,8 @@ xfs_file_dio_aio_read(
269 return -EINVAL; 269 return -EINVAL;
270 } 270 }
271 271
272 file_accessed(iocb->ki_filp);
273
272 /* 274 /*
273 * Locking is a bit tricky here. If we take an exclusive lock for direct 275 * Locking is a bit tricky here. If we take an exclusive lock for direct
274 * IO, we effectively serialise all new concurrent read IO to this file 276 * IO, we effectively serialise all new concurrent read IO to this file
@@ -323,7 +325,6 @@ xfs_file_dio_aio_read(
323 } 325 }
324 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); 326 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
325 327
326 file_accessed(iocb->ki_filp);
327 return ret; 328 return ret;
328} 329}
329 330
@@ -332,10 +333,7 @@ xfs_file_dax_read(
332 struct kiocb *iocb, 333 struct kiocb *iocb,
333 struct iov_iter *to) 334 struct iov_iter *to)
334{ 335{
335 struct address_space *mapping = iocb->ki_filp->f_mapping; 336 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
336 struct inode *inode = mapping->host;
337 struct xfs_inode *ip = XFS_I(inode);
338 struct iov_iter data = *to;
339 size_t count = iov_iter_count(to); 337 size_t count = iov_iter_count(to);
340 ssize_t ret = 0; 338 ssize_t ret = 0;
341 339
@@ -345,11 +343,7 @@ xfs_file_dax_read(
345 return 0; /* skip atime */ 343 return 0; /* skip atime */
346 344
347 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); 345 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
348 ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); 346 ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
349 if (ret > 0) {
350 iocb->ki_pos += ret;
351 iov_iter_advance(to, ret);
352 }
353 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); 347 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
354 348
355 file_accessed(iocb->ki_filp); 349 file_accessed(iocb->ki_filp);
@@ -711,70 +705,32 @@ xfs_file_dax_write(
711 struct kiocb *iocb, 705 struct kiocb *iocb,
712 struct iov_iter *from) 706 struct iov_iter *from)
713{ 707{
714 struct address_space *mapping = iocb->ki_filp->f_mapping; 708 struct inode *inode = iocb->ki_filp->f_mapping->host;
715 struct inode *inode = mapping->host;
716 struct xfs_inode *ip = XFS_I(inode); 709 struct xfs_inode *ip = XFS_I(inode);
717 struct xfs_mount *mp = ip->i_mount; 710 int iolock = XFS_IOLOCK_EXCL;
718 ssize_t ret = 0; 711 ssize_t ret, error = 0;
719 int unaligned_io = 0; 712 size_t count;
720 int iolock; 713 loff_t pos;
721 struct iov_iter data;
722 714
723 /* "unaligned" here means not aligned to a filesystem block */
724 if ((iocb->ki_pos & mp->m_blockmask) ||
725 ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
726 unaligned_io = 1;
727 iolock = XFS_IOLOCK_EXCL;
728 } else if (mapping->nrpages) {
729 iolock = XFS_IOLOCK_EXCL;
730 } else {
731 iolock = XFS_IOLOCK_SHARED;
732 }
733 xfs_rw_ilock(ip, iolock); 715 xfs_rw_ilock(ip, iolock);
734
735 ret = xfs_file_aio_write_checks(iocb, from, &iolock); 716 ret = xfs_file_aio_write_checks(iocb, from, &iolock);
736 if (ret) 717 if (ret)
737 goto out; 718 goto out;
738 719
739 /* 720 pos = iocb->ki_pos;
740 * Yes, even DAX files can have page cache attached to them: A zeroed 721 count = iov_iter_count(from);
741 * page is inserted into the pagecache when we have to serve a write
742 * fault on a hole. It should never be dirtied and can simply be
743 * dropped from the pagecache once we get real data for the page.
744 *
745 * XXX: This is racy against mmap, and there's nothing we can do about
746 * it. dax_do_io() should really do this invalidation internally as
747 * it will know if we've allocated over a holei for this specific IO and
748 * if so it needs to update the mapping tree and invalidate existing
749 * PTEs over the newly allocated range. Remove this invalidation when
750 * dax_do_io() is fixed up.
751 */
752 if (mapping->nrpages) {
753 loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
754 722
755 ret = invalidate_inode_pages2_range(mapping, 723 trace_xfs_file_dax_write(ip, count, pos);
756 iocb->ki_pos >> PAGE_SHIFT,
757 end >> PAGE_SHIFT);
758 WARN_ON_ONCE(ret);
759 }
760 724
761 if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { 725 ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
762 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 726 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
763 iolock = XFS_IOLOCK_SHARED; 727 i_size_write(inode, iocb->ki_pos);
728 error = xfs_setfilesize(ip, pos, ret);
764 } 729 }
765 730
766 trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
767
768 data = *from;
769 ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
770 xfs_end_io_direct_write, 0);
771 if (ret > 0) {
772 iocb->ki_pos += ret;
773 iov_iter_advance(from, ret);
774 }
775out: 731out:
776 xfs_rw_iunlock(ip, iolock); 732 xfs_rw_iunlock(ip, iolock);
777 return ret; 733 return error ? error : ret;
778} 734}
779 735
780STATIC ssize_t 736STATIC ssize_t
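
In the rewritten DAX write path above, the post-I/O bookkeeping is reduced to extending the file size when the write went past EOF. A hedged sketch of that tail logic; update_ondisk_size() is a hypothetical stand-in for xfs_setfilesize(), not a real API:

        /* Returns bytes written on success, or the size-update error. */
        static long long dax_write_finish(long long ret, long long new_pos,
                                          long long old_pos, long long *i_size,
                                          int (*update_ondisk_size)(long long pos,
                                                                    long long len))
        {
                int error = 0;

                if (ret > 0 && new_pos > *i_size) {
                        *i_size = new_pos;                        /* in-core size */
                        error = update_ondisk_size(old_pos, ret); /* on-disk size */
                }
                return error ? error : ret;
        }
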
@@ -1513,7 +1469,7 @@ xfs_filemap_page_mkwrite(
1513 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1469 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1514 1470
1515 if (IS_DAX(inode)) { 1471 if (IS_DAX(inode)) {
1516 ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); 1472 ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
1517 } else { 1473 } else {
1518 ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); 1474 ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
1519 ret = block_page_mkwrite_return(ret); 1475 ret = block_page_mkwrite_return(ret);
@@ -1547,7 +1503,7 @@ xfs_filemap_fault(
1547 * changes to xfs_get_blocks_direct() to map unwritten extent 1503 * changes to xfs_get_blocks_direct() to map unwritten extent
1548 * ioend for conversion on read-only mappings. 1504 * ioend for conversion on read-only mappings.
1549 */ 1505 */
1550 ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); 1506 ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
1551 } else 1507 } else
1552 ret = filemap_fault(vma, vmf); 1508 ret = filemap_fault(vma, vmf);
1553 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1509 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 4a33a3304369..043ca3808ea2 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -30,6 +30,7 @@
30#include "xfs_mru_cache.h" 30#include "xfs_mru_cache.h"
31#include "xfs_filestream.h" 31#include "xfs_filestream.h"
32#include "xfs_trace.h" 32#include "xfs_trace.h"
33#include "xfs_ag_resv.h"
33 34
34struct xfs_fstrm_item { 35struct xfs_fstrm_item {
35 struct xfs_mru_cache_elem mru; 36 struct xfs_mru_cache_elem mru;
@@ -198,7 +199,8 @@ xfs_filestream_pick_ag(
198 } 199 }
199 200
200 longest = xfs_alloc_longest_free_extent(mp, pag, 201 longest = xfs_alloc_longest_free_extent(mp, pag,
201 xfs_alloc_min_freelist(mp, pag)); 202 xfs_alloc_min_freelist(mp, pag),
203 xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
202 if (((minlen && longest >= minlen) || 204 if (((minlen && longest >= minlen) ||
203 (!minlen && pag->pagf_freeblks >= minfree)) && 205 (!minlen && pag->pagf_freeblks >= minfree)) &&
204 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || 206 (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
@@ -369,7 +371,8 @@ xfs_filestream_new_ag(
369 struct xfs_mount *mp = ip->i_mount; 371 struct xfs_mount *mp = ip->i_mount;
370 xfs_extlen_t minlen = ap->length; 372 xfs_extlen_t minlen = ap->length;
371 xfs_agnumber_t startag = 0; 373 xfs_agnumber_t startag = 0;
372 int flags, err = 0; 374 int flags = 0;
375 int err = 0;
373 struct xfs_mru_cache_elem *mru; 376 struct xfs_mru_cache_elem *mru;
374 377
375 *agp = NULLAGNUMBER; 378 *agp = NULLAGNUMBER;
@@ -385,8 +388,10 @@ xfs_filestream_new_ag(
385 startag = (item->ag + 1) % mp->m_sb.sb_agcount; 388 startag = (item->ag + 1) % mp->m_sb.sb_agcount;
386 } 389 }
387 390
388 flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | 391 if (xfs_alloc_is_userdata(ap->datatype))
389 (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0); 392 flags |= XFS_PICK_USERDATA;
393 if (ap->dfops->dop_low)
394 flags |= XFS_PICK_LOWSPACE;
390 395
391 err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); 396 err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
392 397
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 0b7f986745c1..94ac06f3d908 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -553,7 +553,7 @@ xfs_growfs_data_private(
553 error = xfs_free_extent(tp, 553 error = xfs_free_extent(tp,
554 XFS_AGB_TO_FSB(mp, agno, 554 XFS_AGB_TO_FSB(mp, agno,
555 be32_to_cpu(agf->agf_length) - new), 555 be32_to_cpu(agf->agf_length) - new),
556 new, &oinfo); 556 new, &oinfo, XFS_AG_RESV_NONE);
557 if (error) 557 if (error)
558 goto error0; 558 goto error0;
559 } 559 }
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index fb39a66914dd..65b2e3f85f52 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1414,6 +1414,16 @@ xfs_inode_set_eofblocks_tag(
1414 struct xfs_perag *pag; 1414 struct xfs_perag *pag;
1415 int tagged; 1415 int tagged;
1416 1416
1417 /*
1418 * Don't bother locking the AG and looking up in the radix trees
1419 * if we already know that we have the tag set.
1420 */
1421 if (ip->i_flags & XFS_IEOFBLOCKS)
1422 return;
1423 spin_lock(&ip->i_flags_lock);
1424 ip->i_flags |= XFS_IEOFBLOCKS;
1425 spin_unlock(&ip->i_flags_lock);
1426
1417 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1427 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1418 spin_lock(&pag->pag_ici_lock); 1428 spin_lock(&pag->pag_ici_lock);
1419 trace_xfs_inode_set_eofblocks_tag(ip); 1429 trace_xfs_inode_set_eofblocks_tag(ip);
@@ -1449,6 +1459,10 @@ xfs_inode_clear_eofblocks_tag(
1449 struct xfs_mount *mp = ip->i_mount; 1459 struct xfs_mount *mp = ip->i_mount;
1450 struct xfs_perag *pag; 1460 struct xfs_perag *pag;
1451 1461
1462 spin_lock(&ip->i_flags_lock);
1463 ip->i_flags &= ~XFS_IEOFBLOCKS;
1464 spin_unlock(&ip->i_flags_lock);
1465
1452 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1466 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1453 spin_lock(&pag->pag_ici_lock); 1467 spin_lock(&pag->pag_ici_lock);
1454 trace_xfs_inode_clear_eofblocks_tag(ip); 1468 trace_xfs_inode_clear_eofblocks_tag(ip);
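
The xfs_icache.c hunks above add an inode flag so the common case can skip the per-AG lock and radix tree lookup entirely. A standalone sketch of that pattern, assuming pthread spinlocks in place of the kernel's; the unlocked test is deliberately racy, and the worst case is one redundant tagging pass:

        #include <pthread.h>

        #define IEOFBLOCKS      (1 << 10)

        struct sketch_inode {
                pthread_spinlock_t      flags_lock;
                unsigned int            flags;
        };

        static void set_eofblocks_tag(struct sketch_inode *ip)
        {
                if (ip->flags & IEOFBLOCKS)     /* already tagged: fast path out */
                        return;
                pthread_spin_lock(&ip->flags_lock);
                ip->flags |= IEOFBLOCKS;
                pthread_spin_unlock(&ip->flags_lock);
                /* ... slow path: take the AG lock, tag the radix tree ... */
        }

        static void clear_eofblocks_tag(struct sketch_inode *ip)
        {
                pthread_spin_lock(&ip->flags_lock);
                ip->flags &= ~IEOFBLOCKS;
                pthread_spin_unlock(&ip->flags_lock);
                /* ... clear the radix tree tag ... */
        }
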
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e1a411e08f00..8f30d2533b48 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -216,6 +216,7 @@ xfs_get_initial_prid(struct xfs_inode *dp)
216#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ 216#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
217#define XFS_IPINNED (1 << __XFS_IPINNED_BIT) 217#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
218#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ 218#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */
219#define XFS_IEOFBLOCKS (1 << 10) /* has the preallocblocks tag set */
219 220
220/* 221/*
221 * Per-lifetime flags need to be reset when re-using a reclaimable inode during 222 * Per-lifetime flags need to be reset when re-using a reclaimable inode during
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2af0dda1c978..c08253e11545 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * Copyright (c) 2016 Christoph Hellwig.
3 * All Rights Reserved. 4 * All Rights Reserved.
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -42,17 +43,40 @@
42 43
43#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 44#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
44 << mp->m_writeio_log) 45 << mp->m_writeio_log)
45#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
46 46
47STATIC int 47void
48xfs_iomap_eof_align_last_fsb( 48xfs_bmbt_to_iomap(
49 xfs_mount_t *mp, 49 struct xfs_inode *ip,
50 xfs_inode_t *ip, 50 struct iomap *iomap,
51 xfs_extlen_t extsize, 51 struct xfs_bmbt_irec *imap)
52 xfs_fileoff_t *last_fsb) 52{
53 struct xfs_mount *mp = ip->i_mount;
54
55 if (imap->br_startblock == HOLESTARTBLOCK) {
56 iomap->blkno = IOMAP_NULL_BLOCK;
57 iomap->type = IOMAP_HOLE;
58 } else if (imap->br_startblock == DELAYSTARTBLOCK) {
59 iomap->blkno = IOMAP_NULL_BLOCK;
60 iomap->type = IOMAP_DELALLOC;
61 } else {
62 iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
63 if (imap->br_state == XFS_EXT_UNWRITTEN)
64 iomap->type = IOMAP_UNWRITTEN;
65 else
66 iomap->type = IOMAP_MAPPED;
67 }
68 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
69 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
70 iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
71}
72
73static xfs_extlen_t
74xfs_eof_alignment(
75 struct xfs_inode *ip,
76 xfs_extlen_t extsize)
53{ 77{
54 xfs_extlen_t align = 0; 78 struct xfs_mount *mp = ip->i_mount;
55 int eof, error; 79 xfs_extlen_t align = 0;
56 80
57 if (!XFS_IS_REALTIME_INODE(ip)) { 81 if (!XFS_IS_REALTIME_INODE(ip)) {
58 /* 82 /*
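
The new xfs_bmbt_to_iomap() above classifies an extent record by its start-block sentinel and unwritten state. A compact model of that mapping; the sentinel values here are illustrative, not XFS's actual encodings:

        enum iomap_type { IOMAP_HOLE, IOMAP_DELALLOC, IOMAP_UNWRITTEN, IOMAP_MAPPED };

        #define HOLESTARTBLOCK  (-2LL)  /* illustrative sentinel */
        #define DELAYSTARTBLOCK (-1LL)  /* illustrative sentinel */

        static enum iomap_type classify_extent(long long startblock, int unwritten)
        {
                if (startblock == HOLESTARTBLOCK)
                        return IOMAP_HOLE;
                if (startblock == DELAYSTARTBLOCK)
                        return IOMAP_DELALLOC;  /* reserved, not yet allocated */
                return unwritten ? IOMAP_UNWRITTEN : IOMAP_MAPPED;
        }
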
@@ -83,8 +107,21 @@ xfs_iomap_eof_align_last_fsb(
83 align = extsize; 107 align = extsize;
84 } 108 }
85 109
110 return align;
111}
112
113STATIC int
114xfs_iomap_eof_align_last_fsb(
115 struct xfs_inode *ip,
116 xfs_extlen_t extsize,
117 xfs_fileoff_t *last_fsb)
118{
119 xfs_extlen_t align = xfs_eof_alignment(ip, extsize);
120
86 if (align) { 121 if (align) {
87 xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align); 122 xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align);
123 int eof, error;
124
88 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); 125 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
89 if (error) 126 if (error)
90 return error; 127 return error;
@@ -154,7 +191,7 @@ xfs_iomap_write_direct(
154 */ 191 */
155 ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags & 192 ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
156 XFS_IFEXTENTS); 193 XFS_IFEXTENTS);
157 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); 194 error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb);
158 if (error) 195 if (error)
159 goto out_unlock; 196 goto out_unlock;
160 } else { 197 } else {
@@ -274,130 +311,6 @@ out_trans_cancel:
274 goto out_unlock; 311 goto out_unlock;
275} 312}
276 313
277/*
278 * If the caller is doing a write at the end of the file, then extend the
279 * allocation out to the file system's write iosize. We clean up any extra
280 * space left over when the file is closed in xfs_inactive().
281 *
282 * If we find we already have delalloc preallocation beyond EOF, don't do more
283 * preallocation as it is not needed.
284 */
285STATIC int
286xfs_iomap_eof_want_preallocate(
287 xfs_mount_t *mp,
288 xfs_inode_t *ip,
289 xfs_off_t offset,
290 size_t count,
291 xfs_bmbt_irec_t *imap,
292 int nimaps,
293 int *prealloc)
294{
295 xfs_fileoff_t start_fsb;
296 xfs_filblks_t count_fsb;
297 int n, error, imaps;
298 int found_delalloc = 0;
299
300 *prealloc = 0;
301 if (offset + count <= XFS_ISIZE(ip))
302 return 0;
303
304 /*
305 * If the file is smaller than the minimum prealloc and we are using
306 * dynamic preallocation, don't do any preallocation at all as it is
307 * likely this is the only write to the file that is going to be done.
308 */
309 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
310 XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
311 return 0;
312
313 /*
314 * If there are any real blocks past eof, then don't
315 * do any speculative allocation.
316 */
317 start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
318 count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
319 while (count_fsb > 0) {
320 imaps = nimaps;
321 error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
322 0);
323 if (error)
324 return error;
325 for (n = 0; n < imaps; n++) {
326 if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
327 (imap[n].br_startblock != DELAYSTARTBLOCK))
328 return 0;
329 start_fsb += imap[n].br_blockcount;
330 count_fsb -= imap[n].br_blockcount;
331
332 if (imap[n].br_startblock == DELAYSTARTBLOCK)
333 found_delalloc = 1;
334 }
335 }
336 if (!found_delalloc)
337 *prealloc = 1;
338 return 0;
339}
340
341/*
342 * Determine the initial size of the preallocation. We are beyond the current
343 * EOF here, but we need to take into account whether this is a sparse write or
344 * an extending write when determining the preallocation size. Hence we need to
345 * look up the extent that ends at the current write offset and use the result
346 * to determine the preallocation size.
347 *
348 * If the extent is a hole, then preallocation is essentially disabled.
349 * Otherwise we take the size of the preceding data extent as the basis for the
350 * preallocation size. If the size of the extent is greater than half the
351 * maximum extent length, then use the current offset as the basis. This ensures
352 * that for large files the preallocation size always extends to MAXEXTLEN
353 * rather than falling short due to things like stripe unit/width alignment of
354 * real extents.
355 */
356STATIC xfs_fsblock_t
357xfs_iomap_eof_prealloc_initial_size(
358 struct xfs_mount *mp,
359 struct xfs_inode *ip,
360 xfs_off_t offset,
361 xfs_bmbt_irec_t *imap,
362 int nimaps)
363{
364 xfs_fileoff_t start_fsb;
365 int imaps = 1;
366 int error;
367
368 ASSERT(nimaps >= imaps);
369
370 /* if we are using a specific prealloc size, return now */
371 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
372 return 0;
373
374 /* If the file is small, then use the minimum prealloc */
375 if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
376 return 0;
377
378 /*
379 * As we write multiple pages, the offset will always align to the
380 * start of a page and hence point to a hole at EOF. i.e. if the size is
381 * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
382 * will return FSB 1. Hence if there are blocks in the file, we want to
383 * point to the block prior to the EOF block and not the hole that maps
384 * directly at @offset.
385 */
386 start_fsb = XFS_B_TO_FSB(mp, offset);
387 if (start_fsb)
388 start_fsb--;
389 error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
390 if (error)
391 return 0;
392
393 ASSERT(imaps == 1);
394 if (imap[0].br_startblock == HOLESTARTBLOCK)
395 return 0;
396 if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
397 return imap[0].br_blockcount << 1;
398 return XFS_B_TO_FSB(mp, offset);
399}
400
401STATIC bool 314STATIC bool
402xfs_quota_need_throttle( 315xfs_quota_need_throttle(
403 struct xfs_inode *ip, 316 struct xfs_inode *ip,
@@ -459,27 +372,76 @@ xfs_quota_calc_throttle(
459} 372}
460 373
461/* 374/*
375 * If we are doing a write at the end of the file and there are no allocations
376 * past this one, then extend the allocation out to the file system's write
377 * iosize.
378 *
462 * If we don't have a user specified preallocation size, dynamically increase 379 * If we don't have a user specified preallocation size, dynamically increase
463 * the preallocation size as the size of the file grows. Cap the maximum size 380 * the preallocation size as the size of the file grows. Cap the maximum size
464 * at a single extent or less if the filesystem is near full. The closer the 381 * at a single extent or less if the filesystem is near full. The closer the
465 * filesystem is to full, the smaller the maximum preallocation. 382 * filesystem is to full, the smaller the maximum preallocation.
383 *
384 * As an exception we don't do any preallocation at all if the file is smaller
385 * than the minimum preallocation and we are using the default dynamic
386 * preallocation scheme, as it is likely this is the only write to the file that
387 * is going to be done.
388 *
389 * We clean up any extra space left over when the file is closed in
390 * xfs_inactive().
466 */ 391 */
467STATIC xfs_fsblock_t 392STATIC xfs_fsblock_t
468xfs_iomap_prealloc_size( 393xfs_iomap_prealloc_size(
469 struct xfs_mount *mp,
470 struct xfs_inode *ip, 394 struct xfs_inode *ip,
471 xfs_off_t offset, 395 loff_t offset,
472 struct xfs_bmbt_irec *imap, 396 loff_t count,
473 int nimaps) 397 xfs_extnum_t idx,
398 struct xfs_bmbt_irec *prev)
474{ 399{
475 xfs_fsblock_t alloc_blocks = 0; 400 struct xfs_mount *mp = ip->i_mount;
401 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
476 int shift = 0; 402 int shift = 0;
477 int64_t freesp; 403 int64_t freesp;
478 xfs_fsblock_t qblocks; 404 xfs_fsblock_t qblocks;
479 int qshift = 0; 405 int qshift = 0;
406 xfs_fsblock_t alloc_blocks = 0;
407
408 if (offset + count <= XFS_ISIZE(ip))
409 return 0;
480 410
481 alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, 411 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
482 imap, nimaps); 412 (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)))
413 return 0;
414
415 /*
416 * If an explicit allocsize is set, the file is small, or we
417 * are writing behind a hole, then use the minimum prealloc:
418 */
419 if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
420 XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
421 idx == 0 ||
422 prev->br_startoff + prev->br_blockcount < offset_fsb)
423 return mp->m_writeio_blocks;
424
425 /*
426 * Determine the initial size of the preallocation. We are beyond the
427 * current EOF here, but we need to take into account whether this is
428 * a sparse write or an extending write when determining the
429 * preallocation size. Hence we need to look up the extent that ends
430 * at the current write offset and use the result to determine the
431 * preallocation size.
432 *
433 * If the extent is a hole, then preallocation is essentially disabled.
434 * Otherwise we take the size of the preceding data extent as the basis
435 * for the preallocation size. If the size of the extent is greater than
436 * half the maximum extent length, then use the current offset as the
437 * basis. This ensures that for large files the preallocation size
438 * always extends to MAXEXTLEN rather than falling short due to things
439 * like stripe unit/width alignment of real extents.
440 */
441 if (prev->br_blockcount <= (MAXEXTLEN >> 1))
442 alloc_blocks = prev->br_blockcount << 1;
443 else
444 alloc_blocks = XFS_B_TO_FSB(mp, offset);
483 if (!alloc_blocks) 445 if (!alloc_blocks)
484 goto check_writeio; 446 goto check_writeio;
485 qblocks = alloc_blocks; 447 qblocks = alloc_blocks;
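
The comment block above describes the sizing heuristic: double the preceding extent unless that would overshoot half the maximum extent length, in which case preallocate out to the write offset. As arithmetic, with lengths in filesystem blocks and MAXEXTLEN as in XFS:

        #define MAXEXTLEN       ((1U << 21) - 1)        /* 2^21 - 1 blocks, as in XFS */

        static unsigned long long initial_prealloc_blocks(
                        unsigned long long prev_extent_blocks,
                        unsigned long long offset_blocks)
        {
                if (prev_extent_blocks <= MAXEXTLEN >> 1)
                        return prev_extent_blocks << 1; /* double the last extent */
                return offset_blocks;   /* large file: extend out to the offset */
        }
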
@@ -550,120 +512,145 @@ xfs_iomap_prealloc_size(
550 */ 512 */
551 while (alloc_blocks && alloc_blocks >= freesp) 513 while (alloc_blocks && alloc_blocks >= freesp)
552 alloc_blocks >>= 4; 514 alloc_blocks >>= 4;
553
554check_writeio: 515check_writeio:
555 if (alloc_blocks < mp->m_writeio_blocks) 516 if (alloc_blocks < mp->m_writeio_blocks)
556 alloc_blocks = mp->m_writeio_blocks; 517 alloc_blocks = mp->m_writeio_blocks;
557
558 trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, 518 trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
559 mp->m_writeio_blocks); 519 mp->m_writeio_blocks);
560
561 return alloc_blocks; 520 return alloc_blocks;
562} 521}
563 522
564int 523static int
565xfs_iomap_write_delay( 524xfs_file_iomap_begin_delay(
566 xfs_inode_t *ip, 525 struct inode *inode,
567 xfs_off_t offset, 526 loff_t offset,
568 size_t count, 527 loff_t count,
569 xfs_bmbt_irec_t *ret_imap) 528 unsigned flags,
529 struct iomap *iomap)
570{ 530{
571 xfs_mount_t *mp = ip->i_mount; 531 struct xfs_inode *ip = XFS_I(inode);
572 xfs_fileoff_t offset_fsb; 532 struct xfs_mount *mp = ip->i_mount;
573 xfs_fileoff_t last_fsb; 533 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
574 xfs_off_t aligned_offset; 534 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
575 xfs_fileoff_t ioalign; 535 xfs_fileoff_t maxbytes_fsb =
576 xfs_extlen_t extsz; 536 XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
577 int nimaps; 537 xfs_fileoff_t end_fsb, orig_end_fsb;
578 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 538 int error = 0, eof = 0;
579 int prealloc; 539 struct xfs_bmbt_irec got;
580 int error; 540 struct xfs_bmbt_irec prev;
581 541 xfs_extnum_t idx;
582 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
583
584 /*
585 * Make sure that the dquots are there. This doesn't hold
586 * the ilock across a disk read.
587 */
588 error = xfs_qm_dqattach_locked(ip, 0);
589 if (error)
590 return error;
591 542
592 extsz = xfs_get_extsz_hint(ip); 543 ASSERT(!XFS_IS_REALTIME_INODE(ip));
593 offset_fsb = XFS_B_TO_FSBT(mp, offset); 544 ASSERT(!xfs_get_extsz_hint(ip));
594 545
595 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 546 xfs_ilock(ip, XFS_ILOCK_EXCL);
596 imap, XFS_WRITE_IMAPS, &prealloc);
597 if (error)
598 return error;
599 547
600retry: 548 if (unlikely(XFS_TEST_ERROR(
601 if (prealloc) { 549 (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
602 xfs_fsblock_t alloc_blocks; 550 XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
551 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
552 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
553 error = -EFSCORRUPTED;
554 goto out_unlock;
555 }
603 556
604 alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, 557 XFS_STATS_INC(mp, xs_blk_mapw);
605 XFS_WRITE_IMAPS);
606 558
607 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 559 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
608 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 560 error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
609 last_fsb = ioalign + alloc_blocks; 561 if (error)
610 } else { 562 goto out_unlock;
611 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
612 } 563 }
613 564
614 if (prealloc || extsz) { 565 xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx,
615 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); 566 &got, &prev);
616 if (error) 567 if (!eof && got.br_startoff <= offset_fsb) {
617 return error; 568 trace_xfs_iomap_found(ip, offset, count, 0, &got);
569 goto done;
618 } 570 }
619 571
572 error = xfs_qm_dqattach_locked(ip, 0);
573 if (error)
574 goto out_unlock;
575
620 /* 576 /*
621 * Make sure preallocation does not create extents beyond the range we 577 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
622 * actually support in this filesystem. 578 * to keep the chunks of work done here somewhat symmetric with the
579 * work writeback does. This is a completely arbitrary number pulled
580 * out of thin air as a best guess for initial testing.
581 *
582 * Note that the value needs to be less than 32 bits wide until
583 * the lower level functions are updated.
623 */ 584 */
624 if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) 585 count = min_t(loff_t, count, 1024 * PAGE_SIZE);
625 last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); 586 end_fsb = orig_end_fsb =
587 min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
588
589 if (eof) {
590 xfs_fsblock_t prealloc_blocks;
626 591
627 ASSERT(last_fsb > offset_fsb); 592 prealloc_blocks =
593 xfs_iomap_prealloc_size(ip, offset, count, idx, &prev);
594 if (prealloc_blocks) {
595 xfs_extlen_t align;
596 xfs_off_t end_offset;
628 597
629 nimaps = XFS_WRITE_IMAPS; 598 end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
630 error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, 599 end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
631 imap, &nimaps, XFS_BMAPI_ENTIRE); 600 prealloc_blocks;
601
602 align = xfs_eof_alignment(ip, 0);
603 if (align)
604 end_fsb = roundup_64(end_fsb, align);
605
606 end_fsb = min(end_fsb, maxbytes_fsb);
607 ASSERT(end_fsb > offset_fsb);
608 }
609 }
610
611retry:
612 error = xfs_bmapi_reserve_delalloc(ip, offset_fsb,
613 end_fsb - offset_fsb, &got,
614 &prev, &idx, eof);
632 switch (error) { 615 switch (error) {
633 case 0: 616 case 0:
617 break;
634 case -ENOSPC: 618 case -ENOSPC:
635 case -EDQUOT: 619 case -EDQUOT:
636 break; 620 /* retry without any preallocation */
637 default:
638 return error;
639 }
640
641 /*
642 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
643 * without EOF preallocation.
644 */
645 if (nimaps == 0) {
646 trace_xfs_delalloc_enospc(ip, offset, count); 621 trace_xfs_delalloc_enospc(ip, offset, count);
647 if (prealloc) { 622 if (end_fsb != orig_end_fsb) {
648 prealloc = 0; 623 end_fsb = orig_end_fsb;
649 error = 0;
650 goto retry; 624 goto retry;
651 } 625 }
652 return error ? error : -ENOSPC; 626 /*FALLTHRU*/
627 default:
628 goto out_unlock;
653 } 629 }
654 630
655 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
656 return xfs_alert_fsblock_zero(ip, &imap[0]);
657
658 /* 631 /*
659 * Tag the inode as speculatively preallocated so we can reclaim this 632 * Tag the inode as speculatively preallocated so we can reclaim this
660 * space on demand, if necessary. 633 * space on demand, if necessary.
661 */ 634 */
662 if (prealloc) 635 if (end_fsb != orig_end_fsb)
663 xfs_inode_set_eofblocks_tag(ip); 636 xfs_inode_set_eofblocks_tag(ip);
664 637
665 *ret_imap = imap[0]; 638 trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
666 return 0; 639done:
640 if (isnullstartblock(got.br_startblock))
641 got.br_startblock = DELAYSTARTBLOCK;
642
643 if (!got.br_startblock) {
644 error = xfs_alert_fsblock_zero(ip, &got);
645 if (error)
646 goto out_unlock;
647 }
648
649 xfs_bmbt_to_iomap(ip, iomap, &got);
650
651out_unlock:
652 xfs_iunlock(ip, XFS_ILOCK_EXCL);
653 return error;
667} 654}
668 655
669/* 656/*
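
One detail worth pulling out of the new delalloc path: on ENOSPC or EDQUOT the reservation is retried once with the speculative EOF preallocation stripped back to the requested range. A sketch of that fallback, with reserve() standing in for xfs_bmapi_reserve_delalloc():

        #include <errno.h>

        static int reserve_with_fallback(unsigned long long start_fsb,
                                         unsigned long long *end_fsb,
                                         unsigned long long orig_end_fsb,
                                         int (*reserve)(unsigned long long start,
                                                        unsigned long long len))
        {
                int error;

        retry:
                error = reserve(start_fsb, *end_fsb - start_fsb);
                if ((error == -ENOSPC || error == -EDQUOT) &&
                    *end_fsb != orig_end_fsb) {
                        *end_fsb = orig_end_fsb;        /* drop the preallocation */
                        goto retry;
                }
                return error;
        }
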
@@ -947,37 +934,13 @@ error_on_bmapi_transaction:
947 return error; 934 return error;
948} 935}
949 936
950void 937static inline bool imap_needs_alloc(struct inode *inode,
951xfs_bmbt_to_iomap( 938 struct xfs_bmbt_irec *imap, int nimaps)
952 struct xfs_inode *ip,
953 struct iomap *iomap,
954 struct xfs_bmbt_irec *imap)
955{
956 struct xfs_mount *mp = ip->i_mount;
957
958 if (imap->br_startblock == HOLESTARTBLOCK) {
959 iomap->blkno = IOMAP_NULL_BLOCK;
960 iomap->type = IOMAP_HOLE;
961 } else if (imap->br_startblock == DELAYSTARTBLOCK) {
962 iomap->blkno = IOMAP_NULL_BLOCK;
963 iomap->type = IOMAP_DELALLOC;
964 } else {
965 iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
966 if (imap->br_state == XFS_EXT_UNWRITTEN)
967 iomap->type = IOMAP_UNWRITTEN;
968 else
969 iomap->type = IOMAP_MAPPED;
970 }
971 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
972 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
973 iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
974}
975
976static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
977{ 939{
978 return !nimaps || 940 return !nimaps ||
979 imap->br_startblock == HOLESTARTBLOCK || 941 imap->br_startblock == HOLESTARTBLOCK ||
980 imap->br_startblock == DELAYSTARTBLOCK; 942 imap->br_startblock == DELAYSTARTBLOCK ||
943 (IS_DAX(inode) && ISUNWRITTEN(imap));
981} 944}
982 945
983static int 946static int
@@ -993,11 +956,18 @@ xfs_file_iomap_begin(
993 struct xfs_bmbt_irec imap; 956 struct xfs_bmbt_irec imap;
994 xfs_fileoff_t offset_fsb, end_fsb; 957 xfs_fileoff_t offset_fsb, end_fsb;
995 int nimaps = 1, error = 0; 958 int nimaps = 1, error = 0;
959 unsigned lockmode;
996 960
997 if (XFS_FORCED_SHUTDOWN(mp)) 961 if (XFS_FORCED_SHUTDOWN(mp))
998 return -EIO; 962 return -EIO;
999 963
1000 xfs_ilock(ip, XFS_ILOCK_EXCL); 964 if ((flags & IOMAP_WRITE) &&
965 !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
966 return xfs_file_iomap_begin_delay(inode, offset, length, flags,
967 iomap);
968 }
969
970 lockmode = xfs_ilock_data_map_shared(ip);
1001 971
1002 ASSERT(offset <= mp->m_super->s_maxbytes); 972 ASSERT(offset <= mp->m_super->s_maxbytes);
1003 if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) 973 if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
@@ -1008,11 +978,11 @@ xfs_file_iomap_begin(
1008 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, 978 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1009 &nimaps, XFS_BMAPI_ENTIRE); 979 &nimaps, XFS_BMAPI_ENTIRE);
1010 if (error) { 980 if (error) {
1011 xfs_iunlock(ip, XFS_ILOCK_EXCL); 981 xfs_iunlock(ip, lockmode);
1012 return error; 982 return error;
1013 } 983 }
1014 984
1015 if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { 985 if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
1016 /* 986 /*
1017 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES 987 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
1018 * pages to keep the chunks of work done here somewhat symmetric 988 * pages to keep the chunks of work done here somewhat symmetric
@@ -1024,27 +994,23 @@ xfs_file_iomap_begin(
1024 * the lower level functions are updated. 994 * the lower level functions are updated.
1025 */ 995 */
1026 length = min_t(loff_t, length, 1024 * PAGE_SIZE); 996 length = min_t(loff_t, length, 1024 * PAGE_SIZE);
1027 if (xfs_get_extsz_hint(ip)) { 997 /*
1028 /* 998 * xfs_iomap_write_direct() expects the shared lock. It
1029 * xfs_iomap_write_direct() expects the shared lock. It 999 * is unlocked on return.
1030 * is unlocked on return. 1000 */
1031 */ 1001 if (lockmode == XFS_ILOCK_EXCL)
1032 xfs_ilock_demote(ip, XFS_ILOCK_EXCL); 1002 xfs_ilock_demote(ip, lockmode);
1033 error = xfs_iomap_write_direct(ip, offset, length, &imap, 1003 error = xfs_iomap_write_direct(ip, offset, length, &imap,
1034 nimaps); 1004 nimaps);
1035 } else {
1036 error = xfs_iomap_write_delay(ip, offset, length, &imap);
1037 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1038 }
1039
1040 if (error) 1005 if (error)
1041 return error; 1006 return error;
1042 1007
1008 iomap->flags = IOMAP_F_NEW;
1043 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); 1009 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
1044 } else { 1010 } else {
1045 ASSERT(nimaps); 1011 ASSERT(nimaps);
1046 1012
1047 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1013 xfs_iunlock(ip, lockmode);
1048 trace_xfs_iomap_found(ip, offset, length, 0, &imap); 1014 trace_xfs_iomap_found(ip, offset, length, 0, &imap);
1049 } 1015 }
1050 1016
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index fb8aca3d69ab..6498be485932 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -25,8 +25,6 @@ struct xfs_bmbt_irec;
25 25
26int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, 26int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *, int); 27 struct xfs_bmbt_irec *, int);
28int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
30int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, 28int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
31 struct xfs_bmbt_irec *); 29 struct xfs_bmbt_irec *);
32int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); 30int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 765f084759b5..2b6eec52178e 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -413,7 +413,8 @@ struct xlog {
413 /* log record crc error injection factor */ 413 /* log record crc error injection factor */
414 uint32_t l_badcrc_factor; 414 uint32_t l_badcrc_factor;
415#endif 415#endif
416 416 /* log recovery lsn tracking (for buffer submission) */
417 xfs_lsn_t l_recovery_lsn;
417}; 418};
418 419
419#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ 420#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e8638fd2c0c3..846483d56949 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -44,6 +44,7 @@
44#include "xfs_error.h" 44#include "xfs_error.h"
45#include "xfs_dir2.h" 45#include "xfs_dir2.h"
46#include "xfs_rmap_item.h" 46#include "xfs_rmap_item.h"
47#include "xfs_buf_item.h"
47 48
48#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) 49#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
49 50
@@ -381,6 +382,15 @@ xlog_recover_iodone(
381 SHUTDOWN_META_IO_ERROR); 382 SHUTDOWN_META_IO_ERROR);
382 } 383 }
383 } 384 }
385
386 /*
387 * On v5 supers, a bli could be attached to update the metadata LSN.
388 * Clean it up.
389 */
390 if (bp->b_fspriv)
391 xfs_buf_item_relse(bp);
392 ASSERT(bp->b_fspriv == NULL);
393
384 bp->b_iodone = NULL; 394 bp->b_iodone = NULL;
385 xfs_buf_ioend(bp); 395 xfs_buf_ioend(bp);
386} 396}
@@ -2360,12 +2370,14 @@ static void
2360xlog_recover_validate_buf_type( 2370xlog_recover_validate_buf_type(
2361 struct xfs_mount *mp, 2371 struct xfs_mount *mp,
2362 struct xfs_buf *bp, 2372 struct xfs_buf *bp,
2363 xfs_buf_log_format_t *buf_f) 2373 xfs_buf_log_format_t *buf_f,
2374 xfs_lsn_t current_lsn)
2364{ 2375{
2365 struct xfs_da_blkinfo *info = bp->b_addr; 2376 struct xfs_da_blkinfo *info = bp->b_addr;
2366 __uint32_t magic32; 2377 __uint32_t magic32;
2367 __uint16_t magic16; 2378 __uint16_t magic16;
2368 __uint16_t magicda; 2379 __uint16_t magicda;
2380 char *warnmsg = NULL;
2369 2381
2370 /* 2382 /*
2371 * We can only do post recovery validation on items on CRC enabled 2383 * We can only do post recovery validation on items on CRC enabled
@@ -2404,31 +2416,27 @@ xlog_recover_validate_buf_type(
2404 bp->b_ops = &xfs_rmapbt_buf_ops; 2416 bp->b_ops = &xfs_rmapbt_buf_ops;
2405 break; 2417 break;
2406 default: 2418 default:
2407 xfs_warn(mp, "Bad btree block magic!"); 2419 warnmsg = "Bad btree block magic!";
2408 ASSERT(0);
2409 break; 2420 break;
2410 } 2421 }
2411 break; 2422 break;
2412 case XFS_BLFT_AGF_BUF: 2423 case XFS_BLFT_AGF_BUF:
2413 if (magic32 != XFS_AGF_MAGIC) { 2424 if (magic32 != XFS_AGF_MAGIC) {
2414 xfs_warn(mp, "Bad AGF block magic!"); 2425 warnmsg = "Bad AGF block magic!";
2415 ASSERT(0);
2416 break; 2426 break;
2417 } 2427 }
2418 bp->b_ops = &xfs_agf_buf_ops; 2428 bp->b_ops = &xfs_agf_buf_ops;
2419 break; 2429 break;
2420 case XFS_BLFT_AGFL_BUF: 2430 case XFS_BLFT_AGFL_BUF:
2421 if (magic32 != XFS_AGFL_MAGIC) { 2431 if (magic32 != XFS_AGFL_MAGIC) {
2422 xfs_warn(mp, "Bad AGFL block magic!"); 2432 warnmsg = "Bad AGFL block magic!";
2423 ASSERT(0);
2424 break; 2433 break;
2425 } 2434 }
2426 bp->b_ops = &xfs_agfl_buf_ops; 2435 bp->b_ops = &xfs_agfl_buf_ops;
2427 break; 2436 break;
2428 case XFS_BLFT_AGI_BUF: 2437 case XFS_BLFT_AGI_BUF:
2429 if (magic32 != XFS_AGI_MAGIC) { 2438 if (magic32 != XFS_AGI_MAGIC) {
2430 xfs_warn(mp, "Bad AGI block magic!"); 2439 warnmsg = "Bad AGI block magic!";
2431 ASSERT(0);
2432 break; 2440 break;
2433 } 2441 }
2434 bp->b_ops = &xfs_agi_buf_ops; 2442 bp->b_ops = &xfs_agi_buf_ops;
@@ -2438,8 +2446,7 @@ xlog_recover_validate_buf_type(
2438 case XFS_BLFT_GDQUOT_BUF: 2446 case XFS_BLFT_GDQUOT_BUF:
2439#ifdef CONFIG_XFS_QUOTA 2447#ifdef CONFIG_XFS_QUOTA
2440 if (magic16 != XFS_DQUOT_MAGIC) { 2448 if (magic16 != XFS_DQUOT_MAGIC) {
2441 xfs_warn(mp, "Bad DQUOT block magic!"); 2449 warnmsg = "Bad DQUOT block magic!";
2442 ASSERT(0);
2443 break; 2450 break;
2444 } 2451 }
2445 bp->b_ops = &xfs_dquot_buf_ops; 2452 bp->b_ops = &xfs_dquot_buf_ops;
@@ -2451,16 +2458,14 @@ xlog_recover_validate_buf_type(
2451 break; 2458 break;
2452 case XFS_BLFT_DINO_BUF: 2459 case XFS_BLFT_DINO_BUF:
2453 if (magic16 != XFS_DINODE_MAGIC) { 2460 if (magic16 != XFS_DINODE_MAGIC) {
2454 xfs_warn(mp, "Bad INODE block magic!"); 2461 warnmsg = "Bad INODE block magic!";
2455 ASSERT(0);
2456 break; 2462 break;
2457 } 2463 }
2458 bp->b_ops = &xfs_inode_buf_ops; 2464 bp->b_ops = &xfs_inode_buf_ops;
2459 break; 2465 break;
2460 case XFS_BLFT_SYMLINK_BUF: 2466 case XFS_BLFT_SYMLINK_BUF:
2461 if (magic32 != XFS_SYMLINK_MAGIC) { 2467 if (magic32 != XFS_SYMLINK_MAGIC) {
2462 xfs_warn(mp, "Bad symlink block magic!"); 2468 warnmsg = "Bad symlink block magic!";
2463 ASSERT(0);
2464 break; 2469 break;
2465 } 2470 }
2466 bp->b_ops = &xfs_symlink_buf_ops; 2471 bp->b_ops = &xfs_symlink_buf_ops;
@@ -2468,8 +2473,7 @@ xlog_recover_validate_buf_type(
2468 case XFS_BLFT_DIR_BLOCK_BUF: 2473 case XFS_BLFT_DIR_BLOCK_BUF:
2469 if (magic32 != XFS_DIR2_BLOCK_MAGIC && 2474 if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
2470 magic32 != XFS_DIR3_BLOCK_MAGIC) { 2475 magic32 != XFS_DIR3_BLOCK_MAGIC) {
2471 xfs_warn(mp, "Bad dir block magic!"); 2476 warnmsg = "Bad dir block magic!";
2472 ASSERT(0);
2473 break; 2477 break;
2474 } 2478 }
2475 bp->b_ops = &xfs_dir3_block_buf_ops; 2479 bp->b_ops = &xfs_dir3_block_buf_ops;
@@ -2477,8 +2481,7 @@ xlog_recover_validate_buf_type(
2477 case XFS_BLFT_DIR_DATA_BUF: 2481 case XFS_BLFT_DIR_DATA_BUF:
2478 if (magic32 != XFS_DIR2_DATA_MAGIC && 2482 if (magic32 != XFS_DIR2_DATA_MAGIC &&
2479 magic32 != XFS_DIR3_DATA_MAGIC) { 2483 magic32 != XFS_DIR3_DATA_MAGIC) {
2480 xfs_warn(mp, "Bad dir data magic!"); 2484 warnmsg = "Bad dir data magic!";
2481 ASSERT(0);
2482 break; 2485 break;
2483 } 2486 }
2484 bp->b_ops = &xfs_dir3_data_buf_ops; 2487 bp->b_ops = &xfs_dir3_data_buf_ops;
@@ -2486,8 +2489,7 @@ xlog_recover_validate_buf_type(
2486 case XFS_BLFT_DIR_FREE_BUF: 2489 case XFS_BLFT_DIR_FREE_BUF:
2487 if (magic32 != XFS_DIR2_FREE_MAGIC && 2490 if (magic32 != XFS_DIR2_FREE_MAGIC &&
2488 magic32 != XFS_DIR3_FREE_MAGIC) { 2491 magic32 != XFS_DIR3_FREE_MAGIC) {
2489 xfs_warn(mp, "Bad dir3 free magic!"); 2492 warnmsg = "Bad dir3 free magic!";
2490 ASSERT(0);
2491 break; 2493 break;
2492 } 2494 }
2493 bp->b_ops = &xfs_dir3_free_buf_ops; 2495 bp->b_ops = &xfs_dir3_free_buf_ops;
@@ -2495,8 +2497,7 @@ xlog_recover_validate_buf_type(
2495 case XFS_BLFT_DIR_LEAF1_BUF: 2497 case XFS_BLFT_DIR_LEAF1_BUF:
2496 if (magicda != XFS_DIR2_LEAF1_MAGIC && 2498 if (magicda != XFS_DIR2_LEAF1_MAGIC &&
2497 magicda != XFS_DIR3_LEAF1_MAGIC) { 2499 magicda != XFS_DIR3_LEAF1_MAGIC) {
2498 xfs_warn(mp, "Bad dir leaf1 magic!"); 2500 warnmsg = "Bad dir leaf1 magic!";
2499 ASSERT(0);
2500 break; 2501 break;
2501 } 2502 }
2502 bp->b_ops = &xfs_dir3_leaf1_buf_ops; 2503 bp->b_ops = &xfs_dir3_leaf1_buf_ops;
@@ -2504,8 +2505,7 @@ xlog_recover_validate_buf_type(
2504 case XFS_BLFT_DIR_LEAFN_BUF: 2505 case XFS_BLFT_DIR_LEAFN_BUF:
2505 if (magicda != XFS_DIR2_LEAFN_MAGIC && 2506 if (magicda != XFS_DIR2_LEAFN_MAGIC &&
2506 magicda != XFS_DIR3_LEAFN_MAGIC) { 2507 magicda != XFS_DIR3_LEAFN_MAGIC) {
2507 xfs_warn(mp, "Bad dir leafn magic!"); 2508 warnmsg = "Bad dir leafn magic!";
2508 ASSERT(0);
2509 break; 2509 break;
2510 } 2510 }
2511 bp->b_ops = &xfs_dir3_leafn_buf_ops; 2511 bp->b_ops = &xfs_dir3_leafn_buf_ops;
@@ -2513,8 +2513,7 @@ xlog_recover_validate_buf_type(
2513 case XFS_BLFT_DA_NODE_BUF: 2513 case XFS_BLFT_DA_NODE_BUF:
2514 if (magicda != XFS_DA_NODE_MAGIC && 2514 if (magicda != XFS_DA_NODE_MAGIC &&
2515 magicda != XFS_DA3_NODE_MAGIC) { 2515 magicda != XFS_DA3_NODE_MAGIC) {
2516 xfs_warn(mp, "Bad da node magic!"); 2516 warnmsg = "Bad da node magic!";
2517 ASSERT(0);
2518 break; 2517 break;
2519 } 2518 }
2520 bp->b_ops = &xfs_da3_node_buf_ops; 2519 bp->b_ops = &xfs_da3_node_buf_ops;
@@ -2522,24 +2521,21 @@ xlog_recover_validate_buf_type(
2522 case XFS_BLFT_ATTR_LEAF_BUF: 2521 case XFS_BLFT_ATTR_LEAF_BUF:
2523 if (magicda != XFS_ATTR_LEAF_MAGIC && 2522 if (magicda != XFS_ATTR_LEAF_MAGIC &&
2524 magicda != XFS_ATTR3_LEAF_MAGIC) { 2523 magicda != XFS_ATTR3_LEAF_MAGIC) {
2525 xfs_warn(mp, "Bad attr leaf magic!"); 2524 warnmsg = "Bad attr leaf magic!";
2526 ASSERT(0);
2527 break; 2525 break;
2528 } 2526 }
2529 bp->b_ops = &xfs_attr3_leaf_buf_ops; 2527 bp->b_ops = &xfs_attr3_leaf_buf_ops;
2530 break; 2528 break;
2531 case XFS_BLFT_ATTR_RMT_BUF: 2529 case XFS_BLFT_ATTR_RMT_BUF:
2532 if (magic32 != XFS_ATTR3_RMT_MAGIC) { 2530 if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2533 xfs_warn(mp, "Bad attr remote magic!"); 2531 warnmsg = "Bad attr remote magic!";
2534 ASSERT(0);
2535 break; 2532 break;
2536 } 2533 }
2537 bp->b_ops = &xfs_attr3_rmt_buf_ops; 2534 bp->b_ops = &xfs_attr3_rmt_buf_ops;
2538 break; 2535 break;
2539 case XFS_BLFT_SB_BUF: 2536 case XFS_BLFT_SB_BUF:
2540 if (magic32 != XFS_SB_MAGIC) { 2537 if (magic32 != XFS_SB_MAGIC) {
2541 xfs_warn(mp, "Bad SB block magic!"); 2538 warnmsg = "Bad SB block magic!";
2542 ASSERT(0);
2543 break; 2539 break;
2544 } 2540 }
2545 bp->b_ops = &xfs_sb_buf_ops; 2541 bp->b_ops = &xfs_sb_buf_ops;
@@ -2556,6 +2552,40 @@ xlog_recover_validate_buf_type(
2556 xfs_blft_from_flags(buf_f)); 2552 xfs_blft_from_flags(buf_f));
2557 break; 2553 break;
2558 } 2554 }
2555
2556 /*
2557 * Nothing else to do in the case of a NULL current LSN as this means
2558 * the buffer is more recent than the change in the log and will be
2559 * skipped.
2560 */
2561 if (current_lsn == NULLCOMMITLSN)
2562 return;
2563
2564 if (warnmsg) {
2565 xfs_warn(mp, warnmsg);
2566 ASSERT(0);
2567 }
2568
2569 /*
2570 * We must update the metadata LSN of the buffer as it is written out to
2571 * ensure that older transactions never replay over this one and corrupt
2572 * the buffer. This can occur if log recovery is interrupted at some
2573 * point after the current transaction completes, at which point a
2574 * subsequent mount starts recovery from the beginning.
2575 *
2576 * Write verifiers update the metadata LSN from log items attached to
2577 * the buffer. Therefore, initialize a bli purely to carry the LSN to
2578 * the verifier. We'll clean it up in our ->iodone() callback.
2579 */
2580 if (bp->b_ops) {
2581 struct xfs_buf_log_item *bip;
2582
2583 ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
2584 bp->b_iodone = xlog_recover_iodone;
2585 xfs_buf_item_init(bp, mp);
2586 bip = bp->b_fspriv;
2587 bip->bli_item.li_lsn = current_lsn;
2588 }
2559} 2589}
2560 2590
2561/* 2591/*
@@ -2569,7 +2599,8 @@ xlog_recover_do_reg_buffer(
2569 struct xfs_mount *mp, 2599 struct xfs_mount *mp,
2570 xlog_recover_item_t *item, 2600 xlog_recover_item_t *item,
2571 struct xfs_buf *bp, 2601 struct xfs_buf *bp,
2572 xfs_buf_log_format_t *buf_f) 2602 xfs_buf_log_format_t *buf_f,
2603 xfs_lsn_t current_lsn)
2573{ 2604{
2574 int i; 2605 int i;
2575 int bit; 2606 int bit;
@@ -2642,7 +2673,7 @@ xlog_recover_do_reg_buffer(
2642 /* Shouldn't be any more regions */ 2673 /* Shouldn't be any more regions */
2643 ASSERT(i == item->ri_total); 2674 ASSERT(i == item->ri_total);
2644 2675
2645 xlog_recover_validate_buf_type(mp, bp, buf_f); 2676 xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
2646} 2677}
2647 2678
2648/* 2679/*
@@ -2685,7 +2716,7 @@ xlog_recover_do_dquot_buffer(
2685 if (log->l_quotaoffs_flag & type) 2716 if (log->l_quotaoffs_flag & type)
2686 return false; 2717 return false;
2687 2718
2688 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2719 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
2689 return true; 2720 return true;
2690} 2721}
2691 2722
@@ -2773,7 +2804,8 @@ xlog_recover_buffer_pass2(
2773 */ 2804 */
2774 lsn = xlog_recover_get_buf_lsn(mp, bp); 2805 lsn = xlog_recover_get_buf_lsn(mp, bp);
2775 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 2806 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2776 xlog_recover_validate_buf_type(mp, bp, buf_f); 2807 trace_xfs_log_recover_buf_skip(log, buf_f);
2808 xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
2777 goto out_release; 2809 goto out_release;
2778 } 2810 }
2779 2811
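
The pass2 hunk above is the consumer of the metadata LSN stamped at submission time: a logged change is only replayed if the on-disk buffer has not already seen it. Reduced to its comparison, with types simplified:

        typedef long long lsn_t;

        #define NULLCOMMITLSN   ((lsn_t)-1)

        /* Nonzero if the logged change at current_lsn must be replayed. */
        static int buffer_needs_replay(lsn_t buf_lsn, lsn_t current_lsn)
        {
                if (buf_lsn == 0 || buf_lsn == NULLCOMMITLSN)
                        return 1;       /* buffer carries no LSN: always replay */
                return buf_lsn < current_lsn;   /* replay only older buffers */
        }
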
@@ -2789,7 +2821,7 @@ xlog_recover_buffer_pass2(
2789 if (!dirty) 2821 if (!dirty)
2790 goto out_release; 2822 goto out_release;
2791 } else { 2823 } else {
2792 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2824 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
2793 } 2825 }
2794 2826
2795 /* 2827 /*
@@ -3846,14 +3878,13 @@ STATIC int
3846xlog_recover_commit_trans( 3878xlog_recover_commit_trans(
3847 struct xlog *log, 3879 struct xlog *log,
3848 struct xlog_recover *trans, 3880 struct xlog_recover *trans,
3849 int pass) 3881 int pass,
3882 struct list_head *buffer_list)
3850{ 3883{
3851 int error = 0; 3884 int error = 0;
3852 int error2;
3853 int items_queued = 0; 3885 int items_queued = 0;
3854 struct xlog_recover_item *item; 3886 struct xlog_recover_item *item;
3855 struct xlog_recover_item *next; 3887 struct xlog_recover_item *next;
3856 LIST_HEAD (buffer_list);
3857 LIST_HEAD (ra_list); 3888 LIST_HEAD (ra_list);
3858 LIST_HEAD (done_list); 3889 LIST_HEAD (done_list);
3859 3890
@@ -3876,7 +3907,7 @@ xlog_recover_commit_trans(
3876 items_queued++; 3907 items_queued++;
3877 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { 3908 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
3878 error = xlog_recover_items_pass2(log, trans, 3909 error = xlog_recover_items_pass2(log, trans,
3879 &buffer_list, &ra_list); 3910 buffer_list, &ra_list);
3880 list_splice_tail_init(&ra_list, &done_list); 3911 list_splice_tail_init(&ra_list, &done_list);
3881 items_queued = 0; 3912 items_queued = 0;
3882 } 3913 }
@@ -3894,15 +3925,14 @@ out:
3894 if (!list_empty(&ra_list)) { 3925 if (!list_empty(&ra_list)) {
3895 if (!error) 3926 if (!error)
3896 error = xlog_recover_items_pass2(log, trans, 3927 error = xlog_recover_items_pass2(log, trans,
3897 &buffer_list, &ra_list); 3928 buffer_list, &ra_list);
3898 list_splice_tail_init(&ra_list, &done_list); 3929 list_splice_tail_init(&ra_list, &done_list);
3899 } 3930 }
3900 3931
3901 if (!list_empty(&done_list)) 3932 if (!list_empty(&done_list))
3902 list_splice_init(&done_list, &trans->r_itemq); 3933 list_splice_init(&done_list, &trans->r_itemq);
3903 3934
3904 error2 = xfs_buf_delwri_submit(&buffer_list); 3935 return error;
3905 return error ? error : error2;
3906} 3936}
3907 3937
3908STATIC void 3938STATIC void
@@ -4085,7 +4115,8 @@ xlog_recovery_process_trans(
4085 char *dp, 4115 char *dp,
4086 unsigned int len, 4116 unsigned int len,
4087 unsigned int flags, 4117 unsigned int flags,
4088 int pass) 4118 int pass,
4119 struct list_head *buffer_list)
4089{ 4120{
4090 int error = 0; 4121 int error = 0;
4091 bool freeit = false; 4122 bool freeit = false;
@@ -4109,7 +4140,8 @@ xlog_recovery_process_trans(
4109 error = xlog_recover_add_to_cont_trans(log, trans, dp, len); 4140 error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
4110 break; 4141 break;
4111 case XLOG_COMMIT_TRANS: 4142 case XLOG_COMMIT_TRANS:
4112 error = xlog_recover_commit_trans(log, trans, pass); 4143 error = xlog_recover_commit_trans(log, trans, pass,
4144 buffer_list);
4113 /* success or fail, we are now done with this transaction. */ 4145 /* success or fail, we are now done with this transaction. */
4114 freeit = true; 4146 freeit = true;
4115 break; 4147 break;
@@ -4191,10 +4223,12 @@ xlog_recover_process_ophdr(
4191 struct xlog_op_header *ohead, 4223 struct xlog_op_header *ohead,
4192 char *dp, 4224 char *dp,
4193 char *end, 4225 char *end,
4194 int pass) 4226 int pass,
4227 struct list_head *buffer_list)
4195{ 4228{
4196 struct xlog_recover *trans; 4229 struct xlog_recover *trans;
4197 unsigned int len; 4230 unsigned int len;
4231 int error;
4198 4232
4199 /* Do we understand who wrote this op? */ 4233 /* Do we understand who wrote this op? */
4200 if (ohead->oh_clientid != XFS_TRANSACTION && 4234 if (ohead->oh_clientid != XFS_TRANSACTION &&
@@ -4221,8 +4255,39 @@ xlog_recover_process_ophdr(
4221 return 0; 4255 return 0;
4222 } 4256 }
4223 4257
4258 /*
4259 * The recovered buffer queue is drained only once we know that all
4260 * recovery items for the current LSN have been processed. This is
4261 * required because:
4262 *
4263 * - Buffer write submission updates the metadata LSN of the buffer.
4264 * - Log recovery skips items with a metadata LSN >= the current LSN of
4265 * the recovery item.
4266 * - Separate recovery items against the same metadata buffer can share
4267 * a current LSN. I.e., consider that the LSN of a recovery item is
4268 * defined as the starting LSN of the first record in which its
4269 * transaction appears, that a record can hold multiple transactions,
4270 * and/or that a transaction can span multiple records.
4271 *
4272 * In other words, we are allowed to submit a buffer from log recovery
4273 * once per current LSN. Otherwise, we may incorrectly skip recovery
4274 * items and cause corruption.
4275 *
4276 * We don't know up front whether buffers are updated multiple times per
4277 * LSN. Therefore, track the current LSN of each commit log record as it
4278 * is processed and drain the queue when it changes. Use commit records
4279 * because they are ordered correctly by the logging code.
4280 */
4281 if (log->l_recovery_lsn != trans->r_lsn &&
4282 ohead->oh_flags & XLOG_COMMIT_TRANS) {
4283 error = xfs_buf_delwri_submit(buffer_list);
4284 if (error)
4285 return error;
4286 log->l_recovery_lsn = trans->r_lsn;
4287 }
4288
4224 return xlog_recovery_process_trans(log, trans, dp, len, 4289 return xlog_recovery_process_trans(log, trans, dp, len,
4225 ohead->oh_flags, pass); 4290 ohead->oh_flags, pass, buffer_list);
4226} 4291}
4227 4292
4228/* 4293/*
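
The long comment above boils down to a simple rule: flush the queued buffers whenever a commit record advances the current LSN, so no buffer is submitted more than once per LSN. A sketch of that policy, with submit_queue() standing in for xfs_buf_delwri_submit():

        static int drain_on_lsn_change(long long *recovery_lsn, long long trans_lsn,
                                       int is_commit_record,
                                       int (*submit_queue)(void))
        {
                int error;

                if (!is_commit_record || *recovery_lsn == trans_lsn)
                        return 0;       /* same LSN: keep queueing */

                error = submit_queue();  /* write out everything queued so far */
                if (error)
                        return error;
                *recovery_lsn = trans_lsn;      /* track the new current LSN */
                return 0;
        }
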
@@ -4240,7 +4305,8 @@ xlog_recover_process_data(
4240 struct hlist_head rhash[], 4305 struct hlist_head rhash[],
4241 struct xlog_rec_header *rhead, 4306 struct xlog_rec_header *rhead,
4242 char *dp, 4307 char *dp,
4243 int pass) 4308 int pass,
4309 struct list_head *buffer_list)
4244{ 4310{
4245 struct xlog_op_header *ohead; 4311 struct xlog_op_header *ohead;
4246 char *end; 4312 char *end;
@@ -4254,6 +4320,7 @@ xlog_recover_process_data(
4254 if (xlog_header_check_recover(log->l_mp, rhead)) 4320 if (xlog_header_check_recover(log->l_mp, rhead))
4255 return -EIO; 4321 return -EIO;
4256 4322
4323 trace_xfs_log_recover_record(log, rhead, pass);
4257 while ((dp < end) && num_logops) { 4324 while ((dp < end) && num_logops) {
4258 4325
4259 ohead = (struct xlog_op_header *)dp; 4326 ohead = (struct xlog_op_header *)dp;
@@ -4262,7 +4329,7 @@ xlog_recover_process_data(
4262 4329
4263 /* errors will abort recovery */ 4330 /* errors will abort recovery */
4264 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, 4331 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
4265 dp, end, pass); 4332 dp, end, pass, buffer_list);
4266 if (error) 4333 if (error)
4267 return error; 4334 return error;
4268 4335
@@ -4685,7 +4752,8 @@ xlog_recover_process(
4685 struct hlist_head rhash[], 4752 struct hlist_head rhash[],
4686 struct xlog_rec_header *rhead, 4753 struct xlog_rec_header *rhead,
4687 char *dp, 4754 char *dp,
4688 int pass) 4755 int pass,
4756 struct list_head *buffer_list)
4689{ 4757{
4690 int error; 4758 int error;
4691 __le32 crc; 4759 __le32 crc;
@@ -4732,7 +4800,8 @@ xlog_recover_process(
4732 if (error) 4800 if (error)
4733 return error; 4801 return error;
4734 4802
4735 return xlog_recover_process_data(log, rhash, rhead, dp, pass); 4803 return xlog_recover_process_data(log, rhash, rhead, dp, pass,
4804 buffer_list);
4736} 4805}
4737 4806
4738STATIC int 4807STATIC int
@@ -4793,9 +4862,11 @@ xlog_do_recovery_pass(
4793 char *offset; 4862 char *offset;
4794 xfs_buf_t *hbp, *dbp; 4863 xfs_buf_t *hbp, *dbp;
4795 int error = 0, h_size, h_len; 4864 int error = 0, h_size, h_len;
4865 int error2 = 0;
4796 int bblks, split_bblks; 4866 int bblks, split_bblks;
4797 int hblks, split_hblks, wrapped_hblks; 4867 int hblks, split_hblks, wrapped_hblks;
4798 struct hlist_head rhash[XLOG_RHASH_SIZE]; 4868 struct hlist_head rhash[XLOG_RHASH_SIZE];
4869 LIST_HEAD (buffer_list);
4799 4870
4800 ASSERT(head_blk != tail_blk); 4871 ASSERT(head_blk != tail_blk);
4801 rhead_blk = 0; 4872 rhead_blk = 0;
@@ -4981,7 +5052,7 @@ xlog_do_recovery_pass(
4981 } 5052 }
4982 5053
4983 error = xlog_recover_process(log, rhash, rhead, offset, 5054 error = xlog_recover_process(log, rhash, rhead, offset,
4984 pass); 5055 pass, &buffer_list);
4985 if (error) 5056 if (error)
4986 goto bread_err2; 5057 goto bread_err2;
4987 5058
@@ -5012,7 +5083,8 @@ xlog_do_recovery_pass(
5012 if (error) 5083 if (error)
5013 goto bread_err2; 5084 goto bread_err2;
5014 5085
5015 error = xlog_recover_process(log, rhash, rhead, offset, pass); 5086 error = xlog_recover_process(log, rhash, rhead, offset, pass,
5087 &buffer_list);
5016 if (error) 5088 if (error)
5017 goto bread_err2; 5089 goto bread_err2;
5018 5090
@@ -5025,10 +5097,17 @@ xlog_do_recovery_pass(
5025 bread_err1: 5097 bread_err1:
5026 xlog_put_bp(hbp); 5098 xlog_put_bp(hbp);
5027 5099
5100 /*
5101 * Submit buffers that have been added from the last record processed,
5102 * regardless of error status.
5103 */
5104 if (!list_empty(&buffer_list))
5105 error2 = xfs_buf_delwri_submit(&buffer_list);
5106
5028 if (error && first_bad) 5107 if (error && first_bad)
5029 *first_bad = rhead_blk; 5108 *first_bad = rhead_blk;
5030 5109
5031 return error; 5110 return error ? error : error2;
5032} 5111}
5033 5112
5034/* 5113/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index faeead671f9f..56e85a6c85c7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -934,6 +934,20 @@ xfs_mountfs(
934 } 934 }
935 935
936 /* 936 /*
937 * Now that the log is fully replayed, we can transition to full read-only
938 * mode for read-only mounts. This will sync all the metadata and clean
939 * the log so that the recovery we just performed does not have to be
940 * replayed again on the next mount.
941 *
942 * We use the same quiesce mechanism as the rw->ro remount, as they are
943 * semantically identical operations.
944 */
945 if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
946 XFS_MOUNT_RDONLY) {
947 xfs_quiesce_attr(mp);
948 }
949
950 /*
937 * Complete the quota initialisation, post-log-replay component. 951 * Complete the quota initialisation, post-log-replay component.
938 */ 952 */
939 if (quotamount) { 953 if (quotamount) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b36676cde103..041d9493e798 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -57,10 +57,16 @@ enum {
57 57
58#define XFS_ERR_RETRY_FOREVER -1 58#define XFS_ERR_RETRY_FOREVER -1
59 59
60/*
61 * Although retry_timeout is in jiffies which is normally an unsigned long,
62 * we limit the retry timeout to 86400 seconds, or one day. So even a
63 * signed 32-bit long is sufficient for a HZ value up to 24855. Making it
64 * signed lets us store the special "-1" value, meaning retry forever.
65 */
60struct xfs_error_cfg { 66struct xfs_error_cfg {
61 struct xfs_kobj kobj; 67 struct xfs_kobj kobj;
62 int max_retries; 68 int max_retries;
63 unsigned long retry_timeout; /* in jiffies, 0 = no timeout */ 69 long retry_timeout; /* in jiffies, -1 = infinite */
64}; 70};
65 71
66typedef struct xfs_mount { 72typedef struct xfs_mount {
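
The bound quoted in the new comment checks out: 86400 × 24855 = 2,147,472,000, just under 2^31 - 1, which leaves -1 free as the "retry forever" sentinel even on 32-bit longs. A compile-time check of that arithmetic:

        #include <assert.h>

        #define MAX_TIMEOUT_SECS        86400L  /* one day */
        #define MAX_HZ                  24855L  /* highest HZ the comment allows */

        static_assert(MAX_TIMEOUT_SECS * MAX_HZ <= 2147483647L,
                      "one day of jiffies must fit in a signed 32-bit long");
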
@@ -325,6 +331,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp)
325} 331}
326#endif 332#endif
327 333
334/* per-AG block reservation data structures */
335enum xfs_ag_resv_type {
336 XFS_AG_RESV_NONE = 0,
337 XFS_AG_RESV_METADATA,
338 XFS_AG_RESV_AGFL,
339};
340
341struct xfs_ag_resv {
342 /* number of blocks originally reserved here */
343 xfs_extlen_t ar_orig_reserved;
344 /* number of blocks reserved here */
345 xfs_extlen_t ar_reserved;
346 /* number of blocks originally asked for */
347 xfs_extlen_t ar_asked;
348};
349
328/* 350/*
329 * Per-ag incore structure, copies of information in agf and agi, to improve the 351 * Per-ag incore structure, copies of information in agf and agi, to improve the
330 * performance of allocation group selection. 352 * performance of allocation group selection.
@@ -372,8 +394,28 @@ typedef struct xfs_perag {
372 /* for rcu-safe freeing */ 394 /* for rcu-safe freeing */
373 struct rcu_head rcu_head; 395 struct rcu_head rcu_head;
374 int pagb_count; /* pagb slots in use */ 396 int pagb_count; /* pagb slots in use */
397
398 /* Blocks reserved for all kinds of metadata. */
399 struct xfs_ag_resv pag_meta_resv;
400 /* Blocks reserved for just AGFL-based metadata. */
401 struct xfs_ag_resv pag_agfl_resv;
375} xfs_perag_t; 402} xfs_perag_t;
376 403
404static inline struct xfs_ag_resv *
405xfs_perag_resv(
406 struct xfs_perag *pag,
407 enum xfs_ag_resv_type type)
408{
409 switch (type) {
410 case XFS_AG_RESV_METADATA:
411 return &pag->pag_meta_resv;
412 case XFS_AG_RESV_AGFL:
413 return &pag->pag_agfl_resv;
414 default:
415 return NULL;
416 }
417}
418
377extern void xfs_uuid_table_free(void); 419extern void xfs_uuid_table_free(void);
378extern int xfs_log_sbcount(xfs_mount_t *); 420extern int xfs_log_sbcount(xfs_mount_t *);
379extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); 421extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
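xfs_perag_resv() gives callers a single accessor over both per-AG pools; anything adjusting a reservation looks the pool up by type and must tolerate the NULL returned for XFS_AG_RESV_NONE. A hypothetical call site:

	struct xfs_ag_resv	*resv;

	resv = xfs_perag_resv(pag, XFS_AG_RESV_METADATA);
	if (resv && resv->ar_reserved >= len)
		resv->ar_reserved -= len;	/* consume reserved blocks */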
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 2500f28689d5..0432a459871c 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -51,28 +51,16 @@ xfs_rui_item_free(
51 kmem_zone_free(xfs_rui_zone, ruip); 51 kmem_zone_free(xfs_rui_zone, ruip);
52} 52}
53 53
54/*
55 * This returns the number of iovecs needed to log the given rui item.
56 * We only need 1 iovec for an rui item. It just logs the rui_log_format
57 * structure.
58 */
59static inline int
60xfs_rui_item_sizeof(
61 struct xfs_rui_log_item *ruip)
62{
63 return sizeof(struct xfs_rui_log_format) +
64 (ruip->rui_format.rui_nextents - 1) *
65 sizeof(struct xfs_map_extent);
66}
67
68STATIC void 54STATIC void
69xfs_rui_item_size( 55xfs_rui_item_size(
70 struct xfs_log_item *lip, 56 struct xfs_log_item *lip,
71 int *nvecs, 57 int *nvecs,
72 int *nbytes) 58 int *nbytes)
73{ 59{
60 struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
61
74 *nvecs += 1; 62 *nvecs += 1;
75 *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip)); 63 *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
76} 64}
77 65
78/* 66/*
@@ -97,7 +85,7 @@ xfs_rui_item_format(
97 ruip->rui_format.rui_size = 1; 85 ruip->rui_format.rui_size = 1;
98 86
99 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, 87 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format,
100 xfs_rui_item_sizeof(ruip)); 88 xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents));
101} 89}
102 90
103/* 91/*
@@ -205,16 +193,12 @@ xfs_rui_init(
205 193
206{ 194{
207 struct xfs_rui_log_item *ruip; 195 struct xfs_rui_log_item *ruip;
208 uint size;
209 196
210 ASSERT(nextents > 0); 197 ASSERT(nextents > 0);
211 if (nextents > XFS_RUI_MAX_FAST_EXTENTS) { 198 if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
212 size = (uint)(sizeof(struct xfs_rui_log_item) + 199 ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP);
213 ((nextents - 1) * sizeof(struct xfs_map_extent))); 200 else
214 ruip = kmem_zalloc(size, KM_SLEEP);
215 } else {
216 ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); 201 ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
217 }
218 202
219 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); 203 xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
220 ruip->rui_format.rui_nextents = nextents; 204 ruip->rui_format.rui_nextents = nextents;
@@ -239,14 +223,12 @@ xfs_rui_copy_format(
239 uint len; 223 uint len;
240 224
241 src_rui_fmt = buf->i_addr; 225 src_rui_fmt = buf->i_addr;
242 len = sizeof(struct xfs_rui_log_format) + 226 len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
243 (src_rui_fmt->rui_nextents - 1) *
244 sizeof(struct xfs_map_extent);
245 227
246 if (buf->i_len != len) 228 if (buf->i_len != len)
247 return -EFSCORRUPTED; 229 return -EFSCORRUPTED;
248 230
249 memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len); 231 memcpy(dst_rui_fmt, src_rui_fmt, len);
250 return 0; 232 return 0;
251} 233}
252 234
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index aefcc3a318a5..340c968e1f9c 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -70,6 +70,14 @@ struct xfs_rui_log_item {
70 struct xfs_rui_log_format rui_format; 70 struct xfs_rui_log_format rui_format;
71}; 71};
72 72
73static inline size_t
74xfs_rui_log_item_sizeof(
75 unsigned int nr)
76{
77 return offsetof(struct xfs_rui_log_item, rui_format) +
78 xfs_rui_log_format_sizeof(nr);
79}
80
73/* 81/*
74 * This is the "rmap update done" log item. It is used to log the fact that 82 * This is the "rmap update done" log item. It is used to log the fact that
75 * some rmapbt updates mentioned in an earlier rui item have been performed. 83 * some rmapbt updates mentioned in an earlier rui item have been performed.
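The new helper sizes the in-core item as offsetof() up to the embedded rui_format plus the variable-length format size, replacing the open-coded "sizeof(struct) + (nextents - 1) * sizeof(extent)" arithmetic deleted from xfs_rmap_item.c and xfs_super.c. Assuming xfs_rui_log_format_sizeof() charges for all nr extents against a flexible rui_extents[] tail (its definition lives in xfs_log_format.h, not in this hunk), the expansion is roughly:

	size = offsetof(struct xfs_rui_log_item, rui_format) +
	       sizeof(struct xfs_rui_log_format) +
	       nr * sizeof(struct xfs_map_extent);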
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fd6be45b3a1e..2d092f9577ca 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1137,7 +1137,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
1137 * Note: xfs_log_quiesce() stops background log work - the callers must ensure 1137 * Note: xfs_log_quiesce() stops background log work - the callers must ensure
1138 * it is started again when appropriate. 1138 * it is started again when appropriate.
1139 */ 1139 */
1140static void 1140void
1141xfs_quiesce_attr( 1141xfs_quiesce_attr(
1142 struct xfs_mount *mp) 1142 struct xfs_mount *mp)
1143{ 1143{
@@ -1782,9 +1782,8 @@ xfs_init_zones(void)
1782 if (!xfs_rud_zone) 1782 if (!xfs_rud_zone)
1783 goto out_destroy_icreate_zone; 1783 goto out_destroy_icreate_zone;
1784 1784
1785 xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) + 1785 xfs_rui_zone = kmem_zone_init(
1786 ((XFS_RUI_MAX_FAST_EXTENTS - 1) * 1786 xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
1787 sizeof(struct xfs_map_extent))),
1788 "xfs_rui_item"); 1787 "xfs_rui_item");
1789 if (!xfs_rui_zone) 1788 if (!xfs_rui_zone)
1790 goto out_destroy_rud_zone; 1789 goto out_destroy_rud_zone;
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 529bce9fc37e..b6418abd85ad 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -61,6 +61,7 @@ struct xfs_mount;
61struct xfs_buftarg; 61struct xfs_buftarg;
62struct block_device; 62struct block_device;
63 63
64extern void xfs_quiesce_attr(struct xfs_mount *mp);
64extern void xfs_flush_inodes(struct xfs_mount *mp); 65extern void xfs_flush_inodes(struct xfs_mount *mp);
65extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 66extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
66extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, 67extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 79cfd3fc5324..5f8d55d29a11 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -393,9 +393,15 @@ max_retries_show(
393 struct kobject *kobject, 393 struct kobject *kobject,
394 char *buf) 394 char *buf)
395{ 395{
396 int retries;
396 struct xfs_error_cfg *cfg = to_error_cfg(kobject); 397 struct xfs_error_cfg *cfg = to_error_cfg(kobject);
397 398
398 return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries); 399 if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
400 retries = -1;
401 else
402 retries = cfg->max_retries;
403
404 return snprintf(buf, PAGE_SIZE, "%d\n", retries);
399} 405}
400 406
401static ssize_t 407static ssize_t
@@ -415,7 +421,10 @@ max_retries_store(
415 if (val < -1) 421 if (val < -1)
416 return -EINVAL; 422 return -EINVAL;
417 423
418 cfg->max_retries = val; 424 if (val == -1)
425 cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
426 else
427 cfg->max_retries = val;
419 return count; 428 return count;
420} 429}
421XFS_SYSFS_ATTR_RW(max_retries); 430XFS_SYSFS_ATTR_RW(max_retries);
@@ -425,10 +434,15 @@ retry_timeout_seconds_show(
425 struct kobject *kobject, 434 struct kobject *kobject,
426 char *buf) 435 char *buf)
427{ 436{
437 int timeout;
428 struct xfs_error_cfg *cfg = to_error_cfg(kobject); 438 struct xfs_error_cfg *cfg = to_error_cfg(kobject);
429 439
430 return snprintf(buf, PAGE_SIZE, "%ld\n", 440 if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
431 jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC); 441 timeout = -1;
442 else
443 timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
444
445 return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
432} 446}
433 447
434static ssize_t 448static ssize_t
@@ -445,11 +459,16 @@ retry_timeout_seconds_store(
445 if (ret) 459 if (ret)
446 return ret; 460 return ret;
447 461
448 /* 1 day timeout maximum */ 462 /* 1 day timeout maximum, -1 means infinite */
449 if (val < 0 || val > 86400) 463 if (val < -1 || val > 86400)
450 return -EINVAL; 464 return -EINVAL;
451 465
452 cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); 466 if (val == -1)
467 cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
468 else {
469 cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
470 ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX);
471 }
453 return count; 472 return count;
454} 473}
455XFS_SYSFS_ATTR_RW(retry_timeout_seconds); 474XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
@@ -519,18 +538,19 @@ struct xfs_error_init {
519static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { 538static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
520 { .name = "default", 539 { .name = "default",
521 .max_retries = XFS_ERR_RETRY_FOREVER, 540 .max_retries = XFS_ERR_RETRY_FOREVER,
522 .retry_timeout = 0, 541 .retry_timeout = XFS_ERR_RETRY_FOREVER,
523 }, 542 },
524 { .name = "EIO", 543 { .name = "EIO",
525 .max_retries = XFS_ERR_RETRY_FOREVER, 544 .max_retries = XFS_ERR_RETRY_FOREVER,
526 .retry_timeout = 0, 545 .retry_timeout = XFS_ERR_RETRY_FOREVER,
527 }, 546 },
528 { .name = "ENOSPC", 547 { .name = "ENOSPC",
529 .max_retries = XFS_ERR_RETRY_FOREVER, 548 .max_retries = XFS_ERR_RETRY_FOREVER,
530 .retry_timeout = 0, 549 .retry_timeout = XFS_ERR_RETRY_FOREVER,
531 }, 550 },
532 { .name = "ENODEV", 551 { .name = "ENODEV",
533 .max_retries = 0, 552 .max_retries = 0, /* We can't recover from devices disappearing */
553 .retry_timeout = 0,
534 }, 554 },
535}; 555};
536 556
@@ -561,7 +581,10 @@ xfs_error_sysfs_init_class(
561 goto out_error; 581 goto out_error;
562 582
563 cfg->max_retries = init[i].max_retries; 583 cfg->max_retries = init[i].max_retries;
564 cfg->retry_timeout = msecs_to_jiffies( 584 if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER)
585 cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
586 else
587 cfg->retry_timeout = msecs_to_jiffies(
565 init[i].retry_timeout * MSEC_PER_SEC); 588 init[i].retry_timeout * MSEC_PER_SEC);
566 } 589 }
567 return 0; 590 return 0;
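Ordering is the point of these sysfs changes: -1 must be intercepted before any unit conversion, because feeding a negative second count through msecs_to_jiffies() yields a clamped huge jiffies value, not the XFS_ERR_RETRY_FOREVER sentinel the show side tests for. The store-side hazard, sketched:

	/* BAD: the sentinel must never reach the converter */
	cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);

	/* GOOD: map the sentinel first, convert only real timeouts */
	if (val == -1)
		cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
	else
		cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);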
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index d303a665dba9..c6b2b1dcde75 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1570,14 +1570,15 @@ TRACE_EVENT(xfs_agf,
1570 1570
1571TRACE_EVENT(xfs_free_extent, 1571TRACE_EVENT(xfs_free_extent,
1572 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1572 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
1573 xfs_extlen_t len, bool isfl, int haveleft, int haveright), 1573 xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft,
1574 TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), 1574 int haveright),
1575 TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright),
1575 TP_STRUCT__entry( 1576 TP_STRUCT__entry(
1576 __field(dev_t, dev) 1577 __field(dev_t, dev)
1577 __field(xfs_agnumber_t, agno) 1578 __field(xfs_agnumber_t, agno)
1578 __field(xfs_agblock_t, agbno) 1579 __field(xfs_agblock_t, agbno)
1579 __field(xfs_extlen_t, len) 1580 __field(xfs_extlen_t, len)
1580 __field(int, isfl) 1581 __field(int, resv)
1581 __field(int, haveleft) 1582 __field(int, haveleft)
1582 __field(int, haveright) 1583 __field(int, haveright)
1583 ), 1584 ),
@@ -1586,16 +1587,16 @@ TRACE_EVENT(xfs_free_extent,
1586 __entry->agno = agno; 1587 __entry->agno = agno;
1587 __entry->agbno = agbno; 1588 __entry->agbno = agbno;
1588 __entry->len = len; 1589 __entry->len = len;
1589 __entry->isfl = isfl; 1590 __entry->resv = resv;
1590 __entry->haveleft = haveleft; 1591 __entry->haveleft = haveleft;
1591 __entry->haveright = haveright; 1592 __entry->haveright = haveright;
1592 ), 1593 ),
1593 TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", 1594 TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s",
1594 MAJOR(__entry->dev), MINOR(__entry->dev), 1595 MAJOR(__entry->dev), MINOR(__entry->dev),
1595 __entry->agno, 1596 __entry->agno,
1596 __entry->agbno, 1597 __entry->agbno,
1597 __entry->len, 1598 __entry->len,
1598 __entry->isfl, 1599 __entry->resv,
1599 __entry->haveleft ? 1600 __entry->haveleft ?
1600 (__entry->haveright ? "both" : "left") : 1601 (__entry->haveright ? "both" : "left") :
1601 (__entry->haveright ? "right" : "none")) 1602 (__entry->haveright ? "right" : "none"))
@@ -1622,8 +1623,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
1622 __field(short, otype) 1623 __field(short, otype)
1623 __field(char, wasdel) 1624 __field(char, wasdel)
1624 __field(char, wasfromfl) 1625 __field(char, wasfromfl)
1625 __field(char, isfl) 1626 __field(int, resv)
1626 __field(char, userdata) 1627 __field(int, datatype)
1627 __field(xfs_fsblock_t, firstblock) 1628 __field(xfs_fsblock_t, firstblock)
1628 ), 1629 ),
1629 TP_fast_assign( 1630 TP_fast_assign(
@@ -1643,14 +1644,14 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
1643 __entry->otype = args->otype; 1644 __entry->otype = args->otype;
1644 __entry->wasdel = args->wasdel; 1645 __entry->wasdel = args->wasdel;
1645 __entry->wasfromfl = args->wasfromfl; 1646 __entry->wasfromfl = args->wasfromfl;
1646 __entry->isfl = args->isfl; 1647 __entry->resv = args->resv;
1647 __entry->userdata = args->userdata; 1648 __entry->datatype = args->datatype;
1648 __entry->firstblock = args->firstblock; 1649 __entry->firstblock = args->firstblock;
1649 ), 1650 ),
1650 TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " 1651 TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
1651 "prod %u minleft %u total %u alignment %u minalignslop %u " 1652 "prod %u minleft %u total %u alignment %u minalignslop %u "
1652 "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " 1653 "len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
1653 "userdata %d firstblock 0x%llx", 1654 "datatype 0x%x firstblock 0x%llx",
1654 MAJOR(__entry->dev), MINOR(__entry->dev), 1655 MAJOR(__entry->dev), MINOR(__entry->dev),
1655 __entry->agno, 1656 __entry->agno,
1656 __entry->agbno, 1657 __entry->agbno,
@@ -1667,8 +1668,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
1667 __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), 1668 __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
1668 __entry->wasdel, 1669 __entry->wasdel,
1669 __entry->wasfromfl, 1670 __entry->wasfromfl,
1670 __entry->isfl, 1671 __entry->resv,
1671 __entry->userdata, 1672 __entry->datatype,
1672 (unsigned long long)__entry->firstblock) 1673 (unsigned long long)__entry->firstblock)
1673) 1674)
1674 1675
@@ -1984,6 +1985,29 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
1984DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); 1985DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1985DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); 1986DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1986 1987
1988TRACE_EVENT(xfs_log_recover_record,
1989 TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
1990 TP_ARGS(log, rhead, pass),
1991 TP_STRUCT__entry(
1992 __field(dev_t, dev)
1993 __field(xfs_lsn_t, lsn)
1994 __field(int, len)
1995 __field(int, num_logops)
1996 __field(int, pass)
1997 ),
1998 TP_fast_assign(
1999 __entry->dev = log->l_mp->m_super->s_dev;
2000 __entry->lsn = be64_to_cpu(rhead->h_lsn);
2001 __entry->len = be32_to_cpu(rhead->h_len);
2002 __entry->num_logops = be32_to_cpu(rhead->h_num_logops);
2003 __entry->pass = pass;
2004 ),
2005 TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d",
2006 MAJOR(__entry->dev), MINOR(__entry->dev),
2007 __entry->lsn, __entry->len, __entry->num_logops,
2008 __entry->pass)
2009)
2010
1987DECLARE_EVENT_CLASS(xfs_log_recover_item_class, 2011DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
1988 TP_PROTO(struct xlog *log, struct xlog_recover *trans, 2012 TP_PROTO(struct xlog *log, struct xlog_recover *trans,
1989 struct xlog_recover_item *item, int pass), 2013 struct xlog_recover_item *item, int pass),
@@ -1992,6 +2016,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
1992 __field(dev_t, dev) 2016 __field(dev_t, dev)
1993 __field(unsigned long, item) 2017 __field(unsigned long, item)
1994 __field(xlog_tid_t, tid) 2018 __field(xlog_tid_t, tid)
2019 __field(xfs_lsn_t, lsn)
1995 __field(int, type) 2020 __field(int, type)
1996 __field(int, pass) 2021 __field(int, pass)
1997 __field(int, count) 2022 __field(int, count)
@@ -2001,15 +2026,17 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
2001 __entry->dev = log->l_mp->m_super->s_dev; 2026 __entry->dev = log->l_mp->m_super->s_dev;
2002 __entry->item = (unsigned long)item; 2027 __entry->item = (unsigned long)item;
2003 __entry->tid = trans->r_log_tid; 2028 __entry->tid = trans->r_log_tid;
2029 __entry->lsn = trans->r_lsn;
2004 __entry->type = ITEM_TYPE(item); 2030 __entry->type = ITEM_TYPE(item);
2005 __entry->pass = pass; 2031 __entry->pass = pass;
2006 __entry->count = item->ri_cnt; 2032 __entry->count = item->ri_cnt;
2007 __entry->total = item->ri_total; 2033 __entry->total = item->ri_total;
2008 ), 2034 ),
2009 TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " 2035 TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, "
2010 "item region count/total %d/%d", 2036 "item type %s item region count/total %d/%d",
2011 MAJOR(__entry->dev), MINOR(__entry->dev), 2037 MAJOR(__entry->dev), MINOR(__entry->dev),
2012 __entry->tid, 2038 __entry->tid,
2039 __entry->lsn,
2013 __entry->pass, 2040 __entry->pass,
2014 (void *)__entry->item, 2041 (void *)__entry->item,
2015 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), 2042 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
@@ -2068,6 +2095,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
2068DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); 2095DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
2069DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); 2096DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
2070DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); 2097DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
2098DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip);
2071DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); 2099DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
2072DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); 2100DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
2073DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); 2101DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
@@ -2558,6 +2586,60 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
2558DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); 2586DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
2559DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); 2587DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
2560 2588
2589/* per-AG reservation */
2590DECLARE_EVENT_CLASS(xfs_ag_resv_class,
2591 TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv,
2592 xfs_extlen_t len),
2593 TP_ARGS(pag, resv, len),
2594 TP_STRUCT__entry(
2595 __field(dev_t, dev)
2596 __field(xfs_agnumber_t, agno)
2597 __field(int, resv)
2598 __field(xfs_extlen_t, freeblks)
2599 __field(xfs_extlen_t, flcount)
2600 __field(xfs_extlen_t, reserved)
2601 __field(xfs_extlen_t, asked)
2602 __field(xfs_extlen_t, len)
2603 ),
2604 TP_fast_assign(
2605 struct xfs_ag_resv *r = xfs_perag_resv(pag, resv);
2606
2607 __entry->dev = pag->pag_mount->m_super->s_dev;
2608 __entry->agno = pag->pag_agno;
2609 __entry->resv = resv;
2610 __entry->freeblks = pag->pagf_freeblks;
2611 __entry->flcount = pag->pagf_flcount;
2612 __entry->reserved = r ? r->ar_reserved : 0;
2613 __entry->asked = r ? r->ar_asked : 0;
2614 __entry->len = len;
2615 ),
 2616 TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u",
2617 MAJOR(__entry->dev), MINOR(__entry->dev),
2618 __entry->agno,
2619 __entry->resv,
2620 __entry->freeblks,
2621 __entry->flcount,
2622 __entry->reserved,
2623 __entry->asked,
2624 __entry->len)
2625)
2626#define DEFINE_AG_RESV_EVENT(name) \
2627DEFINE_EVENT(xfs_ag_resv_class, name, \
2628 TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \
2629 xfs_extlen_t len), \
2630 TP_ARGS(pag, type, len))
2631
2632/* per-AG reservation tracepoints */
2633DEFINE_AG_RESV_EVENT(xfs_ag_resv_init);
2634DEFINE_AG_RESV_EVENT(xfs_ag_resv_free);
2635DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent);
2636DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
2637DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
2638DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
2639
2640DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
2641DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
2642
2561#endif /* _TRACE_XFS_H */ 2643#endif /* _TRACE_XFS_H */
2562 2644
2563#undef TRACE_INCLUDE_PATH 2645#undef TRACE_INCLUDE_PATH
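Each DEFINE_AG_RESV_EVENT() stamps out one tracepoint from the shared class, invoked from the reservation code as trace_<name>(). A hypothetical call site in xfs_ag_resv.c (not part of this hunk):

	trace_xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_METADATA, args->len);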
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 5f3d33d16e67..70f42ea86dfb 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -217,7 +217,7 @@ undo_log:
217 217
218undo_blocks: 218undo_blocks:
219 if (blocks > 0) { 219 if (blocks > 0) {
220 xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); 220 xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
221 tp->t_blk_res = 0; 221 tp->t_blk_res = 0;
222 } 222 }
223 223
@@ -318,7 +318,6 @@ xfs_trans_mod_sb(
318 * in-core superblock's counter. This should only 318 * in-core superblock's counter. This should only
319 * be applied to the on-disk superblock. 319 * be applied to the on-disk superblock.
320 */ 320 */
321 ASSERT(delta < 0);
322 tp->t_res_fdblocks_delta += delta; 321 tp->t_res_fdblocks_delta += delta;
323 if (xfs_sb_version_haslazysbcount(&mp->m_sb)) 322 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
324 flags &= ~XFS_TRANS_SB_DIRTY; 323 flags &= ~XFS_TRANS_SB_DIRTY;
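The undo_blocks hunk is a sign fix: a few lines earlier the reservation path pulled blocks out of the free pool with a negative delta, so the unwind has to add the same count back; the old code subtracted it a second time and leaked free space on transaction-reservation failure. The paired calls, condensed:

	/* reserve: remove blocks from the free-block pool */
	error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);

	/* undo on later failure: return them with a positive delta */
	xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);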
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 459ddec137a4..ab438647592a 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -79,7 +79,8 @@ xfs_trans_free_extent(
79 79
80 trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); 80 trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
81 81
82 error = xfs_free_extent(tp, start_block, ext_len, oinfo); 82 error = xfs_free_extent(tp, start_block, ext_len, oinfo,
83 XFS_AG_RESV_NONE);
83 84
84 /* 85 /*
85 * Mark the transaction dirty, even on error. This ensures the 86 * Mark the transaction dirty, even on error. This ensures the
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index ea62245fee26..62900938f26d 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -147,6 +147,7 @@ __xfs_xattr_put_listent(
147 arraytop = context->count + prefix_len + namelen + 1; 147 arraytop = context->count + prefix_len + namelen + 1;
148 if (arraytop > context->firstu) { 148 if (arraytop > context->firstu) {
149 context->count = -1; /* insufficient space */ 149 context->count = -1; /* insufficient space */
150 context->seen_enough = 1;
150 return 0; 151 return 0;
151 } 152 }
152 offset = (char *)context->alist + context->count; 153 offset = (char *)context->alist + context->count;
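The one-line xattr change closes a listing bug: the callback reports an undersized buffer by setting context->count to -1 and returning 0, but without seen_enough the attribute walk kept calling it, letting later entries clobber the -1 marker. A sketch of the walker contract (next_attr() and the attr field layout are illustrative, not the real iterator):

	while ((attr = next_attr(context)) != NULL) {
		context->put_listent(context, attr->flags, attr->name,
				     attr->namelen, attr->valuelen);
		if (context->seen_enough)
			break;	/* overflow recorded; stop the walk */
	}
	/* context->count == -1 now reliably maps to -ERANGE */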
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9c6dc7704043..add6c4bc568f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -6,13 +6,19 @@
6#include <linux/radix-tree.h> 6#include <linux/radix-tree.h>
7#include <asm/pgtable.h> 7#include <asm/pgtable.h>
8 8
9struct iomap_ops;
10
9/* We use lowest available exceptional entry bit for locking */ 11/* We use lowest available exceptional entry bit for locking */
10#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) 12#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
11 13
14ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
15 struct iomap_ops *ops);
12ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, 16ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
13 get_block_t, dio_iodone_t, int flags); 17 get_block_t, dio_iodone_t, int flags);
14int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); 18int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
15int dax_truncate_page(struct inode *, loff_t from, get_block_t); 19int dax_truncate_page(struct inode *, loff_t from, get_block_t);
20int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
21 struct iomap_ops *ops);
16int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); 22int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
17int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); 23int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
18void dax_wake_mapping_entry_waiter(struct address_space *mapping, 24void dax_wake_mapping_entry_waiter(struct address_space *mapping,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3d70ece10313..e63e288dee83 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -22,6 +22,8 @@ struct vm_fault;
22 * Flags for iomap mappings: 22 * Flags for iomap mappings:
23 */ 23 */
24#define IOMAP_F_MERGED 0x01 /* contains multiple blocks/extents */ 24#define IOMAP_F_MERGED 0x01 /* contains multiple blocks/extents */
25#define IOMAP_F_SHARED 0x02 /* block shared with another file */
26#define IOMAP_F_NEW 0x04 /* blocks have been newly allocated */
25 27
26/* 28/*
27 * Magic value for blkno: 29 * Magic value for blkno:
@@ -64,6 +66,8 @@ struct iomap_ops {
64 66
65ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, 67ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
66 struct iomap_ops *ops); 68 struct iomap_ops *ops);
69int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
70 struct iomap_ops *ops);
67int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, 71int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
68 bool *did_zero, struct iomap_ops *ops); 72 bool *did_zero, struct iomap_ops *ops);
69int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, 73int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
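With iomap_dax_rw() exported, a filesystem's DAX read path shrinks to taking its own locks and handing the iocb plus an iomap_ops table to the library. A minimal sketch of the wiring, with foo_iomap_ops standing in for the filesystem's real ops and generic VFS locking standing in for whatever the filesystem actually uses:

	static ssize_t
	foo_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
	{
		struct inode	*inode = file_inode(iocb->ki_filp);
		ssize_t		ret;

		inode_lock_shared(inode);	/* serialise against truncate */
		ret = iomap_dax_rw(iocb, to, &foo_iomap_ops);
		inode_unlock_shared(inode);

		file_accessed(iocb->ki_filp);
		return ret;
	}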
diff --git a/mm/filemap.c b/mm/filemap.c
index 4bad32dd4b3b..68f1813fbdc3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1923,16 +1923,18 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1923 if (iocb->ki_flags & IOCB_DIRECT) { 1923 if (iocb->ki_flags & IOCB_DIRECT) {
1924 struct address_space *mapping = file->f_mapping; 1924 struct address_space *mapping = file->f_mapping;
1925 struct inode *inode = mapping->host; 1925 struct inode *inode = mapping->host;
1926 struct iov_iter data = *iter;
1926 loff_t size; 1927 loff_t size;
1927 1928
1928 size = i_size_read(inode); 1929 size = i_size_read(inode);
1929 retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, 1930 retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
1930 iocb->ki_pos + count - 1); 1931 iocb->ki_pos + count - 1);
1931 if (!retval) { 1932 if (retval < 0)
1932 struct iov_iter data = *iter; 1933 goto out;
1933 retval = mapping->a_ops->direct_IO(iocb, &data);
1934 }
1935 1934
1935 file_accessed(file);
1936
1937 retval = mapping->a_ops->direct_IO(iocb, &data);
1936 if (retval > 0) { 1938 if (retval > 0) {
1937 iocb->ki_pos += retval; 1939 iocb->ki_pos += retval;
1938 iov_iter_advance(iter, retval); 1940 iov_iter_advance(iter, retval);
@@ -1948,10 +1950,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1948 * DAX files, so don't bother trying. 1950 * DAX files, so don't bother trying.
1949 */ 1951 */
1950 if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size || 1952 if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size ||
1951 IS_DAX(inode)) { 1953 IS_DAX(inode))
1952 file_accessed(file);
1953 goto out; 1954 goto out;
1954 }
1955 } 1955 }
1956 1956
1957 retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval); 1957 retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);
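Last, the filemap.c rework hoists file_accessed() ahead of ->direct_IO(): with AIO the request can complete, and aio_complete() drop the final file reference, before submission returns, so touching the file to update atime afterwards risks a use-after-free. It also fails fast when filemap_write_and_wait_range() errors instead of silently skipping the direct I/O. The resulting shape of the direct-read branch, condensed:

	retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
					      iocb->ki_pos + count - 1);
	if (retval < 0)
		goto out;

	file_accessed(file);			/* atime before issuing I/O */
	retval = mapping->a_ops->direct_IO(iocb, &data);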