aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/xfs.txt12
-rw-r--r--fs/dax.c34
-rw-r--r--fs/ext2/file.c4
-rw-r--r--fs/ext4/file.c16
-rw-r--r--fs/ext4/inode.c21
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c42
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr.c25
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c8
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c57
-rw-r--r--fs/xfs/libxfs/xfs_format.h52
-rw-r--r--fs/xfs/libxfs/xfs_fs.h1
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c551
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h12
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c93
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h10
-rw-r--r--fs/xfs/libxfs/xfs_sb.c34
-rw-r--r--fs/xfs/libxfs/xfs_shared.h6
-rw-r--r--fs/xfs/xfs_aops.c158
-rw-r--r--fs/xfs/xfs_aops.h7
-rw-r--r--fs/xfs/xfs_attr_inactive.c83
-rw-r--r--fs/xfs/xfs_bmap_util.c89
-rw-r--r--fs/xfs/xfs_dquot.c8
-rw-r--r--fs/xfs/xfs_file.c168
-rw-r--r--fs/xfs/xfs_fsops.c10
-rw-r--r--fs/xfs/xfs_inode.c224
-rw-r--r--fs/xfs/xfs_ioctl.c14
-rw-r--r--fs/xfs/xfs_iomap.c18
-rw-r--r--fs/xfs/xfs_iops.c48
-rw-r--r--fs/xfs/xfs_itable.c13
-rw-r--r--fs/xfs/xfs_log.c11
-rw-r--r--fs/xfs/xfs_log.h13
-rw-r--r--fs/xfs/xfs_log_cil.c12
-rw-r--r--fs/xfs/xfs_log_recover.c34
-rw-r--r--fs/xfs/xfs_mount.c50
-rw-r--r--fs/xfs/xfs_mount.h4
-rw-r--r--fs/xfs/xfs_pnfs.c4
-rw-r--r--fs/xfs/xfs_qm.c7
-rw-r--r--fs/xfs/xfs_qm_syscalls.c20
-rw-r--r--fs/xfs/xfs_quota.h1
-rw-r--r--fs/xfs/xfs_rtalloc.c16
-rw-r--r--fs/xfs/xfs_super.c25
-rw-r--r--fs/xfs/xfs_symlink.c17
-rw-r--r--fs/xfs/xfs_trace.h47
-rw-r--r--fs/xfs/xfs_trans.c91
-rw-r--r--fs/xfs/xfs_trans.h7
-rw-r--r--fs/xfs/xfs_trans_dquot.c32
-rw-r--r--fs/xfs/xfs_trans_priv.h2
-rw-r--r--include/linux/fs.h9
-rw-r--r--include/linux/percpu_counter.h13
-rw-r--r--lib/percpu_counter.c6
52 files changed, 1516 insertions, 727 deletions
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 5a5a05582b58..8146e9fd5ffc 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -236,10 +236,10 @@ Removed Mount Options
236 236
237 Name Removed 237 Name Removed
238 ---- ------- 238 ---- -------
239 delaylog/nodelaylog v3.20 239 delaylog/nodelaylog v4.0
240 ihashsize v3.20 240 ihashsize v4.0
241 irixsgid v3.20 241 irixsgid v4.0
242 osyncisdsync/osyncisosync v3.20 242 osyncisdsync/osyncisosync v4.0
243 243
244 244
245sysctls 245sysctls
@@ -346,5 +346,5 @@ Removed Sysctls
346 346
347 Name Removed 347 Name Removed
348 ---- ------- 348 ---- -------
349 fs.xfs.xfsbufd_centisec v3.20 349 fs.xfs.xfsbufd_centisec v4.0
350 fs.xfs.age_buffer_centisecs v3.20 350 fs.xfs.age_buffer_centisecs v4.0
diff --git a/fs/dax.c b/fs/dax.c
index 6f65f00e58ec..99b5fbc38992 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
309 out: 309 out:
310 i_mmap_unlock_read(mapping); 310 i_mmap_unlock_read(mapping);
311 311
312 if (bh->b_end_io)
313 bh->b_end_io(bh, 1);
314
315 return error; 312 return error;
316} 313}
317 314
318static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 315/**
319 get_block_t get_block) 316 * __dax_fault - handle a page fault on a DAX file
317 * @vma: The virtual memory area where the fault occurred
318 * @vmf: The description of the fault
319 * @get_block: The filesystem method used to translate file offsets to blocks
320 *
321 * When a page fault occurs, filesystems may call this helper in their
322 * fault handler for DAX files. __dax_fault() assumes the caller has done all
323 * the necessary locking for the page fault to proceed successfully.
324 */
325int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
326 get_block_t get_block, dax_iodone_t complete_unwritten)
320{ 327{
321 struct file *file = vma->vm_file; 328 struct file *file = vma->vm_file;
322 struct address_space *mapping = file->f_mapping; 329 struct address_space *mapping = file->f_mapping;
@@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
417 page_cache_release(page); 424 page_cache_release(page);
418 } 425 }
419 426
427 /*
428 * If we successfully insert the new mapping over an unwritten extent,
429 * we need to ensure we convert the unwritten extent. If there is an
430 * error inserting the mapping, the filesystem needs to leave it as
431 * unwritten to prevent exposure of the stale underlying data to
432 * userspace, but we still need to call the completion function so
433 * the private resources on the mapping buffer can be released. We
434 * indicate what the callback should do via the uptodate variable, same
435 * as for normal BH based IO completions.
436 */
420 error = dax_insert_mapping(inode, &bh, vma, vmf); 437 error = dax_insert_mapping(inode, &bh, vma, vmf);
438 if (buffer_unwritten(&bh))
439 complete_unwritten(&bh, !error);
421 440
422 out: 441 out:
423 if (error == -ENOMEM) 442 if (error == -ENOMEM)
@@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
434 } 453 }
435 goto out; 454 goto out;
436} 455}
456EXPORT_SYMBOL(__dax_fault);
437 457
438/** 458/**
439 * dax_fault - handle a page fault on a DAX file 459 * dax_fault - handle a page fault on a DAX file
@@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
445 * fault handler for DAX files. 465 * fault handler for DAX files.
446 */ 466 */
447int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 467int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
448 get_block_t get_block) 468 get_block_t get_block, dax_iodone_t complete_unwritten)
449{ 469{
450 int result; 470 int result;
451 struct super_block *sb = file_inode(vma->vm_file)->i_sb; 471 struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
454 sb_start_pagefault(sb); 474 sb_start_pagefault(sb);
455 file_update_time(vma->vm_file); 475 file_update_time(vma->vm_file);
456 } 476 }
457 result = do_dax_fault(vma, vmf, get_block); 477 result = __dax_fault(vma, vmf, get_block, complete_unwritten);
458 if (vmf->flags & FAULT_FLAG_WRITE) 478 if (vmf->flags & FAULT_FLAG_WRITE)
459 sb_end_pagefault(sb); 479 sb_end_pagefault(sb);
460 480
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3a0a6c6406d0..3b57c9f83c9b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -28,12 +28,12 @@
28#ifdef CONFIG_FS_DAX 28#ifdef CONFIG_FS_DAX
29static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 29static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
30{ 30{
31 return dax_fault(vma, vmf, ext2_get_block); 31 return dax_fault(vma, vmf, ext2_get_block, NULL);
32} 32}
33 33
34static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 34static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
35{ 35{
36 return dax_mkwrite(vma, vmf, ext2_get_block); 36 return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
37} 37}
38 38
39static const struct vm_operations_struct ext2_dax_vm_ops = { 39static const struct vm_operations_struct ext2_dax_vm_ops = {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0613c256c344..f713cfcc43a2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -192,15 +192,27 @@ out:
192} 192}
193 193
194#ifdef CONFIG_FS_DAX 194#ifdef CONFIG_FS_DAX
195static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
196{
197 struct inode *inode = bh->b_assoc_map->host;
198 /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
199 loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
200 int err;
201 if (!uptodate)
202 return;
203 WARN_ON(!buffer_unwritten(bh));
204 err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
205}
206
195static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 207static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
196{ 208{
197 return dax_fault(vma, vmf, ext4_get_block); 209 return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
198 /* Is this the right get_block? */ 210 /* Is this the right get_block? */
199} 211}
200 212
201static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 213static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
202{ 214{
203 return dax_mkwrite(vma, vmf, ext4_get_block); 215 return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
204} 216}
205 217
206static const struct vm_operations_struct ext4_dax_vm_ops = { 218static const struct vm_operations_struct ext4_dax_vm_ops = {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 55b187c3bac1..7c38ed3494cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -656,18 +656,6 @@ has_zeroout:
656 return retval; 656 return retval;
657} 657}
658 658
659static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
660{
661 struct inode *inode = bh->b_assoc_map->host;
662 /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
663 loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
664 int err;
665 if (!uptodate)
666 return;
667 WARN_ON(!buffer_unwritten(bh));
668 err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
669}
670
671/* Maximum number of blocks we map for direct IO at once. */ 659/* Maximum number of blocks we map for direct IO at once. */
672#define DIO_MAX_BLOCKS 4096 660#define DIO_MAX_BLOCKS 4096
673 661
@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
705 693
706 map_bh(bh, inode->i_sb, map.m_pblk); 694 map_bh(bh, inode->i_sb, map.m_pblk);
707 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 695 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
708 if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { 696 if (IS_DAX(inode) && buffer_unwritten(bh)) {
697 /*
698 * dgc: I suspect unwritten conversion on ext4+DAX is
699 * fundamentally broken here when there are concurrent
700 * read/write in progress on this inode.
701 */
702 WARN_ON_ONCE(io_end);
709 bh->b_assoc_map = inode->i_mapping; 703 bh->b_assoc_map = inode->i_mapping;
710 bh->b_private = (void *)(unsigned long)iblock; 704 bh->b_private = (void *)(unsigned long)iblock;
711 bh->b_end_io = ext4_end_io_unwritten;
712 } 705 }
713 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) 706 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
714 set_buffer_defer_completion(bh); 707 set_buffer_defer_completion(bh);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 029078167b64..f9e9ffe6fb46 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -149,13 +149,27 @@ xfs_alloc_compute_aligned(
149{ 149{
150 xfs_agblock_t bno; 150 xfs_agblock_t bno;
151 xfs_extlen_t len; 151 xfs_extlen_t len;
152 xfs_extlen_t diff;
152 153
153 /* Trim busy sections out of found extent */ 154 /* Trim busy sections out of found extent */
154 xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); 155 xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
155 156
157 /*
158 * If we have a largish extent that happens to start before min_agbno,
159 * see if we can shift it into range...
160 */
161 if (bno < args->min_agbno && bno + len > args->min_agbno) {
162 diff = args->min_agbno - bno;
163 if (len > diff) {
164 bno += diff;
165 len -= diff;
166 }
167 }
168
156 if (args->alignment > 1 && len >= args->minlen) { 169 if (args->alignment > 1 && len >= args->minlen) {
157 xfs_agblock_t aligned_bno = roundup(bno, args->alignment); 170 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
158 xfs_extlen_t diff = aligned_bno - bno; 171
172 diff = aligned_bno - bno;
159 173
160 *resbno = aligned_bno; 174 *resbno = aligned_bno;
161 *reslen = diff >= len ? 0 : len - diff; 175 *reslen = diff >= len ? 0 : len - diff;
@@ -795,9 +809,13 @@ xfs_alloc_find_best_extent(
795 * The good extent is closer than this one. 809 * The good extent is closer than this one.
796 */ 810 */
797 if (!dir) { 811 if (!dir) {
812 if (*sbnoa > args->max_agbno)
813 goto out_use_good;
798 if (*sbnoa >= args->agbno + gdiff) 814 if (*sbnoa >= args->agbno + gdiff)
799 goto out_use_good; 815 goto out_use_good;
800 } else { 816 } else {
817 if (*sbnoa < args->min_agbno)
818 goto out_use_good;
801 if (*sbnoa <= args->agbno - gdiff) 819 if (*sbnoa <= args->agbno - gdiff)
802 goto out_use_good; 820 goto out_use_good;
803 } 821 }
@@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near(
884 dofirst = prandom_u32() & 1; 902 dofirst = prandom_u32() & 1;
885#endif 903#endif
886 904
905 /* handle uninitialized agbno range so caller doesn't have to */
906 if (!args->min_agbno && !args->max_agbno)
907 args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
908 ASSERT(args->min_agbno <= args->max_agbno);
909
910 /* clamp agbno to the range if it's outside */
911 if (args->agbno < args->min_agbno)
912 args->agbno = args->min_agbno;
913 if (args->agbno > args->max_agbno)
914 args->agbno = args->max_agbno;
915
887restart: 916restart:
888 bno_cur_lt = NULL; 917 bno_cur_lt = NULL;
889 bno_cur_gt = NULL; 918 bno_cur_gt = NULL;
@@ -976,6 +1005,8 @@ restart:
976 &ltbnoa, &ltlena); 1005 &ltbnoa, &ltlena);
977 if (ltlena < args->minlen) 1006 if (ltlena < args->minlen)
978 continue; 1007 continue;
1008 if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
1009 continue;
979 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1010 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
980 xfs_alloc_fix_len(args); 1011 xfs_alloc_fix_len(args);
981 ASSERT(args->len >= args->minlen); 1012 ASSERT(args->len >= args->minlen);
@@ -1096,11 +1127,11 @@ restart:
1096 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); 1127 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1097 xfs_alloc_compute_aligned(args, ltbno, ltlen, 1128 xfs_alloc_compute_aligned(args, ltbno, ltlen,
1098 &ltbnoa, &ltlena); 1129 &ltbnoa, &ltlena);
1099 if (ltlena >= args->minlen) 1130 if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
1100 break; 1131 break;
1101 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) 1132 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
1102 goto error0; 1133 goto error0;
1103 if (!i) { 1134 if (!i || ltbnoa < args->min_agbno) {
1104 xfs_btree_del_cursor(bno_cur_lt, 1135 xfs_btree_del_cursor(bno_cur_lt,
1105 XFS_BTREE_NOERROR); 1136 XFS_BTREE_NOERROR);
1106 bno_cur_lt = NULL; 1137 bno_cur_lt = NULL;
@@ -1112,11 +1143,11 @@ restart:
1112 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); 1143 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1113 xfs_alloc_compute_aligned(args, gtbno, gtlen, 1144 xfs_alloc_compute_aligned(args, gtbno, gtlen,
1114 &gtbnoa, &gtlena); 1145 &gtbnoa, &gtlena);
1115 if (gtlena >= args->minlen) 1146 if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
1116 break; 1147 break;
1117 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) 1148 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
1118 goto error0; 1149 goto error0;
1119 if (!i) { 1150 if (!i || gtbnoa > args->max_agbno) {
1120 xfs_btree_del_cursor(bno_cur_gt, 1151 xfs_btree_del_cursor(bno_cur_gt,
1121 XFS_BTREE_NOERROR); 1152 XFS_BTREE_NOERROR);
1122 bno_cur_gt = NULL; 1153 bno_cur_gt = NULL;
@@ -1216,6 +1247,7 @@ restart:
1216 ASSERT(ltnew >= ltbno); 1247 ASSERT(ltnew >= ltbno);
1217 ASSERT(ltnew + rlen <= ltbnoa + ltlena); 1248 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1218 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1249 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1250 ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
1219 args->agbno = ltnew; 1251 args->agbno = ltnew;
1220 1252
1221 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, 1253 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 7d59b8f4bf9e..ca1c8168373a 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg {
112 xfs_extlen_t total; /* total blocks needed in xaction */ 112 xfs_extlen_t total; /* total blocks needed in xaction */
113 xfs_extlen_t alignment; /* align answer to multiple of this */ 113 xfs_extlen_t alignment; /* align answer to multiple of this */
114 xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ 114 xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
115 xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */
116 xfs_agblock_t max_agbno; /* ... */
115 xfs_extlen_t len; /* output: actual size of extent */ 117 xfs_extlen_t len; /* output: actual size of extent */
116 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ 118 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
117 xfs_alloctype_t otype; /* original allocation type */ 119 xfs_alloctype_t otype; /* original allocation type */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 0a472fbe06d4..3349c9a1e845 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -266,7 +266,7 @@ xfs_attr_set(
266 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; 266 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
267 error = xfs_trans_reserve(args.trans, &tres, args.total, 0); 267 error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
268 if (error) { 268 if (error) {
269 xfs_trans_cancel(args.trans, 0); 269 xfs_trans_cancel(args.trans);
270 return error; 270 return error;
271 } 271 }
272 xfs_ilock(dp, XFS_ILOCK_EXCL); 272 xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -276,7 +276,7 @@ xfs_attr_set(
276 XFS_QMOPT_RES_REGBLKS); 276 XFS_QMOPT_RES_REGBLKS);
277 if (error) { 277 if (error) {
278 xfs_iunlock(dp, XFS_ILOCK_EXCL); 278 xfs_iunlock(dp, XFS_ILOCK_EXCL);
279 xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); 279 xfs_trans_cancel(args.trans);
280 return error; 280 return error;
281 } 281 }
282 282
@@ -320,8 +320,7 @@ xfs_attr_set(
320 xfs_trans_ichgtime(args.trans, dp, 320 xfs_trans_ichgtime(args.trans, dp,
321 XFS_ICHGTIME_CHG); 321 XFS_ICHGTIME_CHG);
322 } 322 }
323 err2 = xfs_trans_commit(args.trans, 323 err2 = xfs_trans_commit(args.trans);
324 XFS_TRANS_RELEASE_LOG_RES);
325 xfs_iunlock(dp, XFS_ILOCK_EXCL); 324 xfs_iunlock(dp, XFS_ILOCK_EXCL);
326 325
327 return error ? error : err2; 326 return error ? error : err2;
@@ -383,16 +382,14 @@ xfs_attr_set(
383 * Commit the last in the sequence of transactions. 382 * Commit the last in the sequence of transactions.
384 */ 383 */
385 xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); 384 xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
386 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 385 error = xfs_trans_commit(args.trans);
387 xfs_iunlock(dp, XFS_ILOCK_EXCL); 386 xfs_iunlock(dp, XFS_ILOCK_EXCL);
388 387
389 return error; 388 return error;
390 389
391out: 390out:
392 if (args.trans) { 391 if (args.trans)
393 xfs_trans_cancel(args.trans, 392 xfs_trans_cancel(args.trans);
394 XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
395 }
396 xfs_iunlock(dp, XFS_ILOCK_EXCL); 393 xfs_iunlock(dp, XFS_ILOCK_EXCL);
397 return error; 394 return error;
398} 395}
@@ -462,7 +459,7 @@ xfs_attr_remove(
462 error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, 459 error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
463 XFS_ATTRRM_SPACE_RES(mp), 0); 460 XFS_ATTRRM_SPACE_RES(mp), 0);
464 if (error) { 461 if (error) {
465 xfs_trans_cancel(args.trans, 0); 462 xfs_trans_cancel(args.trans);
466 return error; 463 return error;
467 } 464 }
468 465
@@ -501,16 +498,14 @@ xfs_attr_remove(
501 * Commit the last in the sequence of transactions. 498 * Commit the last in the sequence of transactions.
502 */ 499 */
503 xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); 500 xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
504 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 501 error = xfs_trans_commit(args.trans);
505 xfs_iunlock(dp, XFS_ILOCK_EXCL); 502 xfs_iunlock(dp, XFS_ILOCK_EXCL);
506 503
507 return error; 504 return error;
508 505
509out: 506out:
510 if (args.trans) { 507 if (args.trans)
511 xfs_trans_cancel(args.trans, 508 xfs_trans_cancel(args.trans);
512 XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
513 }
514 xfs_iunlock(dp, XFS_ILOCK_EXCL); 509 xfs_iunlock(dp, XFS_ILOCK_EXCL);
515 return error; 510 return error;
516} 511}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 04e79d57bca6..e9d401ce93bb 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -574,8 +574,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
574 * After the last attribute is removed revert to original inode format, 574 * After the last attribute is removed revert to original inode format,
575 * making all literal area available to the data fork once more. 575 * making all literal area available to the data fork once more.
576 */ 576 */
577STATIC void 577void
578xfs_attr_fork_reset( 578xfs_attr_fork_remove(
579 struct xfs_inode *ip, 579 struct xfs_inode *ip,
580 struct xfs_trans *tp) 580 struct xfs_trans *tp)
581{ 581{
@@ -641,7 +641,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
641 (mp->m_flags & XFS_MOUNT_ATTR2) && 641 (mp->m_flags & XFS_MOUNT_ATTR2) &&
642 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && 642 (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
643 !(args->op_flags & XFS_DA_OP_ADDNAME)) { 643 !(args->op_flags & XFS_DA_OP_ADDNAME)) {
644 xfs_attr_fork_reset(dp, args->trans); 644 xfs_attr_fork_remove(dp, args->trans);
645 } else { 645 } else {
646 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); 646 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
647 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); 647 dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
@@ -905,7 +905,7 @@ xfs_attr3_leaf_to_shortform(
905 if (forkoff == -1) { 905 if (forkoff == -1) {
906 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); 906 ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
907 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); 907 ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
908 xfs_attr_fork_reset(dp, args->trans); 908 xfs_attr_fork_remove(dp, args->trans);
909 goto out; 909 goto out;
910 } 910 }
911 911
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 025c4b820c03..882c8d338891 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -53,7 +53,7 @@ int xfs_attr_shortform_remove(struct xfs_da_args *args);
53int xfs_attr_shortform_list(struct xfs_attr_list_context *context); 53int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
54int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); 54int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
55int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); 55int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
56 56void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
57 57
58/* 58/*
59 * Internal routines when attribute fork size == XFS_LBSIZE(mp). 59 * Internal routines when attribute fork size == XFS_LBSIZE(mp).
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index d567159a3343..63e05b663380 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1112,7 +1112,6 @@ xfs_bmap_add_attrfork(
1112 int committed; /* xaction was committed */ 1112 int committed; /* xaction was committed */
1113 int logflags; /* logging flags */ 1113 int logflags; /* logging flags */
1114 int error; /* error return value */ 1114 int error; /* error return value */
1115 int cancel_flags = 0;
1116 1115
1117 ASSERT(XFS_IFORK_Q(ip) == 0); 1116 ASSERT(XFS_IFORK_Q(ip) == 0);
1118 1117
@@ -1124,17 +1123,15 @@ xfs_bmap_add_attrfork(
1124 tp->t_flags |= XFS_TRANS_RESERVE; 1123 tp->t_flags |= XFS_TRANS_RESERVE;
1125 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); 1124 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
1126 if (error) { 1125 if (error) {
1127 xfs_trans_cancel(tp, 0); 1126 xfs_trans_cancel(tp);
1128 return error; 1127 return error;
1129 } 1128 }
1130 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1131 xfs_ilock(ip, XFS_ILOCK_EXCL); 1129 xfs_ilock(ip, XFS_ILOCK_EXCL);
1132 error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? 1130 error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
1133 XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : 1131 XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
1134 XFS_QMOPT_RES_REGBLKS); 1132 XFS_QMOPT_RES_REGBLKS);
1135 if (error) 1133 if (error)
1136 goto trans_cancel; 1134 goto trans_cancel;
1137 cancel_flags |= XFS_TRANS_ABORT;
1138 if (XFS_IFORK_Q(ip)) 1135 if (XFS_IFORK_Q(ip))
1139 goto trans_cancel; 1136 goto trans_cancel;
1140 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { 1137 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
@@ -1218,14 +1215,14 @@ xfs_bmap_add_attrfork(
1218 error = xfs_bmap_finish(&tp, &flist, &committed); 1215 error = xfs_bmap_finish(&tp, &flist, &committed);
1219 if (error) 1216 if (error)
1220 goto bmap_cancel; 1217 goto bmap_cancel;
1221 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1218 error = xfs_trans_commit(tp);
1222 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1219 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1223 return error; 1220 return error;
1224 1221
1225bmap_cancel: 1222bmap_cancel:
1226 xfs_bmap_cancel(&flist); 1223 xfs_bmap_cancel(&flist);
1227trans_cancel: 1224trans_cancel:
1228 xfs_trans_cancel(tp, cancel_flags); 1225 xfs_trans_cancel(tp);
1229 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1226 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1230 return error; 1227 return error;
1231} 1228}
@@ -3224,12 +3221,24 @@ xfs_bmap_extsize_align(
3224 align_alen += temp; 3221 align_alen += temp;
3225 align_off -= temp; 3222 align_off -= temp;
3226 } 3223 }
3224
3225 /* Same adjustment for the end of the requested area. */
3226 temp = (align_alen % extsz);
3227 if (temp)
3228 align_alen += extsz - temp;
3229
3227 /* 3230 /*
3228 * Same adjustment for the end of the requested area. 3231 * For large extent hint sizes, the aligned extent might be larger than
3232 * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
3233 * the length back under MAXEXTLEN. The outer allocation loops handle
3234 * short allocation just fine, so it is safe to do this. We only want to
3235 * do it when we are forced to, though, because it means more allocation
3236 * operations are required.
3229 */ 3237 */
3230 if ((temp = (align_alen % extsz))) { 3238 while (align_alen > MAXEXTLEN)
3231 align_alen += extsz - temp; 3239 align_alen -= extsz;
3232 } 3240 ASSERT(align_alen <= MAXEXTLEN);
3241
3233 /* 3242 /*
3234 * If the previous block overlaps with this proposed allocation 3243 * If the previous block overlaps with this proposed allocation
3235 * then move the start forward without adjusting the length. 3244 * then move the start forward without adjusting the length.
@@ -3318,7 +3327,9 @@ xfs_bmap_extsize_align(
3318 return -EINVAL; 3327 return -EINVAL;
3319 } else { 3328 } else {
3320 ASSERT(orig_off >= align_off); 3329 ASSERT(orig_off >= align_off);
3321 ASSERT(orig_end <= align_off + align_alen); 3330 /* see MAXEXTLEN handling above */
3331 ASSERT(orig_end <= align_off + align_alen ||
3332 align_alen + extsz > MAXEXTLEN);
3322 } 3333 }
3323 3334
3324#ifdef DEBUG 3335#ifdef DEBUG
@@ -4100,13 +4111,6 @@ xfs_bmapi_reserve_delalloc(
4100 /* Figure out the extent size, adjust alen */ 4111 /* Figure out the extent size, adjust alen */
4101 extsz = xfs_get_extsz_hint(ip); 4112 extsz = xfs_get_extsz_hint(ip);
4102 if (extsz) { 4113 if (extsz) {
4103 /*
4104 * Make sure we don't exceed a single extent length when we
4105 * align the extent by reducing length we are going to
4106 * allocate by the maximum amount extent size aligment may
4107 * require.
4108 */
4109 alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
4110 error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, 4114 error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
4111 1, 0, &aoff, &alen); 4115 1, 0, &aoff, &alen);
4112 ASSERT(!error); 4116 ASSERT(!error);
@@ -4418,7 +4422,15 @@ xfs_bmapi_convert_unwritten(
4418 error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, 4422 error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
4419 &bma->cur, mval, bma->firstblock, bma->flist, 4423 &bma->cur, mval, bma->firstblock, bma->flist,
4420 &tmp_logflags); 4424 &tmp_logflags);
4421 bma->logflags |= tmp_logflags; 4425 /*
4426 * Log the inode core unconditionally in the unwritten extent conversion
4427 * path because the conversion might not have done so (e.g., if the
4428 * extent count hasn't changed). We need to make sure the inode is dirty
4429 * in the transaction for the sake of fsync(), even if nothing has
4430 * changed, because fsync() will not force the log for this transaction
4431 * unless it sees the inode pinned.
4432 */
4433 bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
4422 if (error) 4434 if (error)
4423 return error; 4435 return error;
4424 4436
@@ -5912,7 +5924,7 @@ xfs_bmap_split_extent(
5912 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, 5924 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
5913 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); 5925 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
5914 if (error) { 5926 if (error) {
5915 xfs_trans_cancel(tp, 0); 5927 xfs_trans_cancel(tp);
5916 return error; 5928 return error;
5917 } 5929 }
5918 5930
@@ -5930,10 +5942,9 @@ xfs_bmap_split_extent(
5930 if (error) 5942 if (error)
5931 goto out; 5943 goto out;
5932 5944
5933 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 5945 return xfs_trans_commit(tp);
5934
5935 5946
5936out: 5947out:
5937 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 5948 xfs_trans_cancel(tp);
5938 return error; 5949 return error;
5939} 5950}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 487a6e0d0103..a0ae572051de 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -170,7 +170,7 @@ typedef struct xfs_sb {
170 __uint32_t sb_features_log_incompat; 170 __uint32_t sb_features_log_incompat;
171 171
172 __uint32_t sb_crc; /* superblock crc */ 172 __uint32_t sb_crc; /* superblock crc */
173 __uint32_t sb_pad; 173 xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */
174 174
175 xfs_ino_t sb_pquotino; /* project quota inode */ 175 xfs_ino_t sb_pquotino; /* project quota inode */
176 xfs_lsn_t sb_lsn; /* last write sequence */ 176 xfs_lsn_t sb_lsn; /* last write sequence */
@@ -256,7 +256,7 @@ typedef struct xfs_dsb {
256 __be32 sb_features_log_incompat; 256 __be32 sb_features_log_incompat;
257 257
258 __le32 sb_crc; /* superblock crc */ 258 __le32 sb_crc; /* superblock crc */
259 __be32 sb_pad; 259 __be32 sb_spino_align; /* sparse inode chunk alignment */
260 260
261 __be64 sb_pquotino; /* project quota inode */ 261 __be64 sb_pquotino; /* project quota inode */
262 __be64 sb_lsn; /* last write sequence */ 262 __be64 sb_lsn; /* last write sequence */
@@ -457,8 +457,10 @@ xfs_sb_has_ro_compat_feature(
457} 457}
458 458
459#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ 459#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
460#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
460#define XFS_SB_FEAT_INCOMPAT_ALL \ 461#define XFS_SB_FEAT_INCOMPAT_ALL \
461 (XFS_SB_FEAT_INCOMPAT_FTYPE) 462 (XFS_SB_FEAT_INCOMPAT_FTYPE| \
463 XFS_SB_FEAT_INCOMPAT_SPINODES)
462 464
463#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL 465#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
464static inline bool 466static inline bool
@@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
506 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); 508 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
507} 509}
508 510
511static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
512{
513 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
514 xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
515}
516
509/* 517/*
510 * end of superblock version macros 518 * end of superblock version macros
511 */ 519 */
@@ -1203,26 +1211,54 @@ typedef __uint64_t xfs_inofree_t;
1203#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) 1211#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
1204#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) 1212#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
1205 1213
1214#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */
1215#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t))
1216#define XFS_INODES_PER_HOLEMASK_BIT \
1217 (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t)))
1218
1206static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) 1219static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
1207{ 1220{
1208 return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; 1221 return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
1209} 1222}
1210 1223
1211/* 1224/*
1212 * Data record structure 1225 * The on-disk inode record structure has two formats. The original "full"
1226 * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
1227 * and replaces the 3 high-order freecount bytes with the holemask and inode
1228 * count.
1229 *
1230 * The holemask of the sparse record format allows an inode chunk to have holes
1231 * that refer to blocks not owned by the inode record. This facilitates inode
1232 * allocation in the event of severe free space fragmentation.
1213 */ 1233 */
1214typedef struct xfs_inobt_rec { 1234typedef struct xfs_inobt_rec {
1215 __be32 ir_startino; /* starting inode number */ 1235 __be32 ir_startino; /* starting inode number */
1216 __be32 ir_freecount; /* count of free inodes (set bits) */ 1236 union {
1237 struct {
1238 __be32 ir_freecount; /* count of free inodes */
1239 } f;
1240 struct {
1241 __be16 ir_holemask;/* hole mask for sparse chunks */
1242 __u8 ir_count; /* total inode count */
1243 __u8 ir_freecount; /* count of free inodes */
1244 } sp;
1245 } ir_u;
1217 __be64 ir_free; /* free inode mask */ 1246 __be64 ir_free; /* free inode mask */
1218} xfs_inobt_rec_t; 1247} xfs_inobt_rec_t;
1219 1248
1220typedef struct xfs_inobt_rec_incore { 1249typedef struct xfs_inobt_rec_incore {
1221 xfs_agino_t ir_startino; /* starting inode number */ 1250 xfs_agino_t ir_startino; /* starting inode number */
1222 __int32_t ir_freecount; /* count of free inodes (set bits) */ 1251 __uint16_t ir_holemask; /* hole mask for sparse chunks */
1252 __uint8_t ir_count; /* total inode count */
1253 __uint8_t ir_freecount; /* count of free inodes (set bits) */
1223 xfs_inofree_t ir_free; /* free inode mask */ 1254 xfs_inofree_t ir_free; /* free inode mask */
1224} xfs_inobt_rec_incore_t; 1255} xfs_inobt_rec_incore_t;
1225 1256
1257static inline bool xfs_inobt_issparse(uint16_t holemask)
1258{
1259 /* non-zero holemask represents a sparse rec. */
1260 return holemask;
1261}
1226 1262
1227/* 1263/*
1228 * Key structure 1264 * Key structure
@@ -1440,8 +1476,8 @@ struct xfs_acl {
1440 sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) 1476 sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
1441 1477
1442/* On-disk XFS extended attribute names */ 1478/* On-disk XFS extended attribute names */
1443#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" 1479#define SGI_ACL_FILE "SGI_ACL_FILE"
1444#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" 1480#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
1445#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) 1481#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
1446#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) 1482#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
1447 1483
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca19f..89689c6a43e2 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
239#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ 239#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
240#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ 240#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
241#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ 241#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
242#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
242 243
243/* 244/*
244 * Minimum and maximum sizes need for growth checks. 245 * Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 07349a183a11..66efc702452a 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -65,6 +65,8 @@ xfs_inobt_lookup(
65 int *stat) /* success/failure */ 65 int *stat) /* success/failure */
66{ 66{
67 cur->bc_rec.i.ir_startino = ino; 67 cur->bc_rec.i.ir_startino = ino;
68 cur->bc_rec.i.ir_holemask = 0;
69 cur->bc_rec.i.ir_count = 0;
68 cur->bc_rec.i.ir_freecount = 0; 70 cur->bc_rec.i.ir_freecount = 0;
69 cur->bc_rec.i.ir_free = 0; 71 cur->bc_rec.i.ir_free = 0;
70 return xfs_btree_lookup(cur, dir, stat); 72 return xfs_btree_lookup(cur, dir, stat);
@@ -82,7 +84,14 @@ xfs_inobt_update(
82 union xfs_btree_rec rec; 84 union xfs_btree_rec rec;
83 85
84 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); 86 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
85 rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); 87 if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
88 rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
89 rec.inobt.ir_u.sp.ir_count = irec->ir_count;
90 rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
91 } else {
92 /* ir_holemask/ir_count not supported on-disk */
93 rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
94 }
86 rec.inobt.ir_free = cpu_to_be64(irec->ir_free); 95 rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
87 return xfs_btree_update(cur, &rec); 96 return xfs_btree_update(cur, &rec);
88} 97}
@@ -100,12 +109,27 @@ xfs_inobt_get_rec(
100 int error; 109 int error;
101 110
102 error = xfs_btree_get_rec(cur, &rec, stat); 111 error = xfs_btree_get_rec(cur, &rec, stat);
103 if (!error && *stat == 1) { 112 if (error || *stat == 0)
104 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); 113 return error;
105 irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); 114
106 irec->ir_free = be64_to_cpu(rec->inobt.ir_free); 115 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
116 if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
117 irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
118 irec->ir_count = rec->inobt.ir_u.sp.ir_count;
119 irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
120 } else {
121 /*
122 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
123 * values for full inode chunks.
124 */
125 irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
126 irec->ir_count = XFS_INODES_PER_CHUNK;
127 irec->ir_freecount =
128 be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
107 } 129 }
108 return error; 130 irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
131
132 return 0;
109} 133}
110 134
111/* 135/*
@@ -114,10 +138,14 @@ xfs_inobt_get_rec(
114STATIC int 138STATIC int
115xfs_inobt_insert_rec( 139xfs_inobt_insert_rec(
116 struct xfs_btree_cur *cur, 140 struct xfs_btree_cur *cur,
141 __uint16_t holemask,
142 __uint8_t count,
117 __int32_t freecount, 143 __int32_t freecount,
118 xfs_inofree_t free, 144 xfs_inofree_t free,
119 int *stat) 145 int *stat)
120{ 146{
147 cur->bc_rec.i.ir_holemask = holemask;
148 cur->bc_rec.i.ir_count = count;
121 cur->bc_rec.i.ir_freecount = freecount; 149 cur->bc_rec.i.ir_freecount = freecount;
122 cur->bc_rec.i.ir_free = free; 150 cur->bc_rec.i.ir_free = free;
123 return xfs_btree_insert(cur, stat); 151 return xfs_btree_insert(cur, stat);
@@ -154,7 +182,9 @@ xfs_inobt_insert(
154 } 182 }
155 ASSERT(i == 0); 183 ASSERT(i == 0);
156 184
157 error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, 185 error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
186 XFS_INODES_PER_CHUNK,
187 XFS_INODES_PER_CHUNK,
158 XFS_INOBT_ALL_FREE, &i); 188 XFS_INOBT_ALL_FREE, &i);
159 if (error) { 189 if (error) {
160 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 190 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -220,6 +250,7 @@ xfs_ialloc_inode_init(
220 struct xfs_mount *mp, 250 struct xfs_mount *mp,
221 struct xfs_trans *tp, 251 struct xfs_trans *tp,
222 struct list_head *buffer_list, 252 struct list_head *buffer_list,
253 int icount,
223 xfs_agnumber_t agno, 254 xfs_agnumber_t agno,
224 xfs_agblock_t agbno, 255 xfs_agblock_t agbno,
225 xfs_agblock_t length, 256 xfs_agblock_t length,
@@ -275,7 +306,7 @@ xfs_ialloc_inode_init(
275 * they track in the AIL as if they were physically logged. 306 * they track in the AIL as if they were physically logged.
276 */ 307 */
277 if (tp) 308 if (tp)
278 xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, 309 xfs_icreate_log(tp, agno, agbno, icount,
279 mp->m_sb.sb_inodesize, length, gen); 310 mp->m_sb.sb_inodesize, length, gen);
280 } else 311 } else
281 version = 2; 312 version = 2;
@@ -347,6 +378,214 @@ xfs_ialloc_inode_init(
347} 378}
348 379
349/* 380/*
381 * Align startino and allocmask for a recently allocated sparse chunk such that
382 * they are fit for insertion (or merge) into the on-disk inode btrees.
383 *
384 * Background:
385 *
386 * When enabled, sparse inode support increases the inode alignment from cluster
387 * size to inode chunk size. This means that the minimum range between two
388 * non-adjacent inode records in the inobt is large enough for a full inode
389 * record. This allows for cluster sized, cluster aligned block allocation
390 * without need to worry about whether the resulting inode record overlaps with
391 * another record in the tree. Without this basic rule, we would have to deal
392 * with the consequences of overlap by potentially undoing recent allocations in
393 * the inode allocation codepath.
394 *
395 * Because of this alignment rule (which is enforced on mount), there are two
396 * inobt possibilities for newly allocated sparse chunks. One is that the
397 * aligned inode record for the chunk covers a range of inodes not already
398 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
399 * other is that a record already exists at the aligned startino that considers
400 * the newly allocated range as sparse. In the latter case, record content is
401 * merged in hope that sparse inode chunks fill to full chunks over time.
402 */
403STATIC void
404xfs_align_sparse_ino(
405 struct xfs_mount *mp,
406 xfs_agino_t *startino,
407 uint16_t *allocmask)
408{
409 xfs_agblock_t agbno;
410 xfs_agblock_t mod;
411 int offset;
412
413 agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
414 mod = agbno % mp->m_sb.sb_inoalignmt;
415 if (!mod)
416 return;
417
418 /* calculate the inode offset and align startino */
419 offset = mod << mp->m_sb.sb_inopblog;
420 *startino -= offset;
421
422 /*
423 * Since startino has been aligned down, left shift allocmask such that
424 * it continues to represent the same physical inodes relative to the
425 * new startino.
426 */
427 *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
428}
429
430/*
431 * Determine whether the source inode record can merge into the target. Both
432 * records must be sparse, the inode ranges must match and there must be no
433 * allocation overlap between the records.
434 */
435STATIC bool
436__xfs_inobt_can_merge(
437 struct xfs_inobt_rec_incore *trec, /* tgt record */
438 struct xfs_inobt_rec_incore *srec) /* src record */
439{
440 uint64_t talloc;
441 uint64_t salloc;
442
443 /* records must cover the same inode range */
444 if (trec->ir_startino != srec->ir_startino)
445 return false;
446
447 /* both records must be sparse */
448 if (!xfs_inobt_issparse(trec->ir_holemask) ||
449 !xfs_inobt_issparse(srec->ir_holemask))
450 return false;
451
452 /* both records must track some inodes */
453 if (!trec->ir_count || !srec->ir_count)
454 return false;
455
456 /* can't exceed capacity of a full record */
457 if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
458 return false;
459
460 /* verify there is no allocation overlap */
461 talloc = xfs_inobt_irec_to_allocmask(trec);
462 salloc = xfs_inobt_irec_to_allocmask(srec);
463 if (talloc & salloc)
464 return false;
465
466 return true;
467}
468
469/*
470 * Merge the source inode record into the target. The caller must call
471 * __xfs_inobt_can_merge() to ensure the merge is valid.
472 */
473STATIC void
474__xfs_inobt_rec_merge(
475 struct xfs_inobt_rec_incore *trec, /* target */
476 struct xfs_inobt_rec_incore *srec) /* src */
477{
478 ASSERT(trec->ir_startino == srec->ir_startino);
479
480 /* combine the counts */
481 trec->ir_count += srec->ir_count;
482 trec->ir_freecount += srec->ir_freecount;
483
484 /*
485 * Merge the holemask and free mask. For both fields, 0 bits refer to
486 * allocated inodes. We combine the allocated ranges with bitwise AND.
487 */
488 trec->ir_holemask &= srec->ir_holemask;
489 trec->ir_free &= srec->ir_free;
490}
491
492/*
493 * Insert a new sparse inode chunk into the associated inode btree. The inode
494 * record for the sparse chunk is pre-aligned to a startino that should match
495 * any pre-existing sparse inode record in the tree. This allows sparse chunks
496 * to fill over time.
497 *
498 * This function supports two modes of handling preexisting records depending on
499 * the merge flag. If merge is true, the provided record is merged with the
500 * existing record and updated in place. The merged record is returned in nrec.
501 * If merge is false, an existing record is replaced with the provided record.
502 * If no preexisting record exists, the provided record is always inserted.
503 *
504 * It is considered corruption if a merge is requested and not possible. Given
505 * the sparse inode alignment constraints, this should never happen.
506 */
507STATIC int
508xfs_inobt_insert_sprec(
509 struct xfs_mount *mp,
510 struct xfs_trans *tp,
511 struct xfs_buf *agbp,
512 int btnum,
513 struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
514 bool merge) /* merge or replace */
515{
516 struct xfs_btree_cur *cur;
517 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
518 xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
519 int error;
520 int i;
521 struct xfs_inobt_rec_incore rec;
522
523 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
524
525 /* the new record is pre-aligned so we know where to look */
526 error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
527 if (error)
528 goto error;
529 /* if nothing there, insert a new record and return */
530 if (i == 0) {
531 error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
532 nrec->ir_count, nrec->ir_freecount,
533 nrec->ir_free, &i);
534 if (error)
535 goto error;
536 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
537
538 goto out;
539 }
540
541 /*
542 * A record exists at this startino. Merge or replace the record
543 * depending on what we've been asked to do.
544 */
545 if (merge) {
546 error = xfs_inobt_get_rec(cur, &rec, &i);
547 if (error)
548 goto error;
549 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
550 XFS_WANT_CORRUPTED_GOTO(mp,
551 rec.ir_startino == nrec->ir_startino,
552 error);
553
554 /*
555 * This should never fail. If we have coexisting records that
556 * cannot merge, something is seriously wrong.
557 */
558 XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
559 error);
560
561 trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
562 rec.ir_holemask, nrec->ir_startino,
563 nrec->ir_holemask);
564
565 /* merge to nrec to output the updated record */
566 __xfs_inobt_rec_merge(nrec, &rec);
567
568 trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
569 nrec->ir_holemask);
570
571 error = xfs_inobt_rec_check_count(mp, nrec);
572 if (error)
573 goto error;
574 }
575
576 error = xfs_inobt_update(cur, nrec);
577 if (error)
578 goto error;
579
580out:
581 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
582 return 0;
583error:
584 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
585 return error;
586}
587
588/*
350 * Allocate new inodes in the allocation group specified by agbp. 589 * Allocate new inodes in the allocation group specified by agbp.
351 * Return 0 for success, else error code. 590 * Return 0 for success, else error code.
352 */ 591 */
@@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc(
364 xfs_agino_t newlen; /* new number of inodes */ 603 xfs_agino_t newlen; /* new number of inodes */
365 int isaligned = 0; /* inode allocation at stripe unit */ 604 int isaligned = 0; /* inode allocation at stripe unit */
366 /* boundary */ 605 /* boundary */
606 uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */
607 struct xfs_inobt_rec_incore rec;
367 struct xfs_perag *pag; 608 struct xfs_perag *pag;
609 int do_sparse = 0;
368 610
369 memset(&args, 0, sizeof(args)); 611 memset(&args, 0, sizeof(args));
370 args.tp = tp; 612 args.tp = tp;
371 args.mp = tp->t_mountp; 613 args.mp = tp->t_mountp;
614 args.fsbno = NULLFSBLOCK;
615
616#ifdef DEBUG
617 /* randomly do sparse inode allocations */
618 if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
619 args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
620 do_sparse = prandom_u32() & 1;
621#endif
372 622
373 /* 623 /*
374 * Locking will ensure that we don't have two callers in here 624 * Locking will ensure that we don't have two callers in here
@@ -376,7 +626,7 @@ xfs_ialloc_ag_alloc(
376 */ 626 */
377 newlen = args.mp->m_ialloc_inos; 627 newlen = args.mp->m_ialloc_inos;
378 if (args.mp->m_maxicount && 628 if (args.mp->m_maxicount &&
379 percpu_counter_read(&args.mp->m_icount) + newlen > 629 percpu_counter_read_positive(&args.mp->m_icount) + newlen >
380 args.mp->m_maxicount) 630 args.mp->m_maxicount)
381 return -ENOSPC; 631 return -ENOSPC;
382 args.minlen = args.maxlen = args.mp->m_ialloc_blks; 632 args.minlen = args.maxlen = args.mp->m_ialloc_blks;
@@ -390,6 +640,8 @@ xfs_ialloc_ag_alloc(
390 agno = be32_to_cpu(agi->agi_seqno); 640 agno = be32_to_cpu(agi->agi_seqno);
391 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 641 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
392 args.mp->m_ialloc_blks; 642 args.mp->m_ialloc_blks;
643 if (do_sparse)
644 goto sparse_alloc;
393 if (likely(newino != NULLAGINO && 645 if (likely(newino != NULLAGINO &&
394 (args.agbno < be32_to_cpu(agi->agi_length)))) { 646 (args.agbno < be32_to_cpu(agi->agi_length)))) {
395 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); 647 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -428,8 +680,7 @@ xfs_ialloc_ag_alloc(
428 * subsequent requests. 680 * subsequent requests.
429 */ 681 */
430 args.minalignslop = 0; 682 args.minalignslop = 0;
431 } else 683 }
432 args.fsbno = NULLFSBLOCK;
433 684
434 if (unlikely(args.fsbno == NULLFSBLOCK)) { 685 if (unlikely(args.fsbno == NULLFSBLOCK)) {
435 /* 686 /*
@@ -480,6 +731,47 @@ xfs_ialloc_ag_alloc(
480 return error; 731 return error;
481 } 732 }
482 733
734 /*
735 * Finally, try a sparse allocation if the filesystem supports it and
736 * the sparse allocation length is smaller than a full chunk.
737 */
738 if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
739 args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
740 args.fsbno == NULLFSBLOCK) {
741sparse_alloc:
742 args.type = XFS_ALLOCTYPE_NEAR_BNO;
743 args.agbno = be32_to_cpu(agi->agi_root);
744 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
745 args.alignment = args.mp->m_sb.sb_spino_align;
746 args.prod = 1;
747
748 args.minlen = args.mp->m_ialloc_min_blks;
749 args.maxlen = args.minlen;
750
751 /*
752 * The inode record will be aligned to full chunk size. We must
753 * prevent sparse allocation from AG boundaries that result in
754 * invalid inode records, such as records that start at agbno 0
755 * or extend beyond the AG.
756 *
757 * Set min agbno to the first aligned, non-zero agbno and max to
758 * the last aligned agbno that is at least one full chunk from
759 * the end of the AG.
760 */
761 args.min_agbno = args.mp->m_sb.sb_inoalignmt;
762 args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
763 args.mp->m_sb.sb_inoalignmt) -
764 args.mp->m_ialloc_blks;
765
766 error = xfs_alloc_vextent(&args);
767 if (error)
768 return error;
769
770 newlen = args.len << args.mp->m_sb.sb_inopblog;
771 ASSERT(newlen <= XFS_INODES_PER_CHUNK);
772 allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
773 }
774
483 if (args.fsbno == NULLFSBLOCK) { 775 if (args.fsbno == NULLFSBLOCK) {
484 *alloc = 0; 776 *alloc = 0;
485 return 0; 777 return 0;
@@ -495,8 +787,8 @@ xfs_ialloc_ag_alloc(
495 * rather than a linear progression to prevent the next generation 787 * rather than a linear progression to prevent the next generation
496 * number from being easily guessable. 788 * number from being easily guessable.
497 */ 789 */
498 error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, 790 error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
499 args.len, prandom_u32()); 791 args.agbno, args.len, prandom_u32());
500 792
501 if (error) 793 if (error)
502 return error; 794 return error;
@@ -504,6 +796,73 @@ xfs_ialloc_ag_alloc(
504 * Convert the results. 796 * Convert the results.
505 */ 797 */
506 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); 798 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
799
800 if (xfs_inobt_issparse(~allocmask)) {
801 /*
802 * We've allocated a sparse chunk. Align the startino and mask.
803 */
804 xfs_align_sparse_ino(args.mp, &newino, &allocmask);
805
806 rec.ir_startino = newino;
807 rec.ir_holemask = ~allocmask;
808 rec.ir_count = newlen;
809 rec.ir_freecount = newlen;
810 rec.ir_free = XFS_INOBT_ALL_FREE;
811
812 /*
813 * Insert the sparse record into the inobt and allow for a merge
814 * if necessary. If a merge does occur, rec is updated to the
815 * merged record.
816 */
817 error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
818 &rec, true);
819 if (error == -EFSCORRUPTED) {
820 xfs_alert(args.mp,
821 "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
822 XFS_AGINO_TO_INO(args.mp, agno,
823 rec.ir_startino),
824 rec.ir_holemask, rec.ir_count);
825 xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
826 }
827 if (error)
828 return error;
829
830 /*
831 * We can't merge the part we've just allocated as we did for the inobt
832 * due to finobt semantics. The original record may or may not
833 * exist independent of whether physical inodes exist in this
834 * sparse chunk.
835 *
836 * We must update the finobt record based on the inobt record.
837 * rec contains the fully merged and up to date inobt record
838 * from the previous call. Set merge false to replace any
839 * existing record with this one.
840 */
841 if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
842 error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
843 XFS_BTNUM_FINO, &rec,
844 false);
845 if (error)
846 return error;
847 }
848 } else {
849 /* full chunk - insert new records to both btrees */
850 error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
851 XFS_BTNUM_INO);
852 if (error)
853 return error;
854
855 if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
856 error = xfs_inobt_insert(args.mp, tp, agbp, newino,
857 newlen, XFS_BTNUM_FINO);
858 if (error)
859 return error;
860 }
861 }
862
863 /*
864 * Update AGI counts and newino.
865 */
507 be32_add_cpu(&agi->agi_count, newlen); 866 be32_add_cpu(&agi->agi_count, newlen);
508 be32_add_cpu(&agi->agi_freecount, newlen); 867 be32_add_cpu(&agi->agi_freecount, newlen);
509 pag = xfs_perag_get(args.mp, agno); 868 pag = xfs_perag_get(args.mp, agno);
@@ -512,20 +871,6 @@ xfs_ialloc_ag_alloc(
512 agi->agi_newino = cpu_to_be32(newino); 871 agi->agi_newino = cpu_to_be32(newino);
513 872
514 /* 873 /*
515 * Insert records describing the new inode chunk into the btrees.
516 */
517 error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
518 XFS_BTNUM_INO);
519 if (error)
520 return error;
521
522 if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
523 error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
524 XFS_BTNUM_FINO);
525 if (error)
526 return error;
527 }
528 /*
529 * Log allocation group header fields 874 * Log allocation group header fields
530 */ 875 */
531 xfs_ialloc_log_agi(tp, agbp, 876 xfs_ialloc_log_agi(tp, agbp,
@@ -645,7 +990,7 @@ xfs_ialloc_ag_select(
645 * if we fail allocation due to alignment issues then it is most 990 * if we fail allocation due to alignment issues then it is most
646 * likely a real ENOSPC condition. 991 * likely a real ENOSPC condition.
647 */ 992 */
648 ineed = mp->m_ialloc_blks; 993 ineed = mp->m_ialloc_min_blks;
649 if (flags && ineed > 1) 994 if (flags && ineed > 1)
650 ineed += xfs_ialloc_cluster_alignment(mp); 995 ineed += xfs_ialloc_cluster_alignment(mp);
651 longest = pag->pagf_longest; 996 longest = pag->pagf_longest;
@@ -732,6 +1077,27 @@ xfs_ialloc_get_rec(
732} 1077}
733 1078
734/* 1079/*
1080 * Return the offset of the first free inode in the record. If the inode chunk
1081 * is sparsely allocated, we convert the record holemask to inode granularity
1082 * and mask off the unallocated regions from the inode free mask.
1083 */
1084STATIC int
1085xfs_inobt_first_free_inode(
1086 struct xfs_inobt_rec_incore *rec)
1087{
1088 xfs_inofree_t realfree;
1089
1090 /* if there are no holes, return the first available offset */
1091 if (!xfs_inobt_issparse(rec->ir_holemask))
1092 return xfs_lowbit64(rec->ir_free);
1093
1094 realfree = xfs_inobt_irec_to_allocmask(rec);
1095 realfree &= rec->ir_free;
1096
1097 return xfs_lowbit64(realfree);
1098}
1099
1100/*
735 * Allocate an inode using the inobt-only algorithm. 1101 * Allocate an inode using the inobt-only algorithm.
736 */ 1102 */
737STATIC int 1103STATIC int
@@ -961,7 +1327,7 @@ newino:
961 } 1327 }
962 1328
963alloc_inode: 1329alloc_inode:
964 offset = xfs_lowbit64(rec.ir_free); 1330 offset = xfs_inobt_first_free_inode(&rec);
965 ASSERT(offset >= 0); 1331 ASSERT(offset >= 0);
966 ASSERT(offset < XFS_INODES_PER_CHUNK); 1332 ASSERT(offset < XFS_INODES_PER_CHUNK);
967 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % 1333 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1210,7 +1576,7 @@ xfs_dialloc_ag(
1210 if (error) 1576 if (error)
1211 goto error_cur; 1577 goto error_cur;
1212 1578
1213 offset = xfs_lowbit64(rec.ir_free); 1579 offset = xfs_inobt_first_free_inode(&rec);
1214 ASSERT(offset >= 0); 1580 ASSERT(offset >= 0);
1215 ASSERT(offset < XFS_INODES_PER_CHUNK); 1581 ASSERT(offset < XFS_INODES_PER_CHUNK);
1216 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % 1582 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1339,10 +1705,13 @@ xfs_dialloc(
1339 * If we have already hit the ceiling of inode blocks then clear 1705 * If we have already hit the ceiling of inode blocks then clear
1340 * okalloc so we scan all available agi structures for a free 1706 * okalloc so we scan all available agi structures for a free
1341 * inode. 1707 * inode.
1708 *
1709 * Read a rough value of mp->m_icount with percpu_counter_read_positive(),
1710 * which sacrifices some precision but improves performance.
1342 */ 1711 */
1343 if (mp->m_maxicount && 1712 if (mp->m_maxicount &&
1344 percpu_counter_read(&mp->m_icount) + mp->m_ialloc_inos > 1713 percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos
1345 mp->m_maxicount) { 1714 > mp->m_maxicount) {
1346 noroom = 1; 1715 noroom = 1;
1347 okalloc = 0; 1716 okalloc = 0;
1348 } 1717 }
@@ -1436,6 +1805,83 @@ out_error:
1436 return error; 1805 return error;
1437} 1806}
1438 1807
1808/*
1809 * Free the blocks of an inode chunk. We must consider that the inode chunk
1810 * might be sparse and only free the regions that are allocated as part of the
1811 * chunk.
1812 */
1813STATIC void
1814xfs_difree_inode_chunk(
1815 struct xfs_mount *mp,
1816 xfs_agnumber_t agno,
1817 struct xfs_inobt_rec_incore *rec,
1818 struct xfs_bmap_free *flist)
1819{
1820 xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
1821 int startidx, endidx;
1822 int nextbit;
1823 xfs_agblock_t agbno;
1824 int contigblk;
1825 DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
1826
1827 if (!xfs_inobt_issparse(rec->ir_holemask)) {
1828 /* not sparse, calculate extent info directly */
1829 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
1830 XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
1831 mp->m_ialloc_blks, flist, mp);
1832 return;
1833 }
1834
1835 /* holemask is only 16-bits (fits in an unsigned long) */
1836 ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
1837 holemask[0] = rec->ir_holemask;
1838
1839 /*
1840 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
1841 * holemask and convert the start/end index of each range to an extent.
1842 * We start with the start and end index both pointing at the first 0 in
1843 * the mask.
1844 */
1845 startidx = endidx = find_first_zero_bit(holemask,
1846 XFS_INOBT_HOLEMASK_BITS);
1847 nextbit = startidx + 1;
1848 while (startidx < XFS_INOBT_HOLEMASK_BITS) {
1849 nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
1850 nextbit);
1851 /*
1852 * If the next zero bit is contiguous, update the end index of
1853 * the current range and continue.
1854 */
1855 if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
1856 nextbit == endidx + 1) {
1857 endidx = nextbit;
1858 goto next;
1859 }
1860
1861 /*
1862 * nextbit is not contiguous with the current end index. Convert
1863 * the current start/end to an extent and add it to the free
1864 * list.
1865 */
1866 agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
1867 mp->m_sb.sb_inopblock;
1868 contigblk = ((endidx - startidx + 1) *
1869 XFS_INODES_PER_HOLEMASK_BIT) /
1870 mp->m_sb.sb_inopblock;
1871
1872 ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
1873 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
1874 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
1875 flist, mp);
1876
1877 /* reset range to current bit and carry on... */
1878 startidx = endidx = nextbit;
1879
1880next:
1881 nextbit++;
1882 }
1883}
1884
1439STATIC int 1885STATIC int
1440xfs_difree_inobt( 1886xfs_difree_inobt(
1441 struct xfs_mount *mp, 1887 struct xfs_mount *mp,
@@ -1443,8 +1889,7 @@ xfs_difree_inobt(
1443 struct xfs_buf *agbp, 1889 struct xfs_buf *agbp,
1444 xfs_agino_t agino, 1890 xfs_agino_t agino,
1445 struct xfs_bmap_free *flist, 1891 struct xfs_bmap_free *flist,
1446 int *deleted, 1892 struct xfs_icluster *xic,
1447 xfs_ino_t *first_ino,
1448 struct xfs_inobt_rec_incore *orec) 1893 struct xfs_inobt_rec_incore *orec)
1449{ 1894{
1450 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); 1895 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
@@ -1498,20 +1943,23 @@ xfs_difree_inobt(
1498 rec.ir_freecount++; 1943 rec.ir_freecount++;
1499 1944
1500 /* 1945 /*
1501 * When an inode cluster is free, it becomes eligible for removal 1946 * When an inode chunk is free, it becomes eligible for removal. Don't
1947 * remove the chunk if the block size is large enough for multiple inode
1948 * chunks (that might not be free).
1502 */ 1949 */
1503 if (!(mp->m_flags & XFS_MOUNT_IKEEP) && 1950 if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1504 (rec.ir_freecount == mp->m_ialloc_inos)) { 1951 rec.ir_free == XFS_INOBT_ALL_FREE &&
1505 1952 mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
1506 *deleted = 1; 1953 xic->deleted = 1;
1507 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); 1954 xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
1955 xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
1508 1956
1509 /* 1957 /*
1510 * Remove the inode cluster from the AGI B+Tree, adjust the 1958 * Remove the inode cluster from the AGI B+Tree, adjust the
1511 * AGI and Superblock inode counts, and mark the disk space 1959 * AGI and Superblock inode counts, and mark the disk space
1512 * to be freed when the transaction is committed. 1960 * to be freed when the transaction is committed.
1513 */ 1961 */
1514 ilen = mp->m_ialloc_inos; 1962 ilen = rec.ir_freecount;
1515 be32_add_cpu(&agi->agi_count, -ilen); 1963 be32_add_cpu(&agi->agi_count, -ilen);
1516 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1964 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1517 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1965 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1527,11 +1975,9 @@ xfs_difree_inobt(
1527 goto error0; 1975 goto error0;
1528 } 1976 }
1529 1977
1530 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, 1978 xfs_difree_inode_chunk(mp, agno, &rec, flist);
1531 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
1532 mp->m_ialloc_blks, flist, mp);
1533 } else { 1979 } else {
1534 *deleted = 0; 1980 xic->deleted = 0;
1535 1981
1536 error = xfs_inobt_update(cur, &rec); 1982 error = xfs_inobt_update(cur, &rec);
1537 if (error) { 1983 if (error) {
@@ -1596,7 +2042,9 @@ xfs_difree_finobt(
1596 */ 2042 */
1597 XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); 2043 XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
1598 2044
1599 error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, 2045 error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
2046 ibtrec->ir_count,
2047 ibtrec->ir_freecount,
1600 ibtrec->ir_free, &i); 2048 ibtrec->ir_free, &i);
1601 if (error) 2049 if (error)
1602 goto error; 2050 goto error;
@@ -1631,8 +2079,13 @@ xfs_difree_finobt(
1631 * free inode. Hence, if all of the inodes are free and we aren't 2079 * free inode. Hence, if all of the inodes are free and we aren't
1632 * keeping inode chunks permanently on disk, remove the record. 2080 * keeping inode chunks permanently on disk, remove the record.
1633 * Otherwise, update the record with the new information. 2081 * Otherwise, update the record with the new information.
2082 *
2083 * Note that we currently can't free chunks when the block size is large
2084 * enough for multiple chunks. Leave the finobt record to remain in sync
2085 * with the inobt.
1634 */ 2086 */
1635 if (rec.ir_freecount == mp->m_ialloc_inos && 2087 if (rec.ir_free == XFS_INOBT_ALL_FREE &&
2088 mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
1636 !(mp->m_flags & XFS_MOUNT_IKEEP)) { 2089 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
1637 error = xfs_btree_delete(cur, &i); 2090 error = xfs_btree_delete(cur, &i);
1638 if (error) 2091 if (error)
@@ -1668,8 +2121,7 @@ xfs_difree(
1668 struct xfs_trans *tp, /* transaction pointer */ 2121 struct xfs_trans *tp, /* transaction pointer */
1669 xfs_ino_t inode, /* inode to be freed */ 2122 xfs_ino_t inode, /* inode to be freed */
1670 struct xfs_bmap_free *flist, /* extents to free */ 2123 struct xfs_bmap_free *flist, /* extents to free */
1671 int *deleted,/* set if inode cluster was deleted */ 2124 struct xfs_icluster *xic) /* cluster info if deleted */
1672 xfs_ino_t *first_ino)/* first inode in deleted cluster */
1673{ 2125{
1674 /* REFERENCED */ 2126 /* REFERENCED */
1675 xfs_agblock_t agbno; /* block number containing inode */ 2127 xfs_agblock_t agbno; /* block number containing inode */
@@ -1720,8 +2172,7 @@ xfs_difree(
1720 /* 2172 /*
1721 * Fix up the inode allocation btree. 2173 * Fix up the inode allocation btree.
1722 */ 2174 */
1723 error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, 2175 error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
1724 &rec);
1725 if (error) 2176 if (error)
1726 goto error0; 2177 goto error0;
1727 2178
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 100007d56449..12401fea7bff 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -28,6 +28,13 @@ struct xfs_btree_cur;
28/* Move inodes in clusters of this size */ 28/* Move inodes in clusters of this size */
29#define XFS_INODE_BIG_CLUSTER_SIZE 8192 29#define XFS_INODE_BIG_CLUSTER_SIZE 8192
30 30
31struct xfs_icluster {
32 bool deleted; /* record is deleted */
33 xfs_ino_t first_ino; /* first inode number */
34 uint64_t alloc; /* inode phys. allocation bitmap for
35 * sparse chunks */
36};
37
31/* Calculate and return the number of filesystem blocks per inode cluster */ 38/* Calculate and return the number of filesystem blocks per inode cluster */
32static inline int 39static inline int
33xfs_icluster_size_fsb( 40xfs_icluster_size_fsb(
@@ -90,8 +97,7 @@ xfs_difree(
90 struct xfs_trans *tp, /* transaction pointer */ 97 struct xfs_trans *tp, /* transaction pointer */
91 xfs_ino_t inode, /* inode to be freed */ 98 xfs_ino_t inode, /* inode to be freed */
92 struct xfs_bmap_free *flist, /* extents to free */ 99 struct xfs_bmap_free *flist, /* extents to free */
93 int *deleted, /* set if inode cluster was deleted */ 100 struct xfs_icluster *ifree); /* cluster info if deleted */
94 xfs_ino_t *first_ino); /* first inode in deleted cluster */
95 101
96/* 102/*
97 * Return the location of the inode in imap, for mapping it into a buffer. 103 * Return the location of the inode in imap, for mapping it into a buffer.
@@ -156,7 +162,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
156 * Inode chunk initialisation routine 162 * Inode chunk initialisation routine
157 */ 163 */
158int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, 164int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
159 struct list_head *buffer_list, 165 struct list_head *buffer_list, int icount,
160 xfs_agnumber_t agno, xfs_agblock_t agbno, 166 xfs_agnumber_t agno, xfs_agblock_t agbno,
161 xfs_agblock_t length, unsigned int gen); 167 xfs_agblock_t length, unsigned int gen);
162 168
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 964c465ca69c..674ad8f760be 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur(
167 union xfs_btree_rec *rec) 167 union xfs_btree_rec *rec)
168{ 168{
169 rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); 169 rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
170 rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); 170 if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
171 rec->inobt.ir_u.sp.ir_holemask =
172 cpu_to_be16(cur->bc_rec.i.ir_holemask);
173 rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
174 rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
175 } else {
176 /* ir_holemask/ir_count not supported on-disk */
177 rec->inobt.ir_u.f.ir_freecount =
178 cpu_to_be32(cur->bc_rec.i.ir_freecount);
179 }
171 rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); 180 rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
172} 181}
173 182
@@ -418,3 +427,85 @@ xfs_inobt_maxrecs(
418 return blocklen / sizeof(xfs_inobt_rec_t); 427 return blocklen / sizeof(xfs_inobt_rec_t);
419 return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); 428 return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
420} 429}
430
431/*
432 * Convert the inode record holemask to an inode allocation bitmap. The inode
433 * allocation bitmap is inode granularity and specifies whether an inode is
434 * physically allocated on disk (not whether the inode is considered allocated
435 * or free by the fs).
436 *
437 * A bit value of 1 means the inode is allocated, a value of 0 means it is free.
438 */
439uint64_t
440xfs_inobt_irec_to_allocmask(
441 struct xfs_inobt_rec_incore *rec)
442{
443 uint64_t bitmap = 0;
444 uint64_t inodespbit;
445 int nextbit;
446 uint allocbitmap;
447
448 /*
449 * The holemask has 16-bits for a 64 inode record. Therefore each
450 * holemask bit represents multiple inodes. Create a mask of bits to set
451 * in the allocmask for each holemask bit.
452 */
453 inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
454
455 /*
456 * Allocated inodes are represented by 0 bits in holemask. Invert the 0
457 * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
458 * anything beyond the 16 holemask bits since this casts to a larger
459 * type.
460 */
461 allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
462
463 /*
464 * allocbitmap is the inverted holemask so every set bit represents
465 * allocated inodes. To expand from 16-bit holemask granularity to
466 * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target
467 * bitmap for every holemask bit.
468 */
469 nextbit = xfs_next_bit(&allocbitmap, 1, 0);
470 while (nextbit != -1) {
471 ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
472
473 bitmap |= (inodespbit <<
474 (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
475
476 nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
477 }
478
479 return bitmap;
480}
481
482#if defined(DEBUG) || defined(XFS_WARN)
483/*
484 * Verify that an in-core inode record has a valid inode count.
485 */
486int
487xfs_inobt_rec_check_count(
488 struct xfs_mount *mp,
489 struct xfs_inobt_rec_incore *rec)
490{
491 int inocount = 0;
492 int nextbit = 0;
493 uint64_t allocbmap;
494 int wordsz;
495
496 wordsz = sizeof(allocbmap) / sizeof(unsigned int);
497 allocbmap = xfs_inobt_irec_to_allocmask(rec);
498
499 nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
500 while (nextbit != -1) {
501 inocount++;
502 nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
503 nextbit + 1);
504 }
505
506 if (inocount != rec->ir_count)
507 return -EFSCORRUPTED;
508
509 return 0;
510}
511#endif /* DEBUG */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c2d0..bd88453217ce 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
62 xfs_btnum_t); 62 xfs_btnum_t);
63extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); 63extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
64 64
65/* ir_holemask to inode allocation bitmap conversion */
66uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
67
68#if defined(DEBUG) || defined(XFS_WARN)
69int xfs_inobt_rec_check_count(struct xfs_mount *,
70 struct xfs_inobt_rec_incore *);
71#else
72#define xfs_inobt_rec_check_count(mp, rec) 0
73#endif /* DEBUG */
74
65#endif /* __XFS_IALLOC_BTREE_H__ */ 75#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index dc4bfc5d88fc..df9851c46b5c 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -174,6 +174,27 @@ xfs_mount_validate_sb(
174 return -EFSCORRUPTED; 174 return -EFSCORRUPTED;
175 } 175 }
176 176
177 /*
178 * Full inode chunks must be aligned to inode chunk size when
179 * sparse inodes are enabled to support the sparse chunk
180 * allocation algorithm and prevent overlapping inode records.
181 */
182 if (xfs_sb_version_hassparseinodes(sbp)) {
183 uint32_t align;
184
185 xfs_alert(mp,
186 "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
187
188 align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
189 >> sbp->sb_blocklog;
190 if (sbp->sb_inoalignmt != align) {
191 xfs_warn(mp,
192"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
193 sbp->sb_inoalignmt, align);
194 return -EINVAL;
195 }
196 }
197
177 if (unlikely( 198 if (unlikely(
178 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 199 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
179 xfs_warn(mp, 200 xfs_warn(mp,
@@ -374,7 +395,7 @@ __xfs_sb_from_disk(
374 be32_to_cpu(from->sb_features_log_incompat); 395 be32_to_cpu(from->sb_features_log_incompat);
375 /* crc is only used on disk, not in memory; just init to 0 here. */ 396 /* crc is only used on disk, not in memory; just init to 0 here. */
376 to->sb_crc = 0; 397 to->sb_crc = 0;
377 to->sb_pad = 0; 398 to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
378 to->sb_pquotino = be64_to_cpu(from->sb_pquotino); 399 to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
379 to->sb_lsn = be64_to_cpu(from->sb_lsn); 400 to->sb_lsn = be64_to_cpu(from->sb_lsn);
380 /* Convert on-disk flags to in-memory flags? */ 401 /* Convert on-disk flags to in-memory flags? */
@@ -516,7 +537,7 @@ xfs_sb_to_disk(
516 cpu_to_be32(from->sb_features_incompat); 537 cpu_to_be32(from->sb_features_incompat);
517 to->sb_features_log_incompat = 538 to->sb_features_log_incompat =
518 cpu_to_be32(from->sb_features_log_incompat); 539 cpu_to_be32(from->sb_features_log_incompat);
519 to->sb_pad = 0; 540 to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
520 to->sb_lsn = cpu_to_be64(from->sb_lsn); 541 to->sb_lsn = cpu_to_be64(from->sb_lsn);
521 } 542 }
522} 543}
@@ -689,6 +710,11 @@ xfs_sb_mount_common(
689 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, 710 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
690 sbp->sb_inopblock); 711 sbp->sb_inopblock);
691 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; 712 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
713
714 if (sbp->sb_spino_align)
715 mp->m_ialloc_min_blks = sbp->sb_spino_align;
716 else
717 mp->m_ialloc_min_blks = mp->m_ialloc_blks;
692} 718}
693 719
694/* 720/*
@@ -792,12 +818,12 @@ xfs_sync_sb(
792 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); 818 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
793 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); 819 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
794 if (error) { 820 if (error) {
795 xfs_trans_cancel(tp, 0); 821 xfs_trans_cancel(tp);
796 return error; 822 return error;
797 } 823 }
798 824
799 xfs_log_sb(tp); 825 xfs_log_sb(tp);
800 if (wait) 826 if (wait)
801 xfs_trans_set_sync(tp); 827 xfs_trans_set_sync(tp);
802 return xfs_trans_commit(tp, 0); 828 return xfs_trans_commit(tp);
803} 829}
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 8dda4b321343..5be529707903 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -182,12 +182,6 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
182#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer 182#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
183 count in superblock */ 183 count in superblock */
184/* 184/*
185 * Values for call flags parameter.
186 */
187#define XFS_TRANS_RELEASE_LOG_RES 0x4
188#define XFS_TRANS_ABORT 0x8
189
190/*
191 * Field values for xfs_trans_mod_sb. 185 * Field values for xfs_trans_mod_sb.
192 */ 186 */
193#define XFS_TRANS_SB_ICOUNT 0x00000001 187#define XFS_TRANS_SB_ICOUNT 0x00000001
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a56960dd1684..dc5269807399 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -109,7 +109,7 @@ xfs_setfilesize_trans_alloc(
109 109
110 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); 110 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
111 if (error) { 111 if (error) {
112 xfs_trans_cancel(tp, 0); 112 xfs_trans_cancel(tp);
113 return error; 113 return error;
114 } 114 }
115 115
@@ -145,7 +145,7 @@ xfs_setfilesize(
145 isize = xfs_new_eof(ip, offset + size); 145 isize = xfs_new_eof(ip, offset + size);
146 if (!isize) { 146 if (!isize) {
147 xfs_iunlock(ip, XFS_ILOCK_EXCL); 147 xfs_iunlock(ip, XFS_ILOCK_EXCL);
148 xfs_trans_cancel(tp, 0); 148 xfs_trans_cancel(tp);
149 return 0; 149 return 0;
150 } 150 }
151 151
@@ -155,7 +155,7 @@ xfs_setfilesize(
155 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 155 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
156 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 156 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
157 157
158 return xfs_trans_commit(tp, 0); 158 return xfs_trans_commit(tp);
159} 159}
160 160
161STATIC int 161STATIC int
@@ -1349,7 +1349,7 @@ __xfs_get_blocks(
1349 sector_t iblock, 1349 sector_t iblock,
1350 struct buffer_head *bh_result, 1350 struct buffer_head *bh_result,
1351 int create, 1351 int create,
1352 int direct) 1352 bool direct)
1353{ 1353{
1354 struct xfs_inode *ip = XFS_I(inode); 1354 struct xfs_inode *ip = XFS_I(inode);
1355 struct xfs_mount *mp = ip->i_mount; 1355 struct xfs_mount *mp = ip->i_mount;
@@ -1414,6 +1414,7 @@ __xfs_get_blocks(
1414 if (error) 1414 if (error)
1415 return error; 1415 return error;
1416 new = 1; 1416 new = 1;
1417
1417 } else { 1418 } else {
1418 /* 1419 /*
1419 * Delalloc reservations do not require a transaction, 1420 * Delalloc reservations do not require a transaction,
@@ -1508,49 +1509,29 @@ xfs_get_blocks(
1508 struct buffer_head *bh_result, 1509 struct buffer_head *bh_result,
1509 int create) 1510 int create)
1510{ 1511{
1511 return __xfs_get_blocks(inode, iblock, bh_result, create, 0); 1512 return __xfs_get_blocks(inode, iblock, bh_result, create, false);
1512} 1513}
1513 1514
1514STATIC int 1515int
1515xfs_get_blocks_direct( 1516xfs_get_blocks_direct(
1516 struct inode *inode, 1517 struct inode *inode,
1517 sector_t iblock, 1518 sector_t iblock,
1518 struct buffer_head *bh_result, 1519 struct buffer_head *bh_result,
1519 int create) 1520 int create)
1520{ 1521{
1521 return __xfs_get_blocks(inode, iblock, bh_result, create, 1); 1522 return __xfs_get_blocks(inode, iblock, bh_result, create, true);
1522} 1523}
1523 1524
1524/* 1525static void
1525 * Complete a direct I/O write request. 1526__xfs_end_io_direct_write(
1526 * 1527 struct inode *inode,
1527 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. 1528 struct xfs_ioend *ioend,
1528 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1529 * wholly within the EOF and so there is nothing for us to do. Note that in this
1530 * case the completion can be called in interrupt context, whereas if we have an
1531 * ioend we will always be called in task context (i.e. from a workqueue).
1532 */
1533STATIC void
1534xfs_end_io_direct_write(
1535 struct kiocb *iocb,
1536 loff_t offset, 1529 loff_t offset,
1537 ssize_t size, 1530 ssize_t size)
1538 void *private)
1539{ 1531{
1540 struct inode *inode = file_inode(iocb->ki_filp); 1532 struct xfs_mount *mp = XFS_I(inode)->i_mount;
1541 struct xfs_inode *ip = XFS_I(inode);
1542 struct xfs_mount *mp = ip->i_mount;
1543 struct xfs_ioend *ioend = private;
1544
1545 trace_xfs_gbmap_direct_endio(ip, offset, size,
1546 ioend ? ioend->io_type : 0, NULL);
1547 1533
1548 if (!ioend) { 1534 if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
1549 ASSERT(offset + size <= i_size_read(inode));
1550 return;
1551 }
1552
1553 if (XFS_FORCED_SHUTDOWN(mp))
1554 goto out_end_io; 1535 goto out_end_io;
1555 1536
1556 /* 1537 /*
@@ -1587,10 +1568,10 @@ xfs_end_io_direct_write(
1587 * here can result in EOF moving backwards and Bad Things Happen when 1568 * here can result in EOF moving backwards and Bad Things Happen when
1588 * that occurs. 1569 * that occurs.
1589 */ 1570 */
1590 spin_lock(&ip->i_flags_lock); 1571 spin_lock(&XFS_I(inode)->i_flags_lock);
1591 if (offset + size > i_size_read(inode)) 1572 if (offset + size > i_size_read(inode))
1592 i_size_write(inode, offset + size); 1573 i_size_write(inode, offset + size);
1593 spin_unlock(&ip->i_flags_lock); 1574 spin_unlock(&XFS_I(inode)->i_flags_lock);
1594 1575
1595 /* 1576 /*
1596 * If we are doing an append IO that needs to update the EOF on disk, 1577 * If we are doing an append IO that needs to update the EOF on disk,
@@ -1607,6 +1588,98 @@ out_end_io:
1607 return; 1588 return;
1608} 1589}
1609 1590
1591/*
1592 * Complete a direct I/O write request.
1593 *
1594 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
1595 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1596 * wholly within the EOF and so there is nothing for us to do. Note that in this
1597 * case the completion can be called in interrupt context, whereas if we have an
1598 * ioend we will always be called in task context (i.e. from a workqueue).
1599 */
1600STATIC void
1601xfs_end_io_direct_write(
1602 struct kiocb *iocb,
1603 loff_t offset,
1604 ssize_t size,
1605 void *private)
1606{
1607 struct inode *inode = file_inode(iocb->ki_filp);
1608 struct xfs_ioend *ioend = private;
1609
1610 trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
1611 ioend ? ioend->io_type : 0, NULL);
1612
1613 if (!ioend) {
1614 ASSERT(offset + size <= i_size_read(inode));
1615 return;
1616 }
1617
1618 __xfs_end_io_direct_write(inode, ioend, offset, size);
1619}
1620
1621/*
1622 * For DAX we need a mapping buffer callback for unwritten extent conversion
1623 * when page faults allocate blocks and then zero them. Note that in this
1624 * case the mapping indicated by the ioend may extend beyond EOF. We most
1625 * definitely do not want to extend EOF here, so we trim back the ioend size to
1626 * EOF.
1627 */
1628#ifdef CONFIG_FS_DAX
1629void
1630xfs_end_io_dax_write(
1631 struct buffer_head *bh,
1632 int uptodate)
1633{
1634 struct xfs_ioend *ioend = bh->b_private;
1635 struct inode *inode = ioend->io_inode;
1636 ssize_t size = ioend->io_size;
1637
1638 ASSERT(IS_DAX(ioend->io_inode));
1639
1640 /* if there was an error zeroing, then don't convert it */
1641 if (!uptodate)
1642 ioend->io_error = -EIO;
1643
1644 /*
1645 * Trim update to EOF, so we don't extend EOF during unwritten extent
1646 * conversion of partial EOF blocks.
1647 */
1648 spin_lock(&XFS_I(inode)->i_flags_lock);
1649 if (ioend->io_offset + size > i_size_read(inode))
1650 size = i_size_read(inode) - ioend->io_offset;
1651 spin_unlock(&XFS_I(inode)->i_flags_lock);
1652
1653 __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
1654
1655}
1656#else
1657void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
1658#endif
1659
1660static inline ssize_t
1661xfs_vm_do_dio(
1662 struct inode *inode,
1663 struct kiocb *iocb,
1664 struct iov_iter *iter,
1665 loff_t offset,
1666 void (*endio)(struct kiocb *iocb,
1667 loff_t offset,
1668 ssize_t size,
1669 void *private),
1670 int flags)
1671{
1672 struct block_device *bdev;
1673
1674 if (IS_DAX(inode))
1675 return dax_do_io(iocb, inode, iter, offset,
1676 xfs_get_blocks_direct, endio, 0);
1677
1678 bdev = xfs_find_bdev_for_inode(inode);
1679 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
1680 xfs_get_blocks_direct, endio, NULL, flags);
1681}
1682
1610STATIC ssize_t 1683STATIC ssize_t
1611xfs_vm_direct_IO( 1684xfs_vm_direct_IO(
1612 struct kiocb *iocb, 1685 struct kiocb *iocb,
@@ -1614,16 +1687,11 @@ xfs_vm_direct_IO(
1614 loff_t offset) 1687 loff_t offset)
1615{ 1688{
1616 struct inode *inode = iocb->ki_filp->f_mapping->host; 1689 struct inode *inode = iocb->ki_filp->f_mapping->host;
1617 struct block_device *bdev = xfs_find_bdev_for_inode(inode);
1618 1690
1619 if (iov_iter_rw(iter) == WRITE) { 1691 if (iov_iter_rw(iter) == WRITE)
1620 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, 1692 return xfs_vm_do_dio(inode, iocb, iter, offset,
1621 xfs_get_blocks_direct, 1693 xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
1622 xfs_end_io_direct_write, NULL, 1694 return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
1623 DIO_ASYNC_EXTEND);
1624 }
1625 return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
1626 xfs_get_blocks_direct, NULL, NULL, 0);
1627} 1695}
1628 1696
1629/* 1697/*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index ac644e0137a4..86afd1ac7895 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -53,7 +53,12 @@ typedef struct xfs_ioend {
53} xfs_ioend_t; 53} xfs_ioend_t;
54 54
55extern const struct address_space_operations xfs_address_space_operations; 55extern const struct address_space_operations xfs_address_space_operations;
56extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); 56
57int xfs_get_blocks(struct inode *inode, sector_t offset,
58 struct buffer_head *map_bh, int create);
59int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
60 struct buffer_head *map_bh, int create);
61void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
57 62
58extern void xfs_count_page_state(struct page *, int *, int *); 63extern void xfs_count_page_state(struct page *, int *, int *);
59 64
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index f9c1c64782d3..69a154c58287 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -380,23 +380,30 @@ xfs_attr3_root_inactive(
380 return error; 380 return error;
381} 381}
382 382
383/*
384 * xfs_attr_inactive kills all traces of an attribute fork on an inode. It
385 * removes both the on-disk and in-memory inode fork. Note that this also has to
386 * handle the condition of inodes without attributes but with an attribute fork
387 * configured, so we can't use xfs_inode_hasattr() here.
388 *
389 * The in-memory attribute fork is removed even on error.
390 */
383int 391int
384xfs_attr_inactive(xfs_inode_t *dp) 392xfs_attr_inactive(
393 struct xfs_inode *dp)
385{ 394{
386 xfs_trans_t *trans; 395 struct xfs_trans *trans;
387 xfs_mount_t *mp; 396 struct xfs_mount *mp;
388 int error; 397 int lock_mode = XFS_ILOCK_SHARED;
398 int error = 0;
389 399
390 mp = dp->i_mount; 400 mp = dp->i_mount;
391 ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); 401 ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
392 402
393 xfs_ilock(dp, XFS_ILOCK_SHARED); 403 xfs_ilock(dp, lock_mode);
394 if (!xfs_inode_hasattr(dp) || 404 if (!XFS_IFORK_Q(dp))
395 dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { 405 goto out_destroy_fork;
396 xfs_iunlock(dp, XFS_ILOCK_SHARED); 406 xfs_iunlock(dp, lock_mode);
397 return 0;
398 }
399 xfs_iunlock(dp, XFS_ILOCK_SHARED);
400 407
401 /* 408 /*
402 * Start our first transaction of the day. 409 * Start our first transaction of the day.
@@ -408,13 +415,17 @@ xfs_attr_inactive(xfs_inode_t *dp)
408 * the inode in every transaction to let it float upward through 415 * the inode in every transaction to let it float upward through
409 * the log. 416 * the log.
410 */ 417 */
418 lock_mode = 0;
411 trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); 419 trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
412 error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); 420 error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
413 if (error) { 421 if (error)
414 xfs_trans_cancel(trans, 0); 422 goto out_cancel;
415 return error; 423
416 } 424 lock_mode = XFS_ILOCK_EXCL;
417 xfs_ilock(dp, XFS_ILOCK_EXCL); 425 xfs_ilock(dp, lock_mode);
426
427 if (!XFS_IFORK_Q(dp))
428 goto out_cancel;
418 429
419 /* 430 /*
420 * No need to make quota reservations here. We expect to release some 431 * No need to make quota reservations here. We expect to release some
@@ -422,29 +433,31 @@ xfs_attr_inactive(xfs_inode_t *dp)
422 */ 433 */
423 xfs_trans_ijoin(trans, dp, 0); 434 xfs_trans_ijoin(trans, dp, 0);
424 435
425 /* 436 /* invalidate and truncate the attribute fork extents */
426 * Decide on what work routines to call based on the inode size. 437 if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
427 */ 438 error = xfs_attr3_root_inactive(&trans, dp);
428 if (!xfs_inode_hasattr(dp) || 439 if (error)
429 dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { 440 goto out_cancel;
430 error = 0;
431 goto out;
432 }
433 error = xfs_attr3_root_inactive(&trans, dp);
434 if (error)
435 goto out;
436 441
437 error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); 442 error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
438 if (error) 443 if (error)
439 goto out; 444 goto out_cancel;
445 }
440 446
441 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); 447 /* Reset the attribute fork - this also destroys the in-core fork */
442 xfs_iunlock(dp, XFS_ILOCK_EXCL); 448 xfs_attr_fork_remove(dp, trans);
443 449
450 error = xfs_trans_commit(trans);
451 xfs_iunlock(dp, lock_mode);
444 return error; 452 return error;
445 453
446out: 454out_cancel:
447 xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 455 xfs_trans_cancel(trans);
448 xfs_iunlock(dp, XFS_ILOCK_EXCL); 456out_destroy_fork:
457 /* kill the in-core attr fork before we drop the inode lock */
458 if (dp->i_afp)
459 xfs_idestroy_fork(dp, XFS_ATTR_FORK);
460 if (lock_mode)
461 xfs_iunlock(dp, lock_mode);
449 return error; 462 return error;
450} 463}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a52bbd3abc7d..0f34886cf726 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -75,28 +75,20 @@ xfs_bmap_finish(
75 xfs_efi_log_item_t *efi; /* extent free intention */ 75 xfs_efi_log_item_t *efi; /* extent free intention */
76 int error; /* error return value */ 76 int error; /* error return value */
77 xfs_bmap_free_item_t *free; /* free extent item */ 77 xfs_bmap_free_item_t *free; /* free extent item */
78 struct xfs_trans_res tres; /* new log reservation */
79 xfs_mount_t *mp; /* filesystem mount structure */ 78 xfs_mount_t *mp; /* filesystem mount structure */
80 xfs_bmap_free_item_t *next; /* next item on free list */ 79 xfs_bmap_free_item_t *next; /* next item on free list */
81 xfs_trans_t *ntp; /* new transaction pointer */
82 80
83 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 81 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
84 if (flist->xbf_count == 0) { 82 if (flist->xbf_count == 0) {
85 *committed = 0; 83 *committed = 0;
86 return 0; 84 return 0;
87 } 85 }
88 ntp = *tp; 86 efi = xfs_trans_get_efi(*tp, flist->xbf_count);
89 efi = xfs_trans_get_efi(ntp, flist->xbf_count);
90 for (free = flist->xbf_first; free; free = free->xbfi_next) 87 for (free = flist->xbf_first; free; free = free->xbfi_next)
91 xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, 88 xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
92 free->xbfi_blockcount); 89 free->xbfi_blockcount);
93 90
94 tres.tr_logres = ntp->t_log_res; 91 error = xfs_trans_roll(tp, NULL);
95 tres.tr_logcount = ntp->t_log_count;
96 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
97 ntp = xfs_trans_dup(*tp);
98 error = xfs_trans_commit(*tp, 0);
99 *tp = ntp;
100 *committed = 1; 92 *committed = 1;
101 /* 93 /*
102 * We have a new transaction, so we should return committed=1, 94 * We have a new transaction, so we should return committed=1,
@@ -105,19 +97,10 @@ xfs_bmap_finish(
105 if (error) 97 if (error)
106 return error; 98 return error;
107 99
108 /* 100 efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
109 * transaction commit worked ok so we can drop the extra ticket
110 * reference that we gained in xfs_trans_dup()
111 */
112 xfs_log_ticket_put(ntp->t_ticket);
113
114 error = xfs_trans_reserve(ntp, &tres, 0, 0);
115 if (error)
116 return error;
117 efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
118 for (free = flist->xbf_first; free != NULL; free = next) { 101 for (free = flist->xbf_first; free != NULL; free = next) {
119 next = free->xbfi_next; 102 next = free->xbfi_next;
120 if ((error = xfs_free_extent(ntp, free->xbfi_startblock, 103 if ((error = xfs_free_extent(*tp, free->xbfi_startblock,
121 free->xbfi_blockcount))) { 104 free->xbfi_blockcount))) {
122 /* 105 /*
123 * The bmap free list will be cleaned up at a 106 * The bmap free list will be cleaned up at a
@@ -127,7 +110,7 @@ xfs_bmap_finish(
127 * happens, since this transaction may not be 110 * happens, since this transaction may not be
128 * dirty yet. 111 * dirty yet.
129 */ 112 */
130 mp = ntp->t_mountp; 113 mp = (*tp)->t_mountp;
131 if (!XFS_FORCED_SHUTDOWN(mp)) 114 if (!XFS_FORCED_SHUTDOWN(mp))
132 xfs_force_shutdown(mp, 115 xfs_force_shutdown(mp,
133 (error == -EFSCORRUPTED) ? 116 (error == -EFSCORRUPTED) ?
@@ -135,7 +118,7 @@ xfs_bmap_finish(
135 SHUTDOWN_META_IO_ERROR); 118 SHUTDOWN_META_IO_ERROR);
136 return error; 119 return error;
137 } 120 }
138 xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, 121 xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock,
139 free->xbfi_blockcount); 122 free->xbfi_blockcount);
140 xfs_bmap_del_free(flist, NULL, free); 123 xfs_bmap_del_free(flist, NULL, free);
141 } 124 }
@@ -878,7 +861,7 @@ xfs_free_eofblocks(
878 861
879 if (need_iolock) { 862 if (need_iolock) {
880 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 863 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
881 xfs_trans_cancel(tp, 0); 864 xfs_trans_cancel(tp);
882 return -EAGAIN; 865 return -EAGAIN;
883 } 866 }
884 } 867 }
@@ -886,7 +869,7 @@ xfs_free_eofblocks(
886 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); 869 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
887 if (error) { 870 if (error) {
888 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 871 ASSERT(XFS_FORCED_SHUTDOWN(mp));
889 xfs_trans_cancel(tp, 0); 872 xfs_trans_cancel(tp);
890 if (need_iolock) 873 if (need_iolock)
891 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 874 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
892 return error; 875 return error;
@@ -908,12 +891,9 @@ xfs_free_eofblocks(
908 * If we get an error at this point we simply don't 891 * If we get an error at this point we simply don't
909 * bother truncating the file. 892 * bother truncating the file.
910 */ 893 */
911 xfs_trans_cancel(tp, 894 xfs_trans_cancel(tp);
912 (XFS_TRANS_RELEASE_LOG_RES |
913 XFS_TRANS_ABORT));
914 } else { 895 } else {
915 error = xfs_trans_commit(tp, 896 error = xfs_trans_commit(tp);
916 XFS_TRANS_RELEASE_LOG_RES);
917 if (!error) 897 if (!error)
918 xfs_inode_clear_eofblocks_tag(ip); 898 xfs_inode_clear_eofblocks_tag(ip);
919 } 899 }
@@ -1026,7 +1006,7 @@ xfs_alloc_file_space(
1026 * Free the transaction structure. 1006 * Free the transaction structure.
1027 */ 1007 */
1028 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); 1008 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1029 xfs_trans_cancel(tp, 0); 1009 xfs_trans_cancel(tp);
1030 break; 1010 break;
1031 } 1011 }
1032 xfs_ilock(ip, XFS_ILOCK_EXCL); 1012 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1053,7 +1033,7 @@ xfs_alloc_file_space(
1053 goto error0; 1033 goto error0;
1054 } 1034 }
1055 1035
1056 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1036 error = xfs_trans_commit(tp);
1057 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1037 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1058 if (error) { 1038 if (error) {
1059 break; 1039 break;
@@ -1077,7 +1057,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1077 xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); 1057 xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1078 1058
1079error1: /* Just cancel transaction */ 1059error1: /* Just cancel transaction */
1080 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 1060 xfs_trans_cancel(tp);
1081 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1061 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1082 return error; 1062 return error;
1083} 1063}
@@ -1133,14 +1113,29 @@ xfs_zero_remaining_bytes(
1133 break; 1113 break;
1134 ASSERT(imap.br_blockcount >= 1); 1114 ASSERT(imap.br_blockcount >= 1);
1135 ASSERT(imap.br_startoff == offset_fsb); 1115 ASSERT(imap.br_startoff == offset_fsb);
1116 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1117
1118 if (imap.br_startblock == HOLESTARTBLOCK ||
1119 imap.br_state == XFS_EXT_UNWRITTEN) {
1120 /* skip the entire extent */
1121 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
1122 imap.br_blockcount) - 1;
1123 continue;
1124 }
1125
1136 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; 1126 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
1137 if (lastoffset > endoff) 1127 if (lastoffset > endoff)
1138 lastoffset = endoff; 1128 lastoffset = endoff;
1139 if (imap.br_startblock == HOLESTARTBLOCK) 1129
1140 continue; 1130 /* DAX can just zero the backing device directly */
1141 ASSERT(imap.br_startblock != DELAYSTARTBLOCK); 1131 if (IS_DAX(VFS_I(ip))) {
1142 if (imap.br_state == XFS_EXT_UNWRITTEN) 1132 error = dax_zero_page_range(VFS_I(ip), offset,
1133 lastoffset - offset + 1,
1134 xfs_get_blocks_direct);
1135 if (error)
1136 return error;
1143 continue; 1137 continue;
1138 }
1144 1139
1145 error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? 1140 error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
1146 mp->m_rtdev_targp : mp->m_ddev_targp, 1141 mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -1289,7 +1284,7 @@ xfs_free_file_space(
1289 * Free the transaction structure. 1284 * Free the transaction structure.
1290 */ 1285 */
1291 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); 1286 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1292 xfs_trans_cancel(tp, 0); 1287 xfs_trans_cancel(tp);
1293 break; 1288 break;
1294 } 1289 }
1295 xfs_ilock(ip, XFS_ILOCK_EXCL); 1290 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1320,7 +1315,7 @@ xfs_free_file_space(
1320 goto error0; 1315 goto error0;
1321 } 1316 }
1322 1317
1323 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1318 error = xfs_trans_commit(tp);
1324 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1319 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1325 } 1320 }
1326 1321
@@ -1330,7 +1325,7 @@ xfs_free_file_space(
1330 error0: 1325 error0:
1331 xfs_bmap_cancel(&free_list); 1326 xfs_bmap_cancel(&free_list);
1332 error1: 1327 error1:
1333 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 1328 xfs_trans_cancel(tp);
1334 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1329 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1335 goto out; 1330 goto out;
1336} 1331}
@@ -1462,7 +1457,7 @@ xfs_shift_file_space(
1462 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, 1457 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1463 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); 1458 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
1464 if (error) { 1459 if (error) {
1465 xfs_trans_cancel(tp, 0); 1460 xfs_trans_cancel(tp);
1466 break; 1461 break;
1467 } 1462 }
1468 1463
@@ -1492,13 +1487,13 @@ xfs_shift_file_space(
1492 if (error) 1487 if (error)
1493 goto out; 1488 goto out;
1494 1489
1495 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1490 error = xfs_trans_commit(tp);
1496 } 1491 }
1497 1492
1498 return error; 1493 return error;
1499 1494
1500out: 1495out:
1501 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 1496 xfs_trans_cancel(tp);
1502 return error; 1497 return error;
1503} 1498}
1504 1499
@@ -1718,7 +1713,7 @@ xfs_swap_extents(
1718 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); 1713 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
1719 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); 1714 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1720 if (error) { 1715 if (error) {
1721 xfs_trans_cancel(tp, 0); 1716 xfs_trans_cancel(tp);
1722 goto out_unlock; 1717 goto out_unlock;
1723 } 1718 }
1724 1719
@@ -1901,7 +1896,7 @@ xfs_swap_extents(
1901 if (mp->m_flags & XFS_MOUNT_WSYNC) 1896 if (mp->m_flags & XFS_MOUNT_WSYNC)
1902 xfs_trans_set_sync(tp); 1897 xfs_trans_set_sync(tp);
1903 1898
1904 error = xfs_trans_commit(tp, 0); 1899 error = xfs_trans_commit(tp);
1905 1900
1906 trace_xfs_swap_extent_after(ip, 0); 1901 trace_xfs_swap_extent_after(ip, 0);
1907 trace_xfs_swap_extent_after(tip, 1); 1902 trace_xfs_swap_extent_after(tip, 1);
@@ -1915,6 +1910,6 @@ out_unlock:
1915 goto out; 1910 goto out;
1916 1911
1917out_trans_cancel: 1912out_trans_cancel:
1918 xfs_trans_cancel(tp, 0); 1913 xfs_trans_cancel(tp);
1919 goto out; 1914 goto out;
1920} 1915}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 02c01bbbc789..4143dc75dca4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -568,8 +568,6 @@ xfs_qm_dqread(
568 struct xfs_buf *bp; 568 struct xfs_buf *bp;
569 struct xfs_trans *tp = NULL; 569 struct xfs_trans *tp = NULL;
570 int error; 570 int error;
571 int cancelflags = 0;
572
573 571
574 dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); 572 dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
575 573
@@ -617,7 +615,6 @@ xfs_qm_dqread(
617 XFS_QM_DQALLOC_SPACE_RES(mp), 0); 615 XFS_QM_DQALLOC_SPACE_RES(mp), 0);
618 if (error) 616 if (error)
619 goto error1; 617 goto error1;
620 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
621 } 618 }
622 619
623 /* 620 /*
@@ -632,7 +629,6 @@ xfs_qm_dqread(
632 * allocate (ENOENT). 629 * allocate (ENOENT).
633 */ 630 */
634 trace_xfs_dqread_fail(dqp); 631 trace_xfs_dqread_fail(dqp);
635 cancelflags |= XFS_TRANS_ABORT;
636 goto error1; 632 goto error1;
637 } 633 }
638 634
@@ -670,7 +666,7 @@ xfs_qm_dqread(
670 xfs_trans_brelse(tp, bp); 666 xfs_trans_brelse(tp, bp);
671 667
672 if (tp) { 668 if (tp) {
673 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 669 error = xfs_trans_commit(tp);
674 if (error) 670 if (error)
675 goto error0; 671 goto error0;
676 } 672 }
@@ -680,7 +676,7 @@ xfs_qm_dqread(
680 676
681error1: 677error1:
682 if (tp) 678 if (tp)
683 xfs_trans_cancel(tp, cancelflags); 679 xfs_trans_cancel(tp);
684error0: 680error0:
685 xfs_qm_dqdestroy(dqp); 681 xfs_qm_dqdestroy(dqp);
686 *O_dqpp = NULL; 682 *O_dqpp = NULL;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8121e75352ee..97d92c144768 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -79,14 +79,15 @@ xfs_rw_ilock_demote(
79} 79}
80 80
81/* 81/*
82 * xfs_iozero 82 * xfs_iozero clears the specified range supplied via the page cache (except in
83 * the DAX case). Writes through the page cache will allocate blocks over holes,
84 * though the callers usually map the holes first and avoid them. If a block is
85 * not completely zeroed, then it will be read from disk before being partially
86 * zeroed.
83 * 87 *
84 * xfs_iozero clears the specified range of buffer supplied, 88 * In the DAX case, we can just directly write to the underlying pages. This
85 * and marks all the affected blocks as valid and modified. If 89 * will not allocate blocks, but will avoid holes and unwritten extents and so
86 * an affected block is not allocated, it will be allocated. If 90 * not do unnecessary work.
87 * an affected block is not completely overwritten, and is not
88 * valid before the operation, it will be read from disk before
89 * being partially zeroed.
90 */ 91 */
91int 92int
92xfs_iozero( 93xfs_iozero(
@@ -96,7 +97,8 @@ xfs_iozero(
96{ 97{
97 struct page *page; 98 struct page *page;
98 struct address_space *mapping; 99 struct address_space *mapping;
99 int status; 100 int status = 0;
101
100 102
101 mapping = VFS_I(ip)->i_mapping; 103 mapping = VFS_I(ip)->i_mapping;
102 do { 104 do {
@@ -108,23 +110,30 @@ xfs_iozero(
108 if (bytes > count) 110 if (bytes > count)
109 bytes = count; 111 bytes = count;
110 112
111 status = pagecache_write_begin(NULL, mapping, pos, bytes, 113 if (IS_DAX(VFS_I(ip))) {
112 AOP_FLAG_UNINTERRUPTIBLE, 114 status = dax_zero_page_range(VFS_I(ip), pos, bytes,
113 &page, &fsdata); 115 xfs_get_blocks_direct);
114 if (status) 116 if (status)
115 break; 117 break;
118 } else {
119 status = pagecache_write_begin(NULL, mapping, pos, bytes,
120 AOP_FLAG_UNINTERRUPTIBLE,
121 &page, &fsdata);
122 if (status)
123 break;
116 124
117 zero_user(page, offset, bytes); 125 zero_user(page, offset, bytes);
118 126
119 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, 127 status = pagecache_write_end(NULL, mapping, pos, bytes,
120 page, fsdata); 128 bytes, page, fsdata);
121 WARN_ON(status <= 0); /* can't return less than zero! */ 129 WARN_ON(status <= 0); /* can't return less than zero! */
130 status = 0;
131 }
122 pos += bytes; 132 pos += bytes;
123 count -= bytes; 133 count -= bytes;
124 status = 0;
125 } while (count); 134 } while (count);
126 135
127 return (-status); 136 return status;
128} 137}
129 138
130int 139int
@@ -138,7 +147,7 @@ xfs_update_prealloc_flags(
138 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); 147 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
139 error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); 148 error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
140 if (error) { 149 if (error) {
141 xfs_trans_cancel(tp, 0); 150 xfs_trans_cancel(tp);
142 return error; 151 return error;
143 } 152 }
144 153
@@ -160,7 +169,7 @@ xfs_update_prealloc_flags(
160 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 169 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
161 if (flags & XFS_PREALLOC_SYNC) 170 if (flags & XFS_PREALLOC_SYNC)
162 xfs_trans_set_sync(tp); 171 xfs_trans_set_sync(tp);
163 return xfs_trans_commit(tp, 0); 172 return xfs_trans_commit(tp);
164} 173}
165 174
166/* 175/*
@@ -284,7 +293,7 @@ xfs_file_read_iter(
284 if (file->f_mode & FMODE_NOCMTIME) 293 if (file->f_mode & FMODE_NOCMTIME)
285 ioflags |= XFS_IO_INVIS; 294 ioflags |= XFS_IO_INVIS;
286 295
287 if (unlikely(ioflags & XFS_IO_ISDIRECT)) { 296 if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
288 xfs_buftarg_t *target = 297 xfs_buftarg_t *target =
289 XFS_IS_REALTIME_INODE(ip) ? 298 XFS_IS_REALTIME_INODE(ip) ?
290 mp->m_rtdev_targp : mp->m_ddev_targp; 299 mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -378,7 +387,11 @@ xfs_file_splice_read(
378 387
379 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 388 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
380 389
381 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); 390 /* for dax, we need to avoid the page cache */
391 if (IS_DAX(VFS_I(ip)))
392 ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
393 else
394 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
382 if (ret > 0) 395 if (ret > 0)
383 XFS_STATS_ADD(xs_read_bytes, ret); 396 XFS_STATS_ADD(xs_read_bytes, ret);
384 397
@@ -672,7 +685,7 @@ xfs_file_dio_aio_write(
672 mp->m_rtdev_targp : mp->m_ddev_targp; 685 mp->m_rtdev_targp : mp->m_ddev_targp;
673 686
674 /* DIO must be aligned to device logical sector size */ 687 /* DIO must be aligned to device logical sector size */
675 if ((pos | count) & target->bt_logical_sectormask) 688 if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
676 return -EINVAL; 689 return -EINVAL;
677 690
678 /* "unaligned" here means not aligned to a filesystem block */ 691 /* "unaligned" here means not aligned to a filesystem block */
@@ -758,8 +771,11 @@ xfs_file_dio_aio_write(
758out: 771out:
759 xfs_rw_iunlock(ip, iolock); 772 xfs_rw_iunlock(ip, iolock);
760 773
761 /* No fallback to buffered IO on errors for XFS. */ 774 /*
762 ASSERT(ret < 0 || ret == count); 775 * No fallback to buffered IO on errors for XFS. DAX can result in
776 * partial writes, but direct IO will either complete fully or fail.
777 */
778 ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
763 return ret; 779 return ret;
764} 780}
765 781
@@ -842,7 +858,7 @@ xfs_file_write_iter(
842 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 858 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
843 return -EIO; 859 return -EIO;
844 860
845 if (unlikely(iocb->ki_flags & IOCB_DIRECT)) 861 if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
846 ret = xfs_file_dio_aio_write(iocb, from); 862 ret = xfs_file_dio_aio_write(iocb, from);
847 else 863 else
848 ret = xfs_file_buffered_aio_write(iocb, from); 864 ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1063,17 +1079,6 @@ xfs_file_readdir(
1063 return xfs_readdir(ip, ctx, bufsize); 1079 return xfs_readdir(ip, ctx, bufsize);
1064} 1080}
1065 1081
1066STATIC int
1067xfs_file_mmap(
1068 struct file *filp,
1069 struct vm_area_struct *vma)
1070{
1071 vma->vm_ops = &xfs_file_vm_ops;
1072
1073 file_accessed(filp);
1074 return 0;
1075}
1076
1077/* 1082/*
1078 * This type is designed to indicate the type of offset we would like 1083 * This type is designed to indicate the type of offset we would like
1079 * to search from page cache for xfs_seek_hole_data(). 1084 * to search from page cache for xfs_seek_hole_data().
@@ -1454,48 +1459,83 @@ xfs_file_llseek(
1454 * ordering of: 1459 * ordering of:
1455 * 1460 *
1456 * mmap_sem (MM) 1461 * mmap_sem (MM)
1457 * i_mmap_lock (XFS - truncate serialisation) 1462 * sb_start_pagefault(vfs, freeze)
1458 * page_lock (MM) 1463 * i_mmap_lock (XFS - truncate serialisation)
1459 * i_lock (XFS - extent map serialisation) 1464 * page_lock (MM)
1465 * i_lock (XFS - extent map serialisation)
1466 */
1467
1468/*
1469 * mmap()d file has taken write protection fault and is being made writable. We
1470 * can set the page state up correctly for a writable page, which means we can
1471 * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
1472 * mapping.
1460 */ 1473 */
1461STATIC int 1474STATIC int
1462xfs_filemap_fault( 1475xfs_filemap_page_mkwrite(
1463 struct vm_area_struct *vma, 1476 struct vm_area_struct *vma,
1464 struct vm_fault *vmf) 1477 struct vm_fault *vmf)
1465{ 1478{
1466 struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); 1479 struct inode *inode = file_inode(vma->vm_file);
1467 int error; 1480 int ret;
1468 1481
1469 trace_xfs_filemap_fault(ip); 1482 trace_xfs_filemap_page_mkwrite(XFS_I(inode));
1470 1483
1471 xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1484 sb_start_pagefault(inode->i_sb);
1472 error = filemap_fault(vma, vmf); 1485 file_update_time(vma->vm_file);
1473 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1486 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1474 1487
1475 return error; 1488 if (IS_DAX(inode)) {
1489 ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
1490 xfs_end_io_dax_write);
1491 } else {
1492 ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
1493 ret = block_page_mkwrite_return(ret);
1494 }
1495
1496 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1497 sb_end_pagefault(inode->i_sb);
1498
1499 return ret;
1476} 1500}
1477 1501
1478/*
1479 * mmap()d file has taken write protection fault and is being made writable. We
1480 * can set the page state up correctly for a writable page, which means we can
1481 * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
1482 * mapping.
1483 */
1484STATIC int 1502STATIC int
1485xfs_filemap_page_mkwrite( 1503xfs_filemap_fault(
1486 struct vm_area_struct *vma, 1504 struct vm_area_struct *vma,
1487 struct vm_fault *vmf) 1505 struct vm_fault *vmf)
1488{ 1506{
1489 struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); 1507 struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file));
1490 int error; 1508 int ret;
1509
1510 trace_xfs_filemap_fault(ip);
1491 1511
1492 trace_xfs_filemap_page_mkwrite(ip); 1512 /* DAX can shortcut the normal fault path on write faults! */
1513 if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
1514 return xfs_filemap_page_mkwrite(vma, vmf);
1493 1515
1494 xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1516 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1495 error = block_page_mkwrite(vma, vmf, xfs_get_blocks); 1517 ret = filemap_fault(vma, vmf);
1496 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1518 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1497 1519
1498 return error; 1520 return ret;
1521}
1522
1523static const struct vm_operations_struct xfs_file_vm_ops = {
1524 .fault = xfs_filemap_fault,
1525 .map_pages = filemap_map_pages,
1526 .page_mkwrite = xfs_filemap_page_mkwrite,
1527};
1528
1529STATIC int
1530xfs_file_mmap(
1531 struct file *filp,
1532 struct vm_area_struct *vma)
1533{
1534 file_accessed(filp);
1535 vma->vm_ops = &xfs_file_vm_ops;
1536 if (IS_DAX(file_inode(filp)))
1537 vma->vm_flags |= VM_MIXEDMAP;
1538 return 0;
1499} 1539}
1500 1540
1501const struct file_operations xfs_file_operations = { 1541const struct file_operations xfs_file_operations = {
@@ -1526,9 +1566,3 @@ const struct file_operations xfs_dir_file_operations = {
1526#endif 1566#endif
1527 .fsync = xfs_dir_fsync, 1567 .fsync = xfs_dir_fsync,
1528}; 1568};
1529
1530static const struct vm_operations_struct xfs_file_vm_ops = {
1531 .fault = xfs_filemap_fault,
1532 .map_pages = filemap_map_pages,
1533 .page_mkwrite = xfs_filemap_page_mkwrite,
1534};
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cb7e8a29dfb6..9b3438a7680f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -101,7 +101,9 @@ xfs_fs_geometry(
101 (xfs_sb_version_hasftype(&mp->m_sb) ? 101 (xfs_sb_version_hasftype(&mp->m_sb) ?
102 XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | 102 XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
103 (xfs_sb_version_hasfinobt(&mp->m_sb) ? 103 (xfs_sb_version_hasfinobt(&mp->m_sb) ?
104 XFS_FSOP_GEOM_FLAGS_FINOBT : 0); 104 XFS_FSOP_GEOM_FLAGS_FINOBT : 0) |
105 (xfs_sb_version_hassparseinodes(&mp->m_sb) ?
106 XFS_FSOP_GEOM_FLAGS_SPINODES : 0);
105 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? 107 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
106 mp->m_sb.sb_logsectsize : BBSIZE; 108 mp->m_sb.sb_logsectsize : BBSIZE;
107 geo->rtsectsize = mp->m_sb.sb_blocksize; 109 geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -201,7 +203,7 @@ xfs_growfs_data_private(
201 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, 203 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
202 XFS_GROWFS_SPACE_RES(mp), 0); 204 XFS_GROWFS_SPACE_RES(mp), 0);
203 if (error) { 205 if (error) {
204 xfs_trans_cancel(tp, 0); 206 xfs_trans_cancel(tp);
205 return error; 207 return error;
206 } 208 }
207 209
@@ -489,7 +491,7 @@ xfs_growfs_data_private(
489 if (dpct) 491 if (dpct)
490 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); 492 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
491 xfs_trans_set_sync(tp); 493 xfs_trans_set_sync(tp);
492 error = xfs_trans_commit(tp, 0); 494 error = xfs_trans_commit(tp);
493 if (error) 495 if (error)
494 return error; 496 return error;
495 497
@@ -557,7 +559,7 @@ xfs_growfs_data_private(
557 return saved_error ? saved_error : error; 559 return saved_error ? saved_error : error;
558 560
559 error0: 561 error0:
560 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 562 xfs_trans_cancel(tp);
561 return error; 563 return error;
562} 564}
563 565
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d6ebc85192b7..a37a1011b6e4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -905,7 +905,6 @@ xfs_dir_ialloc(
905 905
906{ 906{
907 xfs_trans_t *tp; 907 xfs_trans_t *tp;
908 xfs_trans_t *ntp;
909 xfs_inode_t *ip; 908 xfs_inode_t *ip;
910 xfs_buf_t *ialloc_context = NULL; 909 xfs_buf_t *ialloc_context = NULL;
911 int code; 910 int code;
@@ -954,8 +953,6 @@ xfs_dir_ialloc(
954 * to succeed the second time. 953 * to succeed the second time.
955 */ 954 */
956 if (ialloc_context) { 955 if (ialloc_context) {
957 struct xfs_trans_res tres;
958
959 /* 956 /*
960 * Normally, xfs_trans_commit releases all the locks. 957 * Normally, xfs_trans_commit releases all the locks.
961 * We call bhold to hang on to the ialloc_context across 958 * We call bhold to hang on to the ialloc_context across
@@ -964,12 +961,6 @@ xfs_dir_ialloc(
964 * allocation group. 961 * allocation group.
965 */ 962 */
966 xfs_trans_bhold(tp, ialloc_context); 963 xfs_trans_bhold(tp, ialloc_context);
967 /*
968 * Save the log reservation so we can use
969 * them in the next transaction.
970 */
971 tres.tr_logres = xfs_trans_get_log_res(tp);
972 tres.tr_logcount = xfs_trans_get_log_count(tp);
973 964
974 /* 965 /*
975 * We want the quota changes to be associated with the next 966 * We want the quota changes to be associated with the next
@@ -985,35 +976,9 @@ xfs_dir_ialloc(
985 tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); 976 tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
986 } 977 }
987 978
988 ntp = xfs_trans_dup(tp); 979 code = xfs_trans_roll(&tp, 0);
989 code = xfs_trans_commit(tp, 0); 980 if (committed != NULL)
990 tp = ntp;
991 if (committed != NULL) {
992 *committed = 1; 981 *committed = 1;
993 }
994 /*
995 * If we get an error during the commit processing,
996 * release the buffer that is still held and return
997 * to the caller.
998 */
999 if (code) {
1000 xfs_buf_relse(ialloc_context);
1001 if (dqinfo) {
1002 tp->t_dqinfo = dqinfo;
1003 xfs_trans_free_dqinfo(tp);
1004 }
1005 *tpp = ntp;
1006 *ipp = NULL;
1007 return code;
1008 }
1009
1010 /*
1011 * transaction commit worked ok so we can drop the extra ticket
1012 * reference that we gained in xfs_trans_dup()
1013 */
1014 xfs_log_ticket_put(tp->t_ticket);
1015 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
1016 code = xfs_trans_reserve(tp, &tres, 0, 0);
1017 982
1018 /* 983 /*
1019 * Re-attach the quota info that we detached from prev trx. 984 * Re-attach the quota info that we detached from prev trx.
@@ -1025,7 +990,7 @@ xfs_dir_ialloc(
1025 990
1026 if (code) { 991 if (code) {
1027 xfs_buf_relse(ialloc_context); 992 xfs_buf_relse(ialloc_context);
1028 *tpp = ntp; 993 *tpp = tp;
1029 *ipp = NULL; 994 *ipp = NULL;
1030 return code; 995 return code;
1031 } 996 }
@@ -1127,7 +1092,6 @@ xfs_create(
1127 xfs_bmap_free_t free_list; 1092 xfs_bmap_free_t free_list;
1128 xfs_fsblock_t first_block; 1093 xfs_fsblock_t first_block;
1129 bool unlock_dp_on_error = false; 1094 bool unlock_dp_on_error = false;
1130 uint cancel_flags;
1131 int committed; 1095 int committed;
1132 prid_t prid; 1096 prid_t prid;
1133 struct xfs_dquot *udqp = NULL; 1097 struct xfs_dquot *udqp = NULL;
@@ -1164,8 +1128,6 @@ xfs_create(
1164 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); 1128 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1165 } 1129 }
1166 1130
1167 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1168
1169 /* 1131 /*
1170 * Initially assume that the file does not exist and 1132 * Initially assume that the file does not exist and
1171 * reserve the resources for that case. If that is not 1133 * reserve the resources for that case. If that is not
@@ -1183,10 +1145,9 @@ xfs_create(
1183 resblks = 0; 1145 resblks = 0;
1184 error = xfs_trans_reserve(tp, tres, 0, 0); 1146 error = xfs_trans_reserve(tp, tres, 0, 0);
1185 } 1147 }
1186 if (error) { 1148 if (error)
1187 cancel_flags = 0;
1188 goto out_trans_cancel; 1149 goto out_trans_cancel;
1189 } 1150
1190 1151
1191 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 1152 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1192 unlock_dp_on_error = true; 1153 unlock_dp_on_error = true;
@@ -1217,7 +1178,7 @@ xfs_create(
1217 if (error) { 1178 if (error) {
1218 if (error == -ENOSPC) 1179 if (error == -ENOSPC)
1219 goto out_trans_cancel; 1180 goto out_trans_cancel;
1220 goto out_trans_abort; 1181 goto out_trans_cancel;
1221 } 1182 }
1222 1183
1223 /* 1184 /*
@@ -1235,7 +1196,7 @@ xfs_create(
1235 resblks - XFS_IALLOC_SPACE_RES(mp) : 0); 1196 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1236 if (error) { 1197 if (error) {
1237 ASSERT(error != -ENOSPC); 1198 ASSERT(error != -ENOSPC);
1238 goto out_trans_abort; 1199 goto out_trans_cancel;
1239 } 1200 }
1240 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1201 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1241 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1202 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
@@ -1269,7 +1230,7 @@ xfs_create(
1269 if (error) 1230 if (error)
1270 goto out_bmap_cancel; 1231 goto out_bmap_cancel;
1271 1232
1272 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1233 error = xfs_trans_commit(tp);
1273 if (error) 1234 if (error)
1274 goto out_release_inode; 1235 goto out_release_inode;
1275 1236
@@ -1282,10 +1243,8 @@ xfs_create(
1282 1243
1283 out_bmap_cancel: 1244 out_bmap_cancel:
1284 xfs_bmap_cancel(&free_list); 1245 xfs_bmap_cancel(&free_list);
1285 out_trans_abort:
1286 cancel_flags |= XFS_TRANS_ABORT;
1287 out_trans_cancel: 1246 out_trans_cancel:
1288 xfs_trans_cancel(tp, cancel_flags); 1247 xfs_trans_cancel(tp);
1289 out_release_inode: 1248 out_release_inode:
1290 /* 1249 /*
1291 * Wait until after the current transaction is aborted to finish the 1250 * Wait until after the current transaction is aborted to finish the
@@ -1317,7 +1276,6 @@ xfs_create_tmpfile(
1317 struct xfs_inode *ip = NULL; 1276 struct xfs_inode *ip = NULL;
1318 struct xfs_trans *tp = NULL; 1277 struct xfs_trans *tp = NULL;
1319 int error; 1278 int error;
1320 uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1321 prid_t prid; 1279 prid_t prid;
1322 struct xfs_dquot *udqp = NULL; 1280 struct xfs_dquot *udqp = NULL;
1323 struct xfs_dquot *gdqp = NULL; 1281 struct xfs_dquot *gdqp = NULL;
@@ -1350,10 +1308,8 @@ xfs_create_tmpfile(
1350 resblks = 0; 1308 resblks = 0;
1351 error = xfs_trans_reserve(tp, tres, 0, 0); 1309 error = xfs_trans_reserve(tp, tres, 0, 0);
1352 } 1310 }
1353 if (error) { 1311 if (error)
1354 cancel_flags = 0;
1355 goto out_trans_cancel; 1312 goto out_trans_cancel;
1356 }
1357 1313
1358 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, 1314 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
1359 pdqp, resblks, 1, 0); 1315 pdqp, resblks, 1, 0);
@@ -1365,7 +1321,7 @@ xfs_create_tmpfile(
1365 if (error) { 1321 if (error) {
1366 if (error == -ENOSPC) 1322 if (error == -ENOSPC)
1367 goto out_trans_cancel; 1323 goto out_trans_cancel;
1368 goto out_trans_abort; 1324 goto out_trans_cancel;
1369 } 1325 }
1370 1326
1371 if (mp->m_flags & XFS_MOUNT_WSYNC) 1327 if (mp->m_flags & XFS_MOUNT_WSYNC)
@@ -1381,9 +1337,9 @@ xfs_create_tmpfile(
1381 ip->i_d.di_nlink--; 1337 ip->i_d.di_nlink--;
1382 error = xfs_iunlink(tp, ip); 1338 error = xfs_iunlink(tp, ip);
1383 if (error) 1339 if (error)
1384 goto out_trans_abort; 1340 goto out_trans_cancel;
1385 1341
1386 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1342 error = xfs_trans_commit(tp);
1387 if (error) 1343 if (error)
1388 goto out_release_inode; 1344 goto out_release_inode;
1389 1345
@@ -1394,10 +1350,8 @@ xfs_create_tmpfile(
1394 *ipp = ip; 1350 *ipp = ip;
1395 return 0; 1351 return 0;
1396 1352
1397 out_trans_abort:
1398 cancel_flags |= XFS_TRANS_ABORT;
1399 out_trans_cancel: 1353 out_trans_cancel:
1400 xfs_trans_cancel(tp, cancel_flags); 1354 xfs_trans_cancel(tp);
1401 out_release_inode: 1355 out_release_inode:
1402 /* 1356 /*
1403 * Wait until after the current transaction is aborted to finish the 1357 * Wait until after the current transaction is aborted to finish the
@@ -1427,7 +1381,6 @@ xfs_link(
1427 int error; 1381 int error;
1428 xfs_bmap_free_t free_list; 1382 xfs_bmap_free_t free_list;
1429 xfs_fsblock_t first_block; 1383 xfs_fsblock_t first_block;
1430 int cancel_flags;
1431 int committed; 1384 int committed;
1432 int resblks; 1385 int resblks;
1433 1386
@@ -1447,17 +1400,14 @@ xfs_link(
1447 goto std_return; 1400 goto std_return;
1448 1401
1449 tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); 1402 tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
1450 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1451 resblks = XFS_LINK_SPACE_RES(mp, target_name->len); 1403 resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1452 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); 1404 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
1453 if (error == -ENOSPC) { 1405 if (error == -ENOSPC) {
1454 resblks = 0; 1406 resblks = 0;
1455 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); 1407 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
1456 } 1408 }
1457 if (error) { 1409 if (error)
1458 cancel_flags = 0;
1459 goto error_return; 1410 goto error_return;
1460 }
1461 1411
1462 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); 1412 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1463 1413
@@ -1486,19 +1436,19 @@ xfs_link(
1486 if (sip->i_d.di_nlink == 0) { 1436 if (sip->i_d.di_nlink == 0) {
1487 error = xfs_iunlink_remove(tp, sip); 1437 error = xfs_iunlink_remove(tp, sip);
1488 if (error) 1438 if (error)
1489 goto abort_return; 1439 goto error_return;
1490 } 1440 }
1491 1441
1492 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, 1442 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1493 &first_block, &free_list, resblks); 1443 &first_block, &free_list, resblks);
1494 if (error) 1444 if (error)
1495 goto abort_return; 1445 goto error_return;
1496 xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1446 xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1497 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); 1447 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1498 1448
1499 error = xfs_bumplink(tp, sip); 1449 error = xfs_bumplink(tp, sip);
1500 if (error) 1450 if (error)
1501 goto abort_return; 1451 goto error_return;
1502 1452
1503 /* 1453 /*
1504 * If this is a synchronous mount, make sure that the 1454 * If this is a synchronous mount, make sure that the
@@ -1512,15 +1462,13 @@ xfs_link(
1512 error = xfs_bmap_finish (&tp, &free_list, &committed); 1462 error = xfs_bmap_finish (&tp, &free_list, &committed);
1513 if (error) { 1463 if (error) {
1514 xfs_bmap_cancel(&free_list); 1464 xfs_bmap_cancel(&free_list);
1515 goto abort_return; 1465 goto error_return;
1516 } 1466 }
1517 1467
1518 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1468 return xfs_trans_commit(tp);
1519 1469
1520 abort_return:
1521 cancel_flags |= XFS_TRANS_ABORT;
1522 error_return: 1470 error_return:
1523 xfs_trans_cancel(tp, cancel_flags); 1471 xfs_trans_cancel(tp);
1524 std_return: 1472 std_return:
1525 return error; 1473 return error;
1526} 1474}
@@ -1555,7 +1503,6 @@ xfs_itruncate_extents(
1555{ 1503{
1556 struct xfs_mount *mp = ip->i_mount; 1504 struct xfs_mount *mp = ip->i_mount;
1557 struct xfs_trans *tp = *tpp; 1505 struct xfs_trans *tp = *tpp;
1558 struct xfs_trans *ntp;
1559 xfs_bmap_free_t free_list; 1506 xfs_bmap_free_t free_list;
1560 xfs_fsblock_t first_block; 1507 xfs_fsblock_t first_block;
1561 xfs_fileoff_t first_unmap_block; 1508 xfs_fileoff_t first_unmap_block;
@@ -1613,29 +1560,7 @@ xfs_itruncate_extents(
1613 if (error) 1560 if (error)
1614 goto out_bmap_cancel; 1561 goto out_bmap_cancel;
1615 1562
1616 if (committed) { 1563 error = xfs_trans_roll(&tp, ip);
1617 /*
1618 * Mark the inode dirty so it will be logged and
1619 * moved forward in the log as part of every commit.
1620 */
1621 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1622 }
1623
1624 ntp = xfs_trans_dup(tp);
1625 error = xfs_trans_commit(tp, 0);
1626 tp = ntp;
1627
1628 xfs_trans_ijoin(tp, ip, 0);
1629
1630 if (error)
1631 goto out;
1632
1633 /*
1634 * Transaction commit worked ok so we can drop the extra ticket
1635 * reference that we gained in xfs_trans_dup()
1636 */
1637 xfs_log_ticket_put(tp->t_ticket);
1638 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
1639 if (error) 1564 if (error)
1640 goto out; 1565 goto out;
1641 } 1566 }
@@ -1756,7 +1681,7 @@ xfs_inactive_truncate(
1756 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); 1681 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
1757 if (error) { 1682 if (error) {
1758 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 1683 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1759 xfs_trans_cancel(tp, 0); 1684 xfs_trans_cancel(tp);
1760 return error; 1685 return error;
1761 } 1686 }
1762 1687
@@ -1777,7 +1702,7 @@ xfs_inactive_truncate(
1777 1702
1778 ASSERT(ip->i_d.di_nextents == 0); 1703 ASSERT(ip->i_d.di_nextents == 0);
1779 1704
1780 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1705 error = xfs_trans_commit(tp);
1781 if (error) 1706 if (error)
1782 goto error_unlock; 1707 goto error_unlock;
1783 1708
@@ -1785,7 +1710,7 @@ xfs_inactive_truncate(
1785 return 0; 1710 return 0;
1786 1711
1787error_trans_cancel: 1712error_trans_cancel:
1788 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 1713 xfs_trans_cancel(tp);
1789error_unlock: 1714error_unlock:
1790 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1715 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1791 return error; 1716 return error;
@@ -1835,7 +1760,7 @@ xfs_inactive_ifree(
1835 } else { 1760 } else {
1836 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 1761 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1837 } 1762 }
1838 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); 1763 xfs_trans_cancel(tp);
1839 return error; 1764 return error;
1840 } 1765 }
1841 1766
@@ -1855,7 +1780,7 @@ xfs_inactive_ifree(
1855 __func__, error); 1780 __func__, error);
1856 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1781 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1857 } 1782 }
1858 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 1783 xfs_trans_cancel(tp);
1859 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1784 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1860 return error; 1785 return error;
1861 } 1786 }
@@ -1874,7 +1799,7 @@ xfs_inactive_ifree(
1874 if (error) 1799 if (error)
1875 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", 1800 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1876 __func__, error); 1801 __func__, error);
1877 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1802 error = xfs_trans_commit(tp);
1878 if (error) 1803 if (error)
1879 xfs_notice(mp, "%s: xfs_trans_commit returned error %d", 1804 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1880 __func__, error); 1805 __func__, error);
@@ -1946,21 +1871,17 @@ xfs_inactive(
1946 /* 1871 /*
1947 * If there are attributes associated with the file then blow them away 1872 * If there are attributes associated with the file then blow them away
1948 * now. The code calls a routine that recursively deconstructs the 1873 * now. The code calls a routine that recursively deconstructs the
1949 * attribute fork. We need to just commit the current transaction 1874 * attribute fork. It also blows away the in-core attribute fork.
1950 * because we can't use it for xfs_attr_inactive().
1951 */ 1875 */
1952 if (ip->i_d.di_anextents > 0) { 1876 if (XFS_IFORK_Q(ip)) {
1953 ASSERT(ip->i_d.di_forkoff != 0);
1954
1955 error = xfs_attr_inactive(ip); 1877 error = xfs_attr_inactive(ip);
1956 if (error) 1878 if (error)
1957 return; 1879 return;
1958 } 1880 }
1959 1881
1960 if (ip->i_afp) 1882 ASSERT(!ip->i_afp);
1961 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1962
1963 ASSERT(ip->i_d.di_anextents == 0); 1883 ASSERT(ip->i_d.di_anextents == 0);
1884 ASSERT(ip->i_d.di_forkoff == 0);
1964 1885
1965 /* 1886 /*
1966 * Free the inode. 1887 * Free the inode.
@@ -2239,28 +2160,42 @@ xfs_iunlink_remove(
2239 */ 2160 */
2240STATIC int 2161STATIC int
2241xfs_ifree_cluster( 2162xfs_ifree_cluster(
2242 xfs_inode_t *free_ip, 2163 xfs_inode_t *free_ip,
2243 xfs_trans_t *tp, 2164 xfs_trans_t *tp,
2244 xfs_ino_t inum) 2165 struct xfs_icluster *xic)
2245{ 2166{
2246 xfs_mount_t *mp = free_ip->i_mount; 2167 xfs_mount_t *mp = free_ip->i_mount;
2247 int blks_per_cluster; 2168 int blks_per_cluster;
2248 int inodes_per_cluster; 2169 int inodes_per_cluster;
2249 int nbufs; 2170 int nbufs;
2250 int i, j; 2171 int i, j;
2172 int ioffset;
2251 xfs_daddr_t blkno; 2173 xfs_daddr_t blkno;
2252 xfs_buf_t *bp; 2174 xfs_buf_t *bp;
2253 xfs_inode_t *ip; 2175 xfs_inode_t *ip;
2254 xfs_inode_log_item_t *iip; 2176 xfs_inode_log_item_t *iip;
2255 xfs_log_item_t *lip; 2177 xfs_log_item_t *lip;
2256 struct xfs_perag *pag; 2178 struct xfs_perag *pag;
2179 xfs_ino_t inum;
2257 2180
2181 inum = xic->first_ino;
2258 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); 2182 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2259 blks_per_cluster = xfs_icluster_size_fsb(mp); 2183 blks_per_cluster = xfs_icluster_size_fsb(mp);
2260 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; 2184 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2261 nbufs = mp->m_ialloc_blks / blks_per_cluster; 2185 nbufs = mp->m_ialloc_blks / blks_per_cluster;
2262 2186
2263 for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { 2187 for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2188 /*
2189 * The allocation bitmap tells us which inodes of the chunk were
2190 * physically allocated. Skip the cluster if an inode falls into
2191 * a sparse region.
2192 */
2193 ioffset = inum - xic->first_ino;
2194 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2195 ASSERT(do_mod(ioffset, inodes_per_cluster) == 0);
2196 continue;
2197 }
2198
2264 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2199 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2265 XFS_INO_TO_AGBNO(mp, inum)); 2200 XFS_INO_TO_AGBNO(mp, inum));
2266 2201
@@ -2418,8 +2353,7 @@ xfs_ifree(
2418 xfs_bmap_free_t *flist) 2353 xfs_bmap_free_t *flist)
2419{ 2354{
2420 int error; 2355 int error;
2421 int delete; 2356 struct xfs_icluster xic = { 0 };
2422 xfs_ino_t first_ino;
2423 2357
2424 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2358 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2425 ASSERT(ip->i_d.di_nlink == 0); 2359 ASSERT(ip->i_d.di_nlink == 0);
@@ -2435,7 +2369,7 @@ xfs_ifree(
2435 if (error) 2369 if (error)
2436 return error; 2370 return error;
2437 2371
2438 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2372 error = xfs_difree(tp, ip->i_ino, flist, &xic);
2439 if (error) 2373 if (error)
2440 return error; 2374 return error;
2441 2375
@@ -2452,8 +2386,8 @@ xfs_ifree(
2452 ip->i_d.di_gen++; 2386 ip->i_d.di_gen++;
2453 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2387 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2454 2388
2455 if (delete) 2389 if (xic.deleted)
2456 error = xfs_ifree_cluster(ip, tp, first_ino); 2390 error = xfs_ifree_cluster(ip, tp, &xic);
2457 2391
2458 return error; 2392 return error;
2459} 2393}
@@ -2540,7 +2474,6 @@ xfs_remove(
2540 int error = 0; 2474 int error = 0;
2541 xfs_bmap_free_t free_list; 2475 xfs_bmap_free_t free_list;
2542 xfs_fsblock_t first_block; 2476 xfs_fsblock_t first_block;
2543 int cancel_flags;
2544 int committed; 2477 int committed;
2545 uint resblks; 2478 uint resblks;
2546 2479
@@ -2561,7 +2494,6 @@ xfs_remove(
2561 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); 2494 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2562 else 2495 else
2563 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); 2496 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2564 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2565 2497
2566 /* 2498 /*
2567 * We try to get the real space reservation first, 2499 * We try to get the real space reservation first,
@@ -2580,7 +2512,6 @@ xfs_remove(
2580 } 2512 }
2581 if (error) { 2513 if (error) {
2582 ASSERT(error != -ENOSPC); 2514 ASSERT(error != -ENOSPC);
2583 cancel_flags = 0;
2584 goto out_trans_cancel; 2515 goto out_trans_cancel;
2585 } 2516 }
2586 2517
@@ -2592,7 +2523,6 @@ xfs_remove(
2592 /* 2523 /*
2593 * If we're removing a directory perform some additional validation. 2524 * If we're removing a directory perform some additional validation.
2594 */ 2525 */
2595 cancel_flags |= XFS_TRANS_ABORT;
2596 if (is_dir) { 2526 if (is_dir) {
2597 ASSERT(ip->i_d.di_nlink >= 2); 2527 ASSERT(ip->i_d.di_nlink >= 2);
2598 if (ip->i_d.di_nlink != 2) { 2528 if (ip->i_d.di_nlink != 2) {
@@ -2648,7 +2578,7 @@ xfs_remove(
2648 if (error) 2578 if (error)
2649 goto out_bmap_cancel; 2579 goto out_bmap_cancel;
2650 2580
2651 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 2581 error = xfs_trans_commit(tp);
2652 if (error) 2582 if (error)
2653 goto std_return; 2583 goto std_return;
2654 2584
@@ -2660,7 +2590,7 @@ xfs_remove(
2660 out_bmap_cancel: 2590 out_bmap_cancel:
2661 xfs_bmap_cancel(&free_list); 2591 xfs_bmap_cancel(&free_list);
2662 out_trans_cancel: 2592 out_trans_cancel:
2663 xfs_trans_cancel(tp, cancel_flags); 2593 xfs_trans_cancel(tp);
2664 std_return: 2594 std_return:
2665 return error; 2595 return error;
2666} 2596}
@@ -2734,11 +2664,11 @@ xfs_finish_rename(
2734 error = xfs_bmap_finish(&tp, free_list, &committed); 2664 error = xfs_bmap_finish(&tp, free_list, &committed);
2735 if (error) { 2665 if (error) {
2736 xfs_bmap_cancel(free_list); 2666 xfs_bmap_cancel(free_list);
2737 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 2667 xfs_trans_cancel(tp);
2738 return error; 2668 return error;
2739 } 2669 }
2740 2670
2741 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 2671 return xfs_trans_commit(tp);
2742} 2672}
2743 2673
2744/* 2674/*
@@ -2859,7 +2789,7 @@ xfs_cross_rename(
2859 2789
2860out_trans_abort: 2790out_trans_abort:
2861 xfs_bmap_cancel(free_list); 2791 xfs_bmap_cancel(free_list);
2862 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 2792 xfs_trans_cancel(tp);
2863 return error; 2793 return error;
2864} 2794}
2865 2795
@@ -2883,7 +2813,13 @@ xfs_rename_alloc_whiteout(
2883 if (error) 2813 if (error)
2884 return error; 2814 return error;
2885 2815
2886 /* Satisfy xfs_bumplink that this is a real tmpfile */ 2816 /*
2817 * Prepare the tmpfile inode as if it were created through the VFS.
2818 * Otherwise, the link increment paths will complain about nlink 0->1.
2819 * Drop the link count as done by d_tmpfile(), complete the inode setup
2820 * and flag it as linkable.
2821 */
2822 drop_nlink(VFS_I(tmpfile));
2887 xfs_finish_inode_setup(tmpfile); 2823 xfs_finish_inode_setup(tmpfile);
2888 VFS_I(tmpfile)->i_state |= I_LINKABLE; 2824 VFS_I(tmpfile)->i_state |= I_LINKABLE;
2889 2825
@@ -2913,7 +2849,6 @@ xfs_rename(
2913 int num_inodes = __XFS_SORT_INODES; 2849 int num_inodes = __XFS_SORT_INODES;
2914 bool new_parent = (src_dp != target_dp); 2850 bool new_parent = (src_dp != target_dp);
2915 bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); 2851 bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
2916 int cancel_flags = 0;
2917 int spaceres; 2852 int spaceres;
2918 int error; 2853 int error;
2919 2854
@@ -2949,7 +2884,6 @@ xfs_rename(
2949 } 2884 }
2950 if (error) 2885 if (error)
2951 goto out_trans_cancel; 2886 goto out_trans_cancel;
2952 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2953 2887
2954 /* 2888 /*
2955 * Attach the dquots to the inodes 2889 * Attach the dquots to the inodes
@@ -3020,10 +2954,8 @@ xfs_rename(
3020 error = xfs_dir_createname(tp, target_dp, target_name, 2954 error = xfs_dir_createname(tp, target_dp, target_name,
3021 src_ip->i_ino, &first_block, 2955 src_ip->i_ino, &first_block,
3022 &free_list, spaceres); 2956 &free_list, spaceres);
3023 if (error == -ENOSPC)
3024 goto out_bmap_cancel;
3025 if (error) 2957 if (error)
3026 goto out_trans_abort; 2958 goto out_bmap_cancel;
3027 2959
3028 xfs_trans_ichgtime(tp, target_dp, 2960 xfs_trans_ichgtime(tp, target_dp,
3029 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2961 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3031,7 +2963,7 @@ xfs_rename(
3031 if (new_parent && src_is_directory) { 2963 if (new_parent && src_is_directory) {
3032 error = xfs_bumplink(tp, target_dp); 2964 error = xfs_bumplink(tp, target_dp);
3033 if (error) 2965 if (error)
3034 goto out_trans_abort; 2966 goto out_bmap_cancel;
3035 } 2967 }
3036 } else { /* target_ip != NULL */ 2968 } else { /* target_ip != NULL */
3037 /* 2969 /*
@@ -3063,7 +2995,7 @@ xfs_rename(
3063 src_ip->i_ino, 2995 src_ip->i_ino,
3064 &first_block, &free_list, spaceres); 2996 &first_block, &free_list, spaceres);
3065 if (error) 2997 if (error)
3066 goto out_trans_abort; 2998 goto out_bmap_cancel;
3067 2999
3068 xfs_trans_ichgtime(tp, target_dp, 3000 xfs_trans_ichgtime(tp, target_dp,
3069 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3001 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3074,7 +3006,7 @@ xfs_rename(
3074 */ 3006 */
3075 error = xfs_droplink(tp, target_ip); 3007 error = xfs_droplink(tp, target_ip);
3076 if (error) 3008 if (error)
3077 goto out_trans_abort; 3009 goto out_bmap_cancel;
3078 3010
3079 if (src_is_directory) { 3011 if (src_is_directory) {
3080 /* 3012 /*
@@ -3082,7 +3014,7 @@ xfs_rename(
3082 */ 3014 */
3083 error = xfs_droplink(tp, target_ip); 3015 error = xfs_droplink(tp, target_ip);
3084 if (error) 3016 if (error)
3085 goto out_trans_abort; 3017 goto out_bmap_cancel;
3086 } 3018 }
3087 } /* target_ip != NULL */ 3019 } /* target_ip != NULL */
3088 3020
@@ -3099,7 +3031,7 @@ xfs_rename(
3099 &first_block, &free_list, spaceres); 3031 &first_block, &free_list, spaceres);
3100 ASSERT(error != -EEXIST); 3032 ASSERT(error != -EEXIST);
3101 if (error) 3033 if (error)
3102 goto out_trans_abort; 3034 goto out_bmap_cancel;
3103 } 3035 }
3104 3036
3105 /* 3037 /*
@@ -3125,7 +3057,7 @@ xfs_rename(
3125 */ 3057 */
3126 error = xfs_droplink(tp, src_dp); 3058 error = xfs_droplink(tp, src_dp);
3127 if (error) 3059 if (error)
3128 goto out_trans_abort; 3060 goto out_bmap_cancel;
3129 } 3061 }
3130 3062
3131 /* 3063 /*
@@ -3140,7 +3072,7 @@ xfs_rename(
3140 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, 3072 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3141 &first_block, &free_list, spaceres); 3073 &first_block, &free_list, spaceres);
3142 if (error) 3074 if (error)
3143 goto out_trans_abort; 3075 goto out_bmap_cancel;
3144 3076
3145 /* 3077 /*
3146 * For whiteouts, we need to bump the link count on the whiteout inode. 3078 * For whiteouts, we need to bump the link count on the whiteout inode.
@@ -3151,13 +3083,13 @@ xfs_rename(
3151 * intermediate state on disk. 3083 * intermediate state on disk.
3152 */ 3084 */
3153 if (wip) { 3085 if (wip) {
3154 ASSERT(wip->i_d.di_nlink == 0); 3086 ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0);
3155 error = xfs_bumplink(tp, wip); 3087 error = xfs_bumplink(tp, wip);
3156 if (error) 3088 if (error)
3157 goto out_trans_abort; 3089 goto out_bmap_cancel;
3158 error = xfs_iunlink_remove(tp, wip); 3090 error = xfs_iunlink_remove(tp, wip);
3159 if (error) 3091 if (error)
3160 goto out_trans_abort; 3092 goto out_bmap_cancel;
3161 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); 3093 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3162 3094
3163 /* 3095 /*
@@ -3178,12 +3110,10 @@ xfs_rename(
3178 IRELE(wip); 3110 IRELE(wip);
3179 return error; 3111 return error;
3180 3112
3181out_trans_abort:
3182 cancel_flags |= XFS_TRANS_ABORT;
3183out_bmap_cancel: 3113out_bmap_cancel:
3184 xfs_bmap_cancel(&free_list); 3114 xfs_bmap_cancel(&free_list);
3185out_trans_cancel: 3115out_trans_cancel:
3186 xfs_trans_cancel(tp, cancel_flags); 3116 xfs_trans_cancel(tp);
3187 if (wip) 3117 if (wip)
3188 IRELE(wip); 3118 IRELE(wip);
3189 return error; 3119 return error;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 87f67c6b654c..ea7d85af5310 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -336,7 +336,7 @@ xfs_set_dmattrs(
336 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); 336 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
337 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); 337 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
338 if (error) { 338 if (error) {
339 xfs_trans_cancel(tp, 0); 339 xfs_trans_cancel(tp);
340 return error; 340 return error;
341 } 341 }
342 xfs_ilock(ip, XFS_ILOCK_EXCL); 342 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -346,7 +346,7 @@ xfs_set_dmattrs(
346 ip->i_d.di_dmstate = state; 346 ip->i_d.di_dmstate = state;
347 347
348 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 348 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
349 error = xfs_trans_commit(tp, 0); 349 error = xfs_trans_commit(tp);
350 350
351 return error; 351 return error;
352} 352}
@@ -1076,7 +1076,7 @@ xfs_ioctl_setattr_get_trans(
1076 return tp; 1076 return tp;
1077 1077
1078out_cancel: 1078out_cancel:
1079 xfs_trans_cancel(tp, 0); 1079 xfs_trans_cancel(tp);
1080 return ERR_PTR(error); 1080 return ERR_PTR(error);
1081} 1081}
1082 1082
@@ -1253,7 +1253,7 @@ xfs_ioctl_setattr(
1253 else 1253 else
1254 ip->i_d.di_extsize = 0; 1254 ip->i_d.di_extsize = 0;
1255 1255
1256 code = xfs_trans_commit(tp, 0); 1256 code = xfs_trans_commit(tp);
1257 1257
1258 /* 1258 /*
1259 * Release any dquot(s) the inode had kept before chown. 1259 * Release any dquot(s) the inode had kept before chown.
@@ -1265,7 +1265,7 @@ xfs_ioctl_setattr(
1265 return code; 1265 return code;
1266 1266
1267error_trans_cancel: 1267error_trans_cancel:
1268 xfs_trans_cancel(tp, 0); 1268 xfs_trans_cancel(tp);
1269error_free_dquots: 1269error_free_dquots:
1270 xfs_qm_dqrele(udqp); 1270 xfs_qm_dqrele(udqp);
1271 xfs_qm_dqrele(pdqp); 1271 xfs_qm_dqrele(pdqp);
@@ -1338,11 +1338,11 @@ xfs_ioc_setxflags(
1338 1338
1339 error = xfs_ioctl_setattr_xflags(tp, ip, &fa); 1339 error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
1340 if (error) { 1340 if (error) {
1341 xfs_trans_cancel(tp, 0); 1341 xfs_trans_cancel(tp);
1342 goto out_drop_write; 1342 goto out_drop_write;
1343 } 1343 }
1344 1344
1345 error = xfs_trans_commit(tp, 0); 1345 error = xfs_trans_commit(tp);
1346out_drop_write: 1346out_drop_write:
1347 mnt_drop_write_file(filp); 1347 mnt_drop_write_file(filp);
1348 return error; 1348 return error;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 38e633bad8c2..1f86033171c8 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -183,7 +183,7 @@ xfs_iomap_write_direct(
183 * Check for running out of space, note: need lock to return 183 * Check for running out of space, note: need lock to return
184 */ 184 */
185 if (error) { 185 if (error) {
186 xfs_trans_cancel(tp, 0); 186 xfs_trans_cancel(tp);
187 return error; 187 return error;
188 } 188 }
189 189
@@ -213,7 +213,7 @@ xfs_iomap_write_direct(
213 error = xfs_bmap_finish(&tp, &free_list, &committed); 213 error = xfs_bmap_finish(&tp, &free_list, &committed);
214 if (error) 214 if (error)
215 goto out_bmap_cancel; 215 goto out_bmap_cancel;
216 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 216 error = xfs_trans_commit(tp);
217 if (error) 217 if (error)
218 goto out_unlock; 218 goto out_unlock;
219 219
@@ -236,7 +236,7 @@ out_bmap_cancel:
236 xfs_bmap_cancel(&free_list); 236 xfs_bmap_cancel(&free_list);
237 xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); 237 xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
238out_trans_cancel: 238out_trans_cancel:
239 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 239 xfs_trans_cancel(tp);
240 goto out_unlock; 240 goto out_unlock;
241} 241}
242 242
@@ -690,7 +690,7 @@ xfs_iomap_write_allocate(
690 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, 690 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
691 nres, 0); 691 nres, 0);
692 if (error) { 692 if (error) {
693 xfs_trans_cancel(tp, 0); 693 xfs_trans_cancel(tp);
694 return error; 694 return error;
695 } 695 }
696 xfs_ilock(ip, XFS_ILOCK_EXCL); 696 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -760,7 +760,7 @@ xfs_iomap_write_allocate(
760 if (error) 760 if (error)
761 goto trans_cancel; 761 goto trans_cancel;
762 762
763 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 763 error = xfs_trans_commit(tp);
764 if (error) 764 if (error)
765 goto error0; 765 goto error0;
766 766
@@ -791,7 +791,7 @@ xfs_iomap_write_allocate(
791 791
792trans_cancel: 792trans_cancel:
793 xfs_bmap_cancel(&free_list); 793 xfs_bmap_cancel(&free_list);
794 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 794 xfs_trans_cancel(tp);
795error0: 795error0:
796 xfs_iunlock(ip, XFS_ILOCK_EXCL); 796 xfs_iunlock(ip, XFS_ILOCK_EXCL);
797 return error; 797 return error;
@@ -853,7 +853,7 @@ xfs_iomap_write_unwritten(
853 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, 853 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
854 resblks, 0); 854 resblks, 0);
855 if (error) { 855 if (error) {
856 xfs_trans_cancel(tp, 0); 856 xfs_trans_cancel(tp);
857 return error; 857 return error;
858 } 858 }
859 859
@@ -890,7 +890,7 @@ xfs_iomap_write_unwritten(
890 if (error) 890 if (error)
891 goto error_on_bmapi_transaction; 891 goto error_on_bmapi_transaction;
892 892
893 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 893 error = xfs_trans_commit(tp);
894 xfs_iunlock(ip, XFS_ILOCK_EXCL); 894 xfs_iunlock(ip, XFS_ILOCK_EXCL);
895 if (error) 895 if (error)
896 return error; 896 return error;
@@ -914,7 +914,7 @@ xfs_iomap_write_unwritten(
914 914
915error_on_bmapi_transaction: 915error_on_bmapi_transaction:
916 xfs_bmap_cancel(&free_list); 916 xfs_bmap_cancel(&free_list);
917 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); 917 xfs_trans_cancel(tp);
918 xfs_iunlock(ip, XFS_ILOCK_EXCL); 918 xfs_iunlock(ip, XFS_ILOCK_EXCL);
919 return error; 919 return error;
920} 920}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f4cd7204e236..2923419a66fa 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -702,7 +702,7 @@ xfs_setattr_nonsize(
702 702
703 if (mp->m_flags & XFS_MOUNT_WSYNC) 703 if (mp->m_flags & XFS_MOUNT_WSYNC)
704 xfs_trans_set_sync(tp); 704 xfs_trans_set_sync(tp);
705 error = xfs_trans_commit(tp, 0); 705 error = xfs_trans_commit(tp);
706 706
707 xfs_iunlock(ip, XFS_ILOCK_EXCL); 707 xfs_iunlock(ip, XFS_ILOCK_EXCL);
708 708
@@ -733,7 +733,7 @@ xfs_setattr_nonsize(
733 return 0; 733 return 0;
734 734
735out_trans_cancel: 735out_trans_cancel:
736 xfs_trans_cancel(tp, 0); 736 xfs_trans_cancel(tp);
737 xfs_iunlock(ip, XFS_ILOCK_EXCL); 737 xfs_iunlock(ip, XFS_ILOCK_EXCL);
738out_dqrele: 738out_dqrele:
739 xfs_qm_dqrele(udqp); 739 xfs_qm_dqrele(udqp);
@@ -755,7 +755,6 @@ xfs_setattr_size(
755 struct xfs_trans *tp; 755 struct xfs_trans *tp;
756 int error; 756 int error;
757 uint lock_flags = 0; 757 uint lock_flags = 0;
758 uint commit_flags = 0;
759 bool did_zeroing = false; 758 bool did_zeroing = false;
760 759
761 trace_xfs_setattr(ip); 760 trace_xfs_setattr(ip);
@@ -851,7 +850,11 @@ xfs_setattr_size(
851 * to hope that the caller sees ENOMEM and retries the truncate 850 * to hope that the caller sees ENOMEM and retries the truncate
852 * operation. 851 * operation.
853 */ 852 */
854 error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); 853 if (IS_DAX(inode))
854 error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
855 else
856 error = block_truncate_page(inode->i_mapping, newsize,
857 xfs_get_blocks);
855 if (error) 858 if (error)
856 return error; 859 return error;
857 truncate_setsize(inode, newsize); 860 truncate_setsize(inode, newsize);
@@ -861,7 +864,6 @@ xfs_setattr_size(
861 if (error) 864 if (error)
862 goto out_trans_cancel; 865 goto out_trans_cancel;
863 866
864 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
865 lock_flags |= XFS_ILOCK_EXCL; 867 lock_flags |= XFS_ILOCK_EXCL;
866 xfs_ilock(ip, XFS_ILOCK_EXCL); 868 xfs_ilock(ip, XFS_ILOCK_EXCL);
867 xfs_trans_ijoin(tp, ip, 0); 869 xfs_trans_ijoin(tp, ip, 0);
@@ -901,7 +903,7 @@ xfs_setattr_size(
901 if (newsize <= oldsize) { 903 if (newsize <= oldsize) {
902 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); 904 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
903 if (error) 905 if (error)
904 goto out_trans_abort; 906 goto out_trans_cancel;
905 907
906 /* 908 /*
907 * Truncated "down", so we're removing references to old data 909 * Truncated "down", so we're removing references to old data
@@ -928,16 +930,14 @@ xfs_setattr_size(
928 if (mp->m_flags & XFS_MOUNT_WSYNC) 930 if (mp->m_flags & XFS_MOUNT_WSYNC)
929 xfs_trans_set_sync(tp); 931 xfs_trans_set_sync(tp);
930 932
931 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 933 error = xfs_trans_commit(tp);
932out_unlock: 934out_unlock:
933 if (lock_flags) 935 if (lock_flags)
934 xfs_iunlock(ip, lock_flags); 936 xfs_iunlock(ip, lock_flags);
935 return error; 937 return error;
936 938
937out_trans_abort:
938 commit_flags |= XFS_TRANS_ABORT;
939out_trans_cancel: 939out_trans_cancel:
940 xfs_trans_cancel(tp, commit_flags); 940 xfs_trans_cancel(tp);
941 goto out_unlock; 941 goto out_unlock;
942} 942}
943 943
@@ -984,7 +984,7 @@ xfs_vn_update_time(
984 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 984 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
985 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); 985 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
986 if (error) { 986 if (error) {
987 xfs_trans_cancel(tp, 0); 987 xfs_trans_cancel(tp);
988 return error; 988 return error;
989 } 989 }
990 990
@@ -1006,7 +1006,7 @@ xfs_vn_update_time(
1006 } 1006 }
1007 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 1007 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1008 xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); 1008 xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
1009 return xfs_trans_commit(tp, 0); 1009 return xfs_trans_commit(tp);
1010} 1010}
1011 1011
1012#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 1012#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1191,22 +1191,22 @@ xfs_diflags_to_iflags(
1191 struct inode *inode, 1191 struct inode *inode,
1192 struct xfs_inode *ip) 1192 struct xfs_inode *ip)
1193{ 1193{
1194 if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) 1194 uint16_t flags = ip->i_d.di_flags;
1195
1196 inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
1197 S_NOATIME | S_DAX);
1198
1199 if (flags & XFS_DIFLAG_IMMUTABLE)
1195 inode->i_flags |= S_IMMUTABLE; 1200 inode->i_flags |= S_IMMUTABLE;
1196 else 1201 if (flags & XFS_DIFLAG_APPEND)
1197 inode->i_flags &= ~S_IMMUTABLE;
1198 if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
1199 inode->i_flags |= S_APPEND; 1202 inode->i_flags |= S_APPEND;
1200 else 1203 if (flags & XFS_DIFLAG_SYNC)
1201 inode->i_flags &= ~S_APPEND;
1202 if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
1203 inode->i_flags |= S_SYNC; 1204 inode->i_flags |= S_SYNC;
1204 else 1205 if (flags & XFS_DIFLAG_NOATIME)
1205 inode->i_flags &= ~S_SYNC;
1206 if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
1207 inode->i_flags |= S_NOATIME; 1206 inode->i_flags |= S_NOATIME;
1208 else 1207 /* XXX: Also needs an on-disk per inode flag! */
1209 inode->i_flags &= ~S_NOATIME; 1208 if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
1209 inode->i_flags |= S_DAX;
1210} 1210}
1211 1211
1212/* 1212/*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 80429891dc9b..f41b0c3fddab 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk(
252 } 252 }
253 253
254 irec->ir_free |= xfs_inobt_maskn(0, idx); 254 irec->ir_free |= xfs_inobt_maskn(0, idx);
255 *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; 255 *icount = irec->ir_count - irec->ir_freecount;
256 } 256 }
257 257
258 return 0; 258 return 0;
@@ -415,6 +415,8 @@ xfs_bulkstat(
415 goto del_cursor; 415 goto del_cursor;
416 if (icount) { 416 if (icount) {
417 irbp->ir_startino = r.ir_startino; 417 irbp->ir_startino = r.ir_startino;
418 irbp->ir_holemask = r.ir_holemask;
419 irbp->ir_count = r.ir_count;
418 irbp->ir_freecount = r.ir_freecount; 420 irbp->ir_freecount = r.ir_freecount;
419 irbp->ir_free = r.ir_free; 421 irbp->ir_free = r.ir_free;
420 irbp++; 422 irbp++;
@@ -447,13 +449,15 @@ xfs_bulkstat(
447 * If this chunk has any allocated inodes, save it. 449 * If this chunk has any allocated inodes, save it.
448 * Also start read-ahead now for this chunk. 450 * Also start read-ahead now for this chunk.
449 */ 451 */
450 if (r.ir_freecount < XFS_INODES_PER_CHUNK) { 452 if (r.ir_freecount < r.ir_count) {
451 xfs_bulkstat_ichunk_ra(mp, agno, &r); 453 xfs_bulkstat_ichunk_ra(mp, agno, &r);
452 irbp->ir_startino = r.ir_startino; 454 irbp->ir_startino = r.ir_startino;
455 irbp->ir_holemask = r.ir_holemask;
456 irbp->ir_count = r.ir_count;
453 irbp->ir_freecount = r.ir_freecount; 457 irbp->ir_freecount = r.ir_freecount;
454 irbp->ir_free = r.ir_free; 458 irbp->ir_free = r.ir_free;
455 irbp++; 459 irbp++;
456 icount += XFS_INODES_PER_CHUNK - r.ir_freecount; 460 icount += r.ir_count - r.ir_freecount;
457 } 461 }
458 error = xfs_btree_increment(cur, 0, &stat); 462 error = xfs_btree_increment(cur, 0, &stat);
459 if (error || stat == 0) { 463 if (error || stat == 0) {
@@ -599,8 +603,7 @@ xfs_inumbers(
599 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; 603 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
600 buffer[bufidx].xi_startino = 604 buffer[bufidx].xi_startino =
601 XFS_AGINO_TO_INO(mp, agno, r.ir_startino); 605 XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
602 buffer[bufidx].xi_alloccount = 606 buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
603 XFS_INODES_PER_CHUNK - r.ir_freecount;
604 buffer[bufidx].xi_allocmask = ~r.ir_free; 607 buffer[bufidx].xi_allocmask = ~r.ir_free;
605 if (++bufidx == bcount) { 608 if (++bufidx == bcount) {
606 long written; 609 long written;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bcc7cfabb787..c8d09ef81c4f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -513,7 +513,7 @@ xfs_log_done(
513 struct xfs_mount *mp, 513 struct xfs_mount *mp,
514 struct xlog_ticket *ticket, 514 struct xlog_ticket *ticket,
515 struct xlog_in_core **iclog, 515 struct xlog_in_core **iclog,
516 uint flags) 516 bool regrant)
517{ 517{
518 struct xlog *log = mp->m_log; 518 struct xlog *log = mp->m_log;
519 xfs_lsn_t lsn = 0; 519 xfs_lsn_t lsn = 0;
@@ -526,14 +526,11 @@ xfs_log_done(
526 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 526 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
527 (xlog_commit_record(log, ticket, iclog, &lsn)))) { 527 (xlog_commit_record(log, ticket, iclog, &lsn)))) {
528 lsn = (xfs_lsn_t) -1; 528 lsn = (xfs_lsn_t) -1;
529 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 529 regrant = false;
530 flags |= XFS_LOG_REL_PERM_RESERV;
531 }
532 } 530 }
533 531
534 532
535 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || 533 if (!regrant) {
536 (flags & XFS_LOG_REL_PERM_RESERV)) {
537 trace_xfs_log_done_nonperm(log, ticket); 534 trace_xfs_log_done_nonperm(log, ticket);
538 535
539 /* 536 /*
@@ -541,7 +538,6 @@ xfs_log_done(
541 * request has been made to release a permanent reservation. 538 * request has been made to release a permanent reservation.
542 */ 539 */
543 xlog_ungrant_log_space(log, ticket); 540 xlog_ungrant_log_space(log, ticket);
544 xfs_log_ticket_put(ticket);
545 } else { 541 } else {
546 trace_xfs_log_done_perm(log, ticket); 542 trace_xfs_log_done_perm(log, ticket);
547 543
@@ -553,6 +549,7 @@ xfs_log_done(
553 ticket->t_flags |= XLOG_TIC_INITED; 549 ticket->t_flags |= XLOG_TIC_INITED;
554 } 550 }
555 551
552 xfs_log_ticket_put(ticket);
556 return lsn; 553 return lsn;
557} 554}
558 555
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 84e0deb95abd..fa27aaec72cb 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -111,15 +111,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
111#define XFS_LSN_CMP(x,y) _lsn_cmp(x,y) 111#define XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
112 112
113/* 113/*
114 * Macros, structures, prototypes for interface to the log manager.
115 */
116
117/*
118 * Flags to xfs_log_done()
119 */
120#define XFS_LOG_REL_PERM_RESERV 0x1
121
122/*
123 * Flags to xfs_log_force() 114 * Flags to xfs_log_force()
124 * 115 *
125 * XFS_LOG_SYNC: Synchronous force in-core log to disk 116 * XFS_LOG_SYNC: Synchronous force in-core log to disk
@@ -138,7 +129,7 @@ struct xfs_log_callback;
138xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 129xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
139 struct xlog_ticket *ticket, 130 struct xlog_ticket *ticket,
140 struct xlog_in_core **iclog, 131 struct xlog_in_core **iclog,
141 uint flags); 132 bool regrant);
142int _xfs_log_force(struct xfs_mount *mp, 133int _xfs_log_force(struct xfs_mount *mp,
143 uint flags, 134 uint flags,
144 int *log_forced); 135 int *log_forced);
@@ -183,7 +174,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
183void xfs_log_ticket_put(struct xlog_ticket *ticket); 174void xfs_log_ticket_put(struct xlog_ticket *ticket);
184 175
185void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 176void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
186 xfs_lsn_t *commit_lsn, int flags); 177 xfs_lsn_t *commit_lsn, bool regrant);
187bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 178bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
188 179
189void xfs_log_work_queue(struct xfs_mount *mp); 180void xfs_log_work_queue(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 45cc0ce18adf..abc2ccbff739 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -624,7 +624,7 @@ restart:
624 spin_unlock(&cil->xc_push_lock); 624 spin_unlock(&cil->xc_push_lock);
625 625
626 /* xfs_log_done always frees the ticket on error. */ 626 /* xfs_log_done always frees the ticket on error. */
627 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 627 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
628 if (commit_lsn == -1) 628 if (commit_lsn == -1)
629 goto out_abort; 629 goto out_abort;
630 630
@@ -773,14 +773,10 @@ xfs_log_commit_cil(
773 struct xfs_mount *mp, 773 struct xfs_mount *mp,
774 struct xfs_trans *tp, 774 struct xfs_trans *tp,
775 xfs_lsn_t *commit_lsn, 775 xfs_lsn_t *commit_lsn,
776 int flags) 776 bool regrant)
777{ 777{
778 struct xlog *log = mp->m_log; 778 struct xlog *log = mp->m_log;
779 struct xfs_cil *cil = log->l_cilp; 779 struct xfs_cil *cil = log->l_cilp;
780 int log_flags = 0;
781
782 if (flags & XFS_TRANS_RELEASE_LOG_RES)
783 log_flags = XFS_LOG_REL_PERM_RESERV;
784 780
785 /* lock out background commit */ 781 /* lock out background commit */
786 down_read(&cil->xc_ctx_lock); 782 down_read(&cil->xc_ctx_lock);
@@ -795,7 +791,7 @@ xfs_log_commit_cil(
795 if (commit_lsn) 791 if (commit_lsn)
796 *commit_lsn = tp->t_commit_lsn; 792 *commit_lsn = tp->t_commit_lsn;
797 793
798 xfs_log_done(mp, tp->t_ticket, NULL, log_flags); 794 xfs_log_done(mp, tp->t_ticket, NULL, regrant);
799 xfs_trans_unreserve_and_mod_sb(tp); 795 xfs_trans_unreserve_and_mod_sb(tp);
800 796
801 /* 797 /*
@@ -809,7 +805,7 @@ xfs_log_commit_cil(
809 * the log items. This affects (at least) processing of stale buffers, 805 * the log items. This affects (at least) processing of stale buffers,
810 * inodes and EFIs. 806 * inodes and EFIs.
811 */ 807 */
812 xfs_trans_free_items(tp, tp->t_commit_lsn, 0); 808 xfs_trans_free_items(tp, tp->t_commit_lsn, false);
813 809
814 xlog_cil_push_background(log); 810 xlog_cil_push_background(log);
815 811
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4f5784f85a5b..299fbaff1f2c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3068,12 +3068,22 @@ xlog_recover_do_icreate_pass2(
3068 return -EINVAL; 3068 return -EINVAL;
3069 } 3069 }
3070 3070
3071 /* existing allocation is fixed value */ 3071 /*
3072 ASSERT(count == mp->m_ialloc_inos); 3072 * The inode chunk is either full or sparse and we only support
3073 ASSERT(length == mp->m_ialloc_blks); 3073 * m_ialloc_min_blks sized sparse allocations at this time.
3074 if (count != mp->m_ialloc_inos || 3074 */
3075 length != mp->m_ialloc_blks) { 3075 if (length != mp->m_ialloc_blks &&
3076 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); 3076 length != mp->m_ialloc_min_blks) {
3077 xfs_warn(log->l_mp,
3078 "%s: unsupported chunk length", __FUNCTION__);
3079 return -EINVAL;
3080 }
3081
3082 /* verify inode count is consistent with extent length */
3083 if ((count >> mp->m_sb.sb_inopblog) != length) {
3084 xfs_warn(log->l_mp,
3085 "%s: inconsistent inode count and chunk length",
3086 __FUNCTION__);
3077 return -EINVAL; 3087 return -EINVAL;
3078 } 3088 }
3079 3089
@@ -3091,8 +3101,8 @@ xlog_recover_do_icreate_pass2(
3091 XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) 3101 XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
3092 return 0; 3102 return 0;
3093 3103
3094 xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, 3104 xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length,
3095 be32_to_cpu(icl->icl_gen)); 3105 be32_to_cpu(icl->icl_gen));
3096 return 0; 3106 return 0;
3097} 3107}
3098 3108
@@ -3751,11 +3761,11 @@ xlog_recover_process_efi(
3751 } 3761 }
3752 3762
3753 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); 3763 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3754 error = xfs_trans_commit(tp, 0); 3764 error = xfs_trans_commit(tp);
3755 return error; 3765 return error;
3756 3766
3757abort_error: 3767abort_error:
3758 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3768 xfs_trans_cancel(tp);
3759 return error; 3769 return error;
3760} 3770}
3761 3771
@@ -3857,13 +3867,13 @@ xlog_recover_clear_agi_bucket(
3857 xfs_trans_log_buf(tp, agibp, offset, 3867 xfs_trans_log_buf(tp, agibp, offset,
3858 (offset + sizeof(xfs_agino_t) - 1)); 3868 (offset + sizeof(xfs_agino_t) - 1));
3859 3869
3860 error = xfs_trans_commit(tp, 0); 3870 error = xfs_trans_commit(tp);
3861 if (error) 3871 if (error)
3862 goto out_error; 3872 goto out_error;
3863 return; 3873 return;
3864 3874
3865out_abort: 3875out_abort:
3866 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3876 xfs_trans_cancel(tp);
3867out_error: 3877out_error:
3868 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); 3878 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3869 return; 3879 return;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2ce7ee3b4ec1..461e791efad7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -725,6 +725,22 @@ xfs_mountfs(
725 } 725 }
726 726
727 /* 727 /*
728 * If enabled, sparse inode chunk alignment is expected to match the
729 * cluster size. Full inode chunk alignment must match the chunk size,
730 * but that is checked on sb read verification...
731 */
732 if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
733 mp->m_sb.sb_spino_align !=
734 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
735 xfs_warn(mp,
736 "Sparse inode block alignment (%u) must match cluster size (%llu).",
737 mp->m_sb.sb_spino_align,
738 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
739 error = -EINVAL;
740 goto out_remove_uuid;
741 }
742
743 /*
728 * Set inode alignment fields 744 * Set inode alignment fields
729 */ 745 */
730 xfs_set_inoalignment(mp); 746 xfs_set_inoalignment(mp);
@@ -1084,14 +1100,18 @@ xfs_log_sbcount(xfs_mount_t *mp)
1084 return xfs_sync_sb(mp, true); 1100 return xfs_sync_sb(mp, true);
1085} 1101}
1086 1102
1103/*
1104 * Deltas for the inode count are +/-64, hence we use a large batch size
1105 * of 128 so we don't need to take the counter lock on every update.
1106 */
1107#define XFS_ICOUNT_BATCH 128
1087int 1108int
1088xfs_mod_icount( 1109xfs_mod_icount(
1089 struct xfs_mount *mp, 1110 struct xfs_mount *mp,
1090 int64_t delta) 1111 int64_t delta)
1091{ 1112{
1092 /* deltas are +/-64, hence the large batch size of 128. */ 1113 __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
1093 __percpu_counter_add(&mp->m_icount, delta, 128); 1114 if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
1094 if (percpu_counter_compare(&mp->m_icount, 0) < 0) {
1095 ASSERT(0); 1115 ASSERT(0);
1096 percpu_counter_add(&mp->m_icount, -delta); 1116 percpu_counter_add(&mp->m_icount, -delta);
1097 return -EINVAL; 1117 return -EINVAL;
@@ -1113,6 +1133,14 @@ xfs_mod_ifree(
1113 return 0; 1133 return 0;
1114} 1134}
1115 1135
1136/*
1137 * Deltas for the block count can vary from 1 to very large, but lock contention
1138 * only occurs on frequent small block count updates such as in the delayed
1139 * allocation path for buffered writes (page-at-a-time updates). Hence we set
1140 * a large batch count (1024) to minimise global counter updates except when
1141 * we get near to ENOSPC and we have to be very accurate with our updates.
1142 */
1143#define XFS_FDBLOCKS_BATCH 1024
1116int 1144int
1117xfs_mod_fdblocks( 1145xfs_mod_fdblocks(
1118 struct xfs_mount *mp, 1146 struct xfs_mount *mp,
@@ -1151,25 +1179,19 @@ xfs_mod_fdblocks(
1151 * Taking blocks away, need to be more accurate the closer we 1179 * Taking blocks away, need to be more accurate the closer we
1152 * are to zero. 1180 * are to zero.
1153 * 1181 *
1154 * batch size is set to a maximum of 1024 blocks - if we are
1155 * allocating of freeing extents larger than this then we aren't
1156 * going to be hammering the counter lock so a lock per update
1157 * is not a problem.
1158 *
1159 * If the counter has a value of less than 2 * max batch size, 1182 * If the counter has a value of less than 2 * max batch size,
1160 * then make everything serialise as we are real close to 1183 * then make everything serialise as we are real close to
1161 * ENOSPC. 1184 * ENOSPC.
1162 */ 1185 */
1163#define __BATCH 1024 1186 if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
1164 if (percpu_counter_compare(&mp->m_fdblocks, 2 * __BATCH) < 0) 1187 XFS_FDBLOCKS_BATCH) < 0)
1165 batch = 1; 1188 batch = 1;
1166 else 1189 else
1167 batch = __BATCH; 1190 batch = XFS_FDBLOCKS_BATCH;
1168#undef __BATCH
1169 1191
1170 __percpu_counter_add(&mp->m_fdblocks, delta, batch); 1192 __percpu_counter_add(&mp->m_fdblocks, delta, batch);
1171 if (percpu_counter_compare(&mp->m_fdblocks, 1193 if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp),
1172 XFS_ALLOC_SET_ASIDE(mp)) >= 0) { 1194 XFS_FDBLOCKS_BATCH) >= 0) {
1173 /* we had space! */ 1195 /* we had space! */
1174 return 0; 1196 return 0;
1175 } 1197 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 8c995a2ccb6f..7999e91cd49a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -101,6 +101,8 @@ typedef struct xfs_mount {
101 __uint64_t m_flags; /* global mount flags */ 101 __uint64_t m_flags; /* global mount flags */
102 int m_ialloc_inos; /* inodes in inode allocation */ 102 int m_ialloc_inos; /* inodes in inode allocation */
103 int m_ialloc_blks; /* blocks in inode allocation */ 103 int m_ialloc_blks; /* blocks in inode allocation */
104 int m_ialloc_min_blks;/* min blocks in sparse inode
105 * allocation */
104 int m_inoalign_mask;/* mask sb_inoalignmt if used */ 106 int m_inoalign_mask;/* mask sb_inoalignmt if used */
105 uint m_qflags; /* quota status flags */ 107 uint m_qflags; /* quota status flags */
106 struct xfs_trans_resv m_resv; /* precomputed res values */ 108 struct xfs_trans_resv m_resv; /* precomputed res values */
@@ -179,6 +181,8 @@ typedef struct xfs_mount {
179 allocator */ 181 allocator */
180#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ 182#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
181 183
184#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */
185
182 186
183/* 187/*
184 * Default minimum read and write sizes. 188 * Default minimum read and write sizes.
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 981a657eca39..ab4a6066f7ca 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -306,7 +306,7 @@ xfs_fs_commit_blocks(
306 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); 306 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
307 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); 307 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
308 if (error) { 308 if (error) {
309 xfs_trans_cancel(tp, 0); 309 xfs_trans_cancel(tp);
310 goto out_drop_iolock; 310 goto out_drop_iolock;
311 } 311 }
312 312
@@ -321,7 +321,7 @@ xfs_fs_commit_blocks(
321 } 321 }
322 322
323 xfs_trans_set_sync(tp); 323 xfs_trans_set_sync(tp);
324 error = xfs_trans_commit(tp, 0); 324 error = xfs_trans_commit(tp);
325 325
326out_drop_iolock: 326out_drop_iolock:
327 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 327 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5538468c7f63..eac9549efd52 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -756,7 +756,7 @@ xfs_qm_qino_alloc(
756 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, 756 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
757 XFS_QM_QINOCREATE_SPACE_RES(mp), 0); 757 XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
758 if (error) { 758 if (error) {
759 xfs_trans_cancel(tp, 0); 759 xfs_trans_cancel(tp);
760 return error; 760 return error;
761 } 761 }
762 762
@@ -764,8 +764,7 @@ xfs_qm_qino_alloc(
764 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, 764 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
765 &committed); 765 &committed);
766 if (error) { 766 if (error) {
767 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 767 xfs_trans_cancel(tp);
768 XFS_TRANS_ABORT);
769 return error; 768 return error;
770 } 769 }
771 } 770 }
@@ -796,7 +795,7 @@ xfs_qm_qino_alloc(
796 spin_unlock(&mp->m_sb_lock); 795 spin_unlock(&mp->m_sb_lock);
797 xfs_log_sb(tp); 796 xfs_log_sb(tp);
798 797
799 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 798 error = xfs_trans_commit(tp);
800 if (error) { 799 if (error) {
801 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 800 ASSERT(XFS_FORCED_SHUTDOWN(mp));
802 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 801 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 9a25c9275fb3..3640c6e896af 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -239,7 +239,7 @@ xfs_qm_scall_trunc_qfile(
239 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); 239 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
240 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); 240 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
241 if (error) { 241 if (error) {
242 xfs_trans_cancel(tp, 0); 242 xfs_trans_cancel(tp);
243 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 243 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
244 goto out_put; 244 goto out_put;
245 } 245 }
@@ -252,15 +252,14 @@ xfs_qm_scall_trunc_qfile(
252 252
253 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); 253 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
254 if (error) { 254 if (error) {
255 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 255 xfs_trans_cancel(tp);
256 XFS_TRANS_ABORT);
257 goto out_unlock; 256 goto out_unlock;
258 } 257 }
259 258
260 ASSERT(ip->i_d.di_nextents == 0); 259 ASSERT(ip->i_d.di_nextents == 0);
261 260
262 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 261 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
263 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 262 error = xfs_trans_commit(tp);
264 263
265out_unlock: 264out_unlock:
266 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 265 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
@@ -437,7 +436,7 @@ xfs_qm_scall_setqlim(
437 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 436 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
438 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); 437 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
439 if (error) { 438 if (error) {
440 xfs_trans_cancel(tp, 0); 439 xfs_trans_cancel(tp);
441 goto out_rele; 440 goto out_rele;
442 } 441 }
443 442
@@ -548,7 +547,7 @@ xfs_qm_scall_setqlim(
548 dqp->dq_flags |= XFS_DQ_DIRTY; 547 dqp->dq_flags |= XFS_DQ_DIRTY;
549 xfs_trans_log_dquot(tp, dqp); 548 xfs_trans_log_dquot(tp, dqp);
550 549
551 error = xfs_trans_commit(tp, 0); 550 error = xfs_trans_commit(tp);
552 551
553out_rele: 552out_rele:
554 xfs_qm_dqrele(dqp); 553 xfs_qm_dqrele(dqp);
@@ -571,7 +570,7 @@ xfs_qm_log_quotaoff_end(
571 570
572 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); 571 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
573 if (error) { 572 if (error) {
574 xfs_trans_cancel(tp, 0); 573 xfs_trans_cancel(tp);
575 return error; 574 return error;
576 } 575 }
577 576
@@ -585,8 +584,7 @@ xfs_qm_log_quotaoff_end(
585 * We don't care about quotoff's performance. 584 * We don't care about quotoff's performance.
586 */ 585 */
587 xfs_trans_set_sync(tp); 586 xfs_trans_set_sync(tp);
588 error = xfs_trans_commit(tp, 0); 587 return xfs_trans_commit(tp);
589 return error;
590} 588}
591 589
592 590
@@ -605,7 +603,7 @@ xfs_qm_log_quotaoff(
605 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); 603 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
606 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); 604 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
607 if (error) { 605 if (error) {
608 xfs_trans_cancel(tp, 0); 606 xfs_trans_cancel(tp);
609 goto out; 607 goto out;
610 } 608 }
611 609
@@ -624,7 +622,7 @@ xfs_qm_log_quotaoff(
624 * We don't care about quotaoff's performance. 622 * We don't care about quotaoff's performance.
625 */ 623 */
626 xfs_trans_set_sync(tp); 624 xfs_trans_set_sync(tp);
627 error = xfs_trans_commit(tp, 0); 625 error = xfs_trans_commit(tp);
628 if (error) 626 if (error)
629 goto out; 627 goto out;
630 628
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 5376dd406ba2..ce6506adab7b 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -55,7 +55,6 @@ struct xfs_trans;
55typedef struct xfs_dqtrx { 55typedef struct xfs_dqtrx {
56 struct xfs_dquot *qt_dquot; /* the dquot this refers to */ 56 struct xfs_dquot *qt_dquot; /* the dquot this refers to */
57 ulong qt_blk_res; /* blks reserved on a dquot */ 57 ulong qt_blk_res; /* blks reserved on a dquot */
58 ulong qt_blk_res_used; /* blks used from the reservation */
59 ulong qt_ino_res; /* inode reserved on a dquot */ 58 ulong qt_ino_res; /* inode reserved on a dquot */
60 ulong qt_ino_res_used; /* inodes used from the reservation */ 59 ulong qt_ino_res_used; /* inodes used from the reservation */
61 long qt_bcount_delta; /* dquot blk count changes */ 60 long qt_bcount_delta; /* dquot blk count changes */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index f2079b6911cc..f4e8c06eee26 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -780,7 +780,6 @@ xfs_growfs_rt_alloc(
780 * Allocate space to the file, as necessary. 780 * Allocate space to the file, as necessary.
781 */ 781 */
782 while (oblocks < nblocks) { 782 while (oblocks < nblocks) {
783 int cancelflags = 0;
784 xfs_trans_t *tp; 783 xfs_trans_t *tp;
785 784
786 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); 785 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
@@ -792,7 +791,6 @@ xfs_growfs_rt_alloc(
792 resblks, 0); 791 resblks, 0);
793 if (error) 792 if (error)
794 goto error_cancel; 793 goto error_cancel;
795 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
796 /* 794 /*
797 * Lock the inode. 795 * Lock the inode.
798 */ 796 */
@@ -804,7 +802,6 @@ xfs_growfs_rt_alloc(
804 * Allocate blocks to the bitmap file. 802 * Allocate blocks to the bitmap file.
805 */ 803 */
806 nmap = 1; 804 nmap = 1;
807 cancelflags |= XFS_TRANS_ABORT;
808 error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, 805 error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
809 XFS_BMAPI_METADATA, &firstblock, 806 XFS_BMAPI_METADATA, &firstblock,
810 resblks, &map, &nmap, &flist); 807 resblks, &map, &nmap, &flist);
@@ -818,14 +815,13 @@ xfs_growfs_rt_alloc(
818 error = xfs_bmap_finish(&tp, &flist, &committed); 815 error = xfs_bmap_finish(&tp, &flist, &committed);
819 if (error) 816 if (error)
820 goto error_cancel; 817 goto error_cancel;
821 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 818 error = xfs_trans_commit(tp);
822 if (error) 819 if (error)
823 goto error; 820 goto error;
824 /* 821 /*
825 * Now we need to clear the allocated blocks. 822 * Now we need to clear the allocated blocks.
826 * Do this one block per transaction, to keep it simple. 823 * Do this one block per transaction, to keep it simple.
827 */ 824 */
828 cancelflags = 0;
829 for (bno = map.br_startoff, fsbno = map.br_startblock; 825 for (bno = map.br_startoff, fsbno = map.br_startblock;
830 bno < map.br_startoff + map.br_blockcount; 826 bno < map.br_startoff + map.br_blockcount;
831 bno++, fsbno++) { 827 bno++, fsbno++) {
@@ -851,7 +847,7 @@ xfs_growfs_rt_alloc(
851 if (bp == NULL) { 847 if (bp == NULL) {
852 error = -EIO; 848 error = -EIO;
853error_cancel: 849error_cancel:
854 xfs_trans_cancel(tp, cancelflags); 850 xfs_trans_cancel(tp);
855 goto error; 851 goto error;
856 } 852 }
857 memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); 853 memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
@@ -859,7 +855,7 @@ error_cancel:
859 /* 855 /*
860 * Commit the transaction. 856 * Commit the transaction.
861 */ 857 */
862 error = xfs_trans_commit(tp, 0); 858 error = xfs_trans_commit(tp);
863 if (error) 859 if (error)
864 goto error; 860 goto error;
865 } 861 }
@@ -973,7 +969,6 @@ xfs_growfs_rt(
973 bmbno < nrbmblocks; 969 bmbno < nrbmblocks;
974 bmbno++) { 970 bmbno++) {
975 xfs_trans_t *tp; 971 xfs_trans_t *tp;
976 int cancelflags = 0;
977 972
978 *nmp = *mp; 973 *nmp = *mp;
979 nsbp = &nmp->m_sb; 974 nsbp = &nmp->m_sb;
@@ -1015,7 +1010,6 @@ xfs_growfs_rt(
1015 mp->m_rbmip->i_d.di_size = 1010 mp->m_rbmip->i_d.di_size =
1016 nsbp->sb_rbmblocks * nsbp->sb_blocksize; 1011 nsbp->sb_rbmblocks * nsbp->sb_blocksize;
1017 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); 1012 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
1018 cancelflags |= XFS_TRANS_ABORT;
1019 /* 1013 /*
1020 * Get the summary inode into the transaction. 1014 * Get the summary inode into the transaction.
1021 */ 1015 */
@@ -1062,7 +1056,7 @@ xfs_growfs_rt(
1062 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); 1056 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
1063 if (error) { 1057 if (error) {
1064error_cancel: 1058error_cancel:
1065 xfs_trans_cancel(tp, cancelflags); 1059 xfs_trans_cancel(tp);
1066 break; 1060 break;
1067 } 1061 }
1068 /* 1062 /*
@@ -1076,7 +1070,7 @@ error_cancel:
1076 mp->m_rsumlevels = nrsumlevels; 1070 mp->m_rsumlevels = nrsumlevels;
1077 mp->m_rsumsize = nrsumsize; 1071 mp->m_rsumsize = nrsumsize;
1078 1072
1079 error = xfs_trans_commit(tp, 0); 1073 error = xfs_trans_commit(tp);
1080 if (error) 1074 if (error)
1081 break; 1075 break;
1082 } 1076 }
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 858e1e62bbaa..1fb16562c159 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
112#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ 112#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
113#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ 113#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
114 114
115#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */
116
115/* 117/*
116 * Table driven mount option parser. 118 * Table driven mount option parser.
117 * 119 *
@@ -363,6 +365,10 @@ xfs_parseargs(
363 mp->m_flags |= XFS_MOUNT_DISCARD; 365 mp->m_flags |= XFS_MOUNT_DISCARD;
364 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { 366 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
365 mp->m_flags &= ~XFS_MOUNT_DISCARD; 367 mp->m_flags &= ~XFS_MOUNT_DISCARD;
368#ifdef CONFIG_FS_DAX
369 } else if (!strcmp(this_char, MNTOPT_DAX)) {
370 mp->m_flags |= XFS_MOUNT_DAX;
371#endif
366 } else { 372 } else {
367 xfs_warn(mp, "unknown mount option [%s].", this_char); 373 xfs_warn(mp, "unknown mount option [%s].", this_char);
368 return -EINVAL; 374 return -EINVAL;
@@ -452,8 +458,8 @@ done:
452} 458}
453 459
454struct proc_xfs_info { 460struct proc_xfs_info {
455 int flag; 461 uint64_t flag;
456 char *str; 462 char *str;
457}; 463};
458 464
459STATIC int 465STATIC int
@@ -474,6 +480,7 @@ xfs_showargs(
474 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 480 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
475 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, 481 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
476 { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, 482 { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },
483 { XFS_MOUNT_DAX, "," MNTOPT_DAX },
477 { 0, NULL } 484 { 0, NULL }
478 }; 485 };
479 static struct proc_xfs_info xfs_info_unset[] = { 486 static struct proc_xfs_info xfs_info_unset[] = {
@@ -1507,6 +1514,20 @@ xfs_fs_fill_super(
1507 if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) 1514 if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
1508 sb->s_flags |= MS_I_VERSION; 1515 sb->s_flags |= MS_I_VERSION;
1509 1516
1517 if (mp->m_flags & XFS_MOUNT_DAX) {
1518 xfs_warn(mp,
1519 "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
1520 if (sb->s_blocksize != PAGE_SIZE) {
1521 xfs_alert(mp,
1522 "Filesystem block size invalid for DAX Turning DAX off.");
1523 mp->m_flags &= ~XFS_MOUNT_DAX;
1524 } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
1525 xfs_alert(mp,
1526 "Block device does not support DAX Turning DAX off.");
1527 mp->m_flags &= ~XFS_MOUNT_DAX;
1528 }
1529 }
1530
1510 error = xfs_mountfs(mp); 1531 error = xfs_mountfs(mp);
1511 if (error) 1532 if (error)
1512 goto out_filestream_unmount; 1533 goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 3df411eadb86..2d90452062b0 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -178,7 +178,6 @@ xfs_symlink(
178 struct xfs_bmap_free free_list; 178 struct xfs_bmap_free free_list;
179 xfs_fsblock_t first_block; 179 xfs_fsblock_t first_block;
180 bool unlock_dp_on_error = false; 180 bool unlock_dp_on_error = false;
181 uint cancel_flags;
182 int committed; 181 int committed;
183 xfs_fileoff_t first_fsb; 182 xfs_fileoff_t first_fsb;
184 xfs_filblks_t fs_blocks; 183 xfs_filblks_t fs_blocks;
@@ -224,7 +223,6 @@ xfs_symlink(
224 return error; 223 return error;
225 224
226 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); 225 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
227 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
228 /* 226 /*
229 * The symlink will fit into the inode data fork? 227 * The symlink will fit into the inode data fork?
230 * There can't be any attributes so we get the whole variable part. 228 * There can't be any attributes so we get the whole variable part.
@@ -239,10 +237,8 @@ xfs_symlink(
239 resblks = 0; 237 resblks = 0;
240 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); 238 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
241 } 239 }
242 if (error) { 240 if (error)
243 cancel_flags = 0;
244 goto out_trans_cancel; 241 goto out_trans_cancel;
245 }
246 242
247 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 243 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
248 unlock_dp_on_error = true; 244 unlock_dp_on_error = true;
@@ -394,7 +390,7 @@ xfs_symlink(
394 if (error) 390 if (error)
395 goto out_bmap_cancel; 391 goto out_bmap_cancel;
396 392
397 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 393 error = xfs_trans_commit(tp);
398 if (error) 394 if (error)
399 goto out_release_inode; 395 goto out_release_inode;
400 396
@@ -407,9 +403,8 @@ xfs_symlink(
407 403
408out_bmap_cancel: 404out_bmap_cancel:
409 xfs_bmap_cancel(&free_list); 405 xfs_bmap_cancel(&free_list);
410 cancel_flags |= XFS_TRANS_ABORT;
411out_trans_cancel: 406out_trans_cancel:
412 xfs_trans_cancel(tp, cancel_flags); 407 xfs_trans_cancel(tp);
413out_release_inode: 408out_release_inode:
414 /* 409 /*
415 * Wait until after the current transaction is aborted to finish the 410 * Wait until after the current transaction is aborted to finish the
@@ -464,7 +459,7 @@ xfs_inactive_symlink_rmt(
464 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); 459 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
465 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); 460 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
466 if (error) { 461 if (error) {
467 xfs_trans_cancel(tp, 0); 462 xfs_trans_cancel(tp);
468 return error; 463 return error;
469 } 464 }
470 465
@@ -533,7 +528,7 @@ xfs_inactive_symlink_rmt(
533 /* 528 /*
534 * Commit the transaction containing extent freeing and EFDs. 529 * Commit the transaction containing extent freeing and EFDs.
535 */ 530 */
536 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 531 error = xfs_trans_commit(tp);
537 if (error) { 532 if (error) {
538 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 533 ASSERT(XFS_FORCED_SHUTDOWN(mp));
539 goto error_unlock; 534 goto error_unlock;
@@ -552,7 +547,7 @@ xfs_inactive_symlink_rmt(
552error_bmap_cancel: 547error_bmap_cancel:
553 xfs_bmap_cancel(&free_list); 548 xfs_bmap_cancel(&free_list);
554error_trans_cancel: 549error_trans_cancel:
555 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 550 xfs_trans_cancel(tp);
556error_unlock: 551error_unlock:
557 xfs_iunlock(ip, XFS_ILOCK_EXCL); 552 xfs_iunlock(ip, XFS_ILOCK_EXCL);
558 return error; 553 return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 615781bf4ee5..8d916d33d93d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -738,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
738 __entry->blocks, __entry->shift, __entry->writeio_blocks) 738 __entry->blocks, __entry->shift, __entry->writeio_blocks)
739) 739)
740 740
741TRACE_EVENT(xfs_irec_merge_pre, /* logs the inobt record (agino:holemask) alongside the new record (nagino:nholemask) */
742 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
743 uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
744 TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
745 TP_STRUCT__entry(
746 __field(dev_t, dev)
747 __field(xfs_agnumber_t, agno)
748 __field(xfs_agino_t, agino)
749 __field(uint16_t, holemask)
750 __field(xfs_agino_t, nagino)
751 __field(uint16_t, nholemask)
752 ),
753 TP_fast_assign(
754 __entry->dev = mp->m_super->s_dev;
755 __entry->agno = agno;
756 __entry->agino = agino;
757 __entry->holemask = holemask;
758 __entry->nagino = nagino;
759 __entry->nholemask = nholemask; /* record the new record's mask; previously holemask was stored here by mistake */
760 ),
761 TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
762 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
763 __entry->agino, __entry->holemask, __entry->nagino,
764 __entry->nholemask)
765)
766
767TRACE_EVENT(xfs_irec_merge_post, /* logs dev, agno and a single inobt record as agino:holemask */
768 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
769 uint16_t holemask),
770 TP_ARGS(mp, agno, agino, holemask),
771 TP_STRUCT__entry(
772 __field(dev_t, dev)
773 __field(xfs_agnumber_t, agno)
774 __field(xfs_agino_t, agino)
775 __field(uint16_t, holemask)
776 ),
777 TP_fast_assign(
778 __entry->dev = mp->m_super->s_dev;
779 __entry->agno = agno;
780 __entry->agino = agino;
781 __entry->holemask = holemask;
782 ),
783 TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
784 MINOR(__entry->dev), __entry->agno, __entry->agino,
785 __entry->holemask)
786)
787
741#define DEFINE_IREF_EVENT(name) \ 788#define DEFINE_IREF_EVENT(name) \
742DEFINE_EVENT(xfs_iref_class, name, \ 789DEFINE_EVENT(xfs_iref_class, name, \
743 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ 790 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 220ef2c906b2..0582a27107d4 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -113,7 +113,7 @@ xfs_trans_free(
113 * blocks. Locks and log items, however, are not inherited. They must 113 * blocks. Locks and log items, however, are not inherited. They must
114 * be added to the new transaction explicitly. 114 * be added to the new transaction explicitly.
115 */ 115 */
116xfs_trans_t * 116STATIC xfs_trans_t *
117xfs_trans_dup( 117xfs_trans_dup(
118 xfs_trans_t *tp) 118 xfs_trans_t *tp)
119{ 119{
@@ -251,14 +251,7 @@ xfs_trans_reserve(
251 */ 251 */
252undo_log: 252undo_log:
253 if (resp->tr_logres > 0) { 253 if (resp->tr_logres > 0) {
254 int log_flags; 254 xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
255
256 if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
257 log_flags = XFS_LOG_REL_PERM_RESERV;
258 } else {
259 log_flags = 0;
260 }
261 xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
262 tp->t_ticket = NULL; 255 tp->t_ticket = NULL;
263 tp->t_log_res = 0; 256 tp->t_log_res = 0;
264 tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; 257 tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -744,7 +737,7 @@ void
744xfs_trans_free_items( 737xfs_trans_free_items(
745 struct xfs_trans *tp, 738 struct xfs_trans *tp,
746 xfs_lsn_t commit_lsn, 739 xfs_lsn_t commit_lsn,
747 int flags) 740 bool abort)
748{ 741{
749 struct xfs_log_item_desc *lidp, *next; 742 struct xfs_log_item_desc *lidp, *next;
750 743
@@ -755,7 +748,7 @@ xfs_trans_free_items(
755 748
756 if (commit_lsn != NULLCOMMITLSN) 749 if (commit_lsn != NULLCOMMITLSN)
757 lip->li_ops->iop_committing(lip, commit_lsn); 750 lip->li_ops->iop_committing(lip, commit_lsn);
758 if (flags & XFS_TRANS_ABORT) 751 if (abort)
759 lip->li_flags |= XFS_LI_ABORTED; 752 lip->li_flags |= XFS_LI_ABORTED;
760 lip->li_ops->iop_unlock(lip); 753 lip->li_ops->iop_unlock(lip);
761 754
@@ -892,27 +885,17 @@ xfs_trans_committed_bulk(
892 * have already been unlocked as if the commit had succeeded. 885 * have already been unlocked as if the commit had succeeded.
893 * Do not reference the transaction structure after this call. 886 * Do not reference the transaction structure after this call.
894 */ 887 */
895int 888static int
896xfs_trans_commit( 889__xfs_trans_commit(
897 struct xfs_trans *tp, 890 struct xfs_trans *tp,
898 uint flags) 891 bool regrant)
899{ 892{
900 struct xfs_mount *mp = tp->t_mountp; 893 struct xfs_mount *mp = tp->t_mountp;
901 xfs_lsn_t commit_lsn = -1; 894 xfs_lsn_t commit_lsn = -1;
902 int error = 0; 895 int error = 0;
903 int log_flags = 0;
904 int sync = tp->t_flags & XFS_TRANS_SYNC; 896 int sync = tp->t_flags & XFS_TRANS_SYNC;
905 897
906 /* 898 /*
907 * Determine whether this commit is releasing a permanent
908 * log reservation or not.
909 */
910 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
911 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
912 log_flags = XFS_LOG_REL_PERM_RESERV;
913 }
914
915 /*
916 * If there is nothing to be logged by the transaction, 899 * If there is nothing to be logged by the transaction,
917 * then unlock all of the items associated with the 900 * then unlock all of the items associated with the
918 * transaction and free the transaction structure. 901 * transaction and free the transaction structure.
@@ -936,7 +919,7 @@ xfs_trans_commit(
936 xfs_trans_apply_sb_deltas(tp); 919 xfs_trans_apply_sb_deltas(tp);
937 xfs_trans_apply_dquot_deltas(tp); 920 xfs_trans_apply_dquot_deltas(tp);
938 921
939 xfs_log_commit_cil(mp, tp, &commit_lsn, flags); 922 xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
940 923
941 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 924 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
942 xfs_trans_free(tp); 925 xfs_trans_free(tp);
@@ -964,18 +947,25 @@ out_unreserve:
964 */ 947 */
965 xfs_trans_unreserve_and_mod_dquots(tp); 948 xfs_trans_unreserve_and_mod_dquots(tp);
966 if (tp->t_ticket) { 949 if (tp->t_ticket) {
967 commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); 950 commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
968 if (commit_lsn == -1 && !error) 951 if (commit_lsn == -1 && !error)
969 error = -EIO; 952 error = -EIO;
970 } 953 }
971 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 954 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
972 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); 955 xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
973 xfs_trans_free(tp); 956 xfs_trans_free(tp);
974 957
975 XFS_STATS_INC(xs_trans_empty); 958 XFS_STATS_INC(xs_trans_empty);
976 return error; 959 return error;
977} 960}
978 961
/*
 * Public commit entry point: forwards to __xfs_trans_commit() with
 * regrant == false and returns its result unchanged.
 */
962int
963xfs_trans_commit(
964 struct xfs_trans *tp)
965{
966 return __xfs_trans_commit(tp, false);
967}
968
979/* 969/*
980 * Unlock all of the transaction's items and free the transaction. 970 * Unlock all of the transaction's items and free the transaction.
981 * The transaction must not have modified any of its items, because 971 * The transaction must not have modified any of its items, because
@@ -986,29 +976,22 @@ out_unreserve:
986 */ 976 */
987void 977void
988xfs_trans_cancel( 978xfs_trans_cancel(
989 xfs_trans_t *tp, 979 struct xfs_trans *tp)
990 int flags)
991{ 980{
992 int log_flags; 981 struct xfs_mount *mp = tp->t_mountp;
993 xfs_mount_t *mp = tp->t_mountp; 982 bool dirty = (tp->t_flags & XFS_TRANS_DIRTY);
994 983
995 /* 984 /*
996 * See if the caller is being too lazy to figure out if
997 * the transaction really needs an abort.
998 */
999 if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
1000 flags &= ~XFS_TRANS_ABORT;
1001 /*
1002 * See if the caller is relying on us to shut down the 985 * See if the caller is relying on us to shut down the
1003 * filesystem. This happens in paths where we detect 986 * filesystem. This happens in paths where we detect
1004 * corruption and decide to give up. 987 * corruption and decide to give up.
1005 */ 988 */
1006 if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { 989 if (dirty && !XFS_FORCED_SHUTDOWN(mp)) {
1007 XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); 990 XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
1008 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 991 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1009 } 992 }
1010#ifdef DEBUG 993#ifdef DEBUG
1011 if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { 994 if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
1012 struct xfs_log_item_desc *lidp; 995 struct xfs_log_item_desc *lidp;
1013 996
1014 list_for_each_entry(lidp, &tp->t_items, lid_trans) 997 list_for_each_entry(lidp, &tp->t_items, lid_trans)
@@ -1018,27 +1001,20 @@ xfs_trans_cancel(
1018 xfs_trans_unreserve_and_mod_sb(tp); 1001 xfs_trans_unreserve_and_mod_sb(tp);
1019 xfs_trans_unreserve_and_mod_dquots(tp); 1002 xfs_trans_unreserve_and_mod_dquots(tp);
1020 1003
1021 if (tp->t_ticket) { 1004 if (tp->t_ticket)
1022 if (flags & XFS_TRANS_RELEASE_LOG_RES) { 1005 xfs_log_done(mp, tp->t_ticket, NULL, false);
1023 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1024 log_flags = XFS_LOG_REL_PERM_RESERV;
1025 } else {
1026 log_flags = 0;
1027 }
1028 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
1029 }
1030 1006
1031 /* mark this thread as no longer being in a transaction */ 1007 /* mark this thread as no longer being in a transaction */
1032 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1008 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1033 1009
1034 xfs_trans_free_items(tp, NULLCOMMITLSN, flags); 1010 xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
1035 xfs_trans_free(tp); 1011 xfs_trans_free(tp);
1036} 1012}
1037 1013
1038/* 1014/*
1039 * Roll from one trans in the sequence of PERMANENT transactions to 1015 * Roll from one trans in the sequence of PERMANENT transactions to
1040 * the next: permanent transactions are only flushed out when 1016 * the next: permanent transactions are only flushed out when
1041 * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon 1017 * committed with xfs_trans_commit(), but we still want as soon
1042 * as possible to let chunks of it go to the log. So we commit the 1018 * as possible to let chunks of it go to the log. So we commit the
1043 * chunk we've been working on and get a new transaction to continue. 1019 * chunk we've been working on and get a new transaction to continue.
1044 */ 1020 */
@@ -1055,7 +1031,8 @@ xfs_trans_roll(
1055 * Ensure that the inode is always logged. 1031 * Ensure that the inode is always logged.
1056 */ 1032 */
1057 trans = *tpp; 1033 trans = *tpp;
1058 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); 1034 if (dp)
1035 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
1059 1036
1060 /* 1037 /*
1061 * Copy the critical parameters from one trans to the next. 1038 * Copy the critical parameters from one trans to the next.
@@ -1071,20 +1048,13 @@ xfs_trans_roll(
1071 * is in progress. The caller takes the responsibility to cancel 1048 * is in progress. The caller takes the responsibility to cancel
1072 * the duplicate transaction that gets returned. 1049 * the duplicate transaction that gets returned.
1073 */ 1050 */
1074 error = xfs_trans_commit(trans, 0); 1051 error = __xfs_trans_commit(trans, true);
1075 if (error) 1052 if (error)
1076 return error; 1053 return error;
1077 1054
1078 trans = *tpp; 1055 trans = *tpp;
1079 1056
1080 /* 1057 /*
1081 * transaction commit worked ok so we can drop the extra ticket
1082 * reference that we gained in xfs_trans_dup()
1083 */
1084 xfs_log_ticket_put(trans->t_ticket);
1085
1086
1087 /*
1088 * Reserve space in the log for th next transaction. 1058 * Reserve space in the log for th next transaction.
1089 * This also pushes items in the "AIL", the list of logged items, 1059 * This also pushes items in the "AIL", the list of logged items,
1090 * out to disk if they are taking up space at the tail of the log 1060 * out to disk if they are taking up space at the tail of the log
@@ -1100,6 +1070,7 @@ xfs_trans_roll(
1100 if (error) 1070 if (error)
1101 return error; 1071 return error;
1102 1072
1103 xfs_trans_ijoin(trans, dp, 0); 1073 if (dp)
1074 xfs_trans_ijoin(trans, dp, 0);
1104 return 0; 1075 return 0;
1105} 1076}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index b5bc1ab3c4da..3b21b4e5e467 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -133,8 +133,6 @@ typedef struct xfs_trans {
133 * XFS transaction mechanism exported interfaces that are 133 * XFS transaction mechanism exported interfaces that are
134 * actually macros. 134 * actually macros.
135 */ 135 */
136#define xfs_trans_get_log_res(tp) ((tp)->t_log_res)
137#define xfs_trans_get_log_count(tp) ((tp)->t_log_count)
138#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) 136#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
139#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) 137#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
140 138
@@ -153,7 +151,6 @@ typedef struct xfs_trans {
153 */ 151 */
154xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); 152xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
155xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); 153xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
156xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
157int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, 154int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
158 uint, uint); 155 uint, uint);
159void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); 156void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -228,9 +225,9 @@ void xfs_trans_log_efd_extent(xfs_trans_t *,
228 struct xfs_efd_log_item *, 225 struct xfs_efd_log_item *,
229 xfs_fsblock_t, 226 xfs_fsblock_t,
230 xfs_extlen_t); 227 xfs_extlen_t);
231int xfs_trans_commit(xfs_trans_t *, uint flags); 228int xfs_trans_commit(struct xfs_trans *);
232int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); 229int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
233void xfs_trans_cancel(xfs_trans_t *, int); 230void xfs_trans_cancel(xfs_trans_t *);
234int xfs_trans_ail_init(struct xfs_mount *); 231int xfs_trans_ail_init(struct xfs_mount *);
235void xfs_trans_ail_destroy(struct xfs_mount *); 232void xfs_trans_ail_destroy(struct xfs_mount *);
236 233
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 76a16df55ef7..ce78534a047e 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo(
90 xfs_trans_t *ntp) 90 xfs_trans_t *ntp)
91{ 91{
92 xfs_dqtrx_t *oq, *nq; 92 xfs_dqtrx_t *oq, *nq;
93 int i,j; 93 int i, j;
94 xfs_dqtrx_t *oqa, *nqa; 94 xfs_dqtrx_t *oqa, *nqa;
95 ulong blk_res_used;
95 96
96 if (!otp->t_dqinfo) 97 if (!otp->t_dqinfo)
97 return; 98 return;
@@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo(
102 * Because the quota blk reservation is carried forward, 103 * Because the quota blk reservation is carried forward,
103 * it is also necessary to carry forward the DQ_DIRTY flag. 104 * it is also necessary to carry forward the DQ_DIRTY flag.
104 */ 105 */
105 if(otp->t_flags & XFS_TRANS_DQ_DIRTY) 106 if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
106 ntp->t_flags |= XFS_TRANS_DQ_DIRTY; 107 ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
107 108
108 for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { 109 for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
109 oqa = otp->t_dqinfo->dqs[j]; 110 oqa = otp->t_dqinfo->dqs[j];
110 nqa = ntp->t_dqinfo->dqs[j]; 111 nqa = ntp->t_dqinfo->dqs[j];
111 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 112 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
113 blk_res_used = 0;
114
112 if (oqa[i].qt_dquot == NULL) 115 if (oqa[i].qt_dquot == NULL)
113 break; 116 break;
114 oq = &oqa[i]; 117 oq = &oqa[i];
115 nq = &nqa[i]; 118 nq = &nqa[i];
116 119
120 if (oq->qt_blk_res && oq->qt_bcount_delta > 0)
121 blk_res_used = oq->qt_bcount_delta;
122
117 nq->qt_dquot = oq->qt_dquot; 123 nq->qt_dquot = oq->qt_dquot;
118 nq->qt_bcount_delta = nq->qt_icount_delta = 0; 124 nq->qt_bcount_delta = nq->qt_icount_delta = 0;
119 nq->qt_rtbcount_delta = 0; 125 nq->qt_rtbcount_delta = 0;
@@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo(
121 /* 127 /*
122 * Transfer whatever is left of the reservations. 128 * Transfer whatever is left of the reservations.
123 */ 129 */
124 nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; 130 nq->qt_blk_res = oq->qt_blk_res - blk_res_used;
125 oq->qt_blk_res = oq->qt_blk_res_used; 131 oq->qt_blk_res = blk_res_used;
126 132
127 nq->qt_rtblk_res = oq->qt_rtblk_res - 133 nq->qt_rtblk_res = oq->qt_rtblk_res -
128 oq->qt_rtblk_res_used; 134 oq->qt_rtblk_res_used;
@@ -239,10 +245,6 @@ xfs_trans_mod_dquot(
239 * disk blocks used. 245 * disk blocks used.
240 */ 246 */
241 case XFS_TRANS_DQ_BCOUNT: 247 case XFS_TRANS_DQ_BCOUNT:
242 if (qtrx->qt_blk_res && delta > 0) {
243 qtrx->qt_blk_res_used += (ulong)delta;
244 ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
245 }
246 qtrx->qt_bcount_delta += delta; 248 qtrx->qt_bcount_delta += delta;
247 break; 249 break;
248 250
@@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas(
423 * reservation that a transaction structure knows of. 425 * reservation that a transaction structure knows of.
424 */ 426 */
425 if (qtrx->qt_blk_res != 0) { 427 if (qtrx->qt_blk_res != 0) {
426 if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { 428 ulong blk_res_used = 0;
427 if (qtrx->qt_blk_res > 429
428 qtrx->qt_blk_res_used) 430 if (qtrx->qt_bcount_delta > 0)
431 blk_res_used = qtrx->qt_bcount_delta;
432
433 if (qtrx->qt_blk_res != blk_res_used) {
434 if (qtrx->qt_blk_res > blk_res_used)
429 dqp->q_res_bcount -= (xfs_qcnt_t) 435 dqp->q_res_bcount -= (xfs_qcnt_t)
430 (qtrx->qt_blk_res - 436 (qtrx->qt_blk_res -
431 qtrx->qt_blk_res_used); 437 blk_res_used);
432 else 438 else
433 dqp->q_res_bcount -= (xfs_qcnt_t) 439 dqp->q_res_bcount -= (xfs_qcnt_t)
434 (qtrx->qt_blk_res_used - 440 (blk_res_used -
435 qtrx->qt_blk_res); 441 qtrx->qt_blk_res);
436 } 442 }
437 } else { 443 } else {
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index bd1281862ad7..1b736294558a 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -30,7 +30,7 @@ void xfs_trans_init(struct xfs_mount *);
30void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 30void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
31void xfs_trans_del_item(struct xfs_log_item *); 31void xfs_trans_del_item(struct xfs_log_item *);
32void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 32void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
33 int flags); 33 bool abort);
34void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 34void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
35 35
36void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, 36void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 35ec87e490b1..5784377e7c56 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
70 struct buffer_head *bh_result, int create); 70 struct buffer_head *bh_result, int create);
71typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, 71typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
72 ssize_t bytes, void *private); 72 ssize_t bytes, void *private);
73typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
73 74
74#define MAY_EXEC 0x00000001 75#define MAY_EXEC 0x00000001
75#define MAY_WRITE 0x00000002 76#define MAY_WRITE 0x00000002
@@ -2627,9 +2628,13 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
2627int dax_clear_blocks(struct inode *, sector_t block, long size); 2628int dax_clear_blocks(struct inode *, sector_t block, long size);
2628int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); 2629int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
2629int dax_truncate_page(struct inode *, loff_t from, get_block_t); 2630int dax_truncate_page(struct inode *, loff_t from, get_block_t);
2630int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); 2631int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
2632 dax_iodone_t);
2633int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
2634 dax_iodone_t);
2631int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); 2635int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
2632#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) 2636#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
2637#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
2633 2638
2634#ifdef CONFIG_BLOCK 2639#ifdef CONFIG_BLOCK
2635typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, 2640typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 50e50095c8d1..84a109449610 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -41,7 +41,12 @@ void percpu_counter_destroy(struct percpu_counter *fbc);
41void percpu_counter_set(struct percpu_counter *fbc, s64 amount); 41void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
42void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); 42void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
43s64 __percpu_counter_sum(struct percpu_counter *fbc); 43s64 __percpu_counter_sum(struct percpu_counter *fbc);
44int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs); 44int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
45
/*
 * Compare fbc against rhs using the default batch size: forwards to
 * __percpu_counter_compare() with percpu_counter_batch and returns its
 * result unchanged.
 */
46static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
47{
48 return __percpu_counter_compare(fbc, rhs, percpu_counter_batch);
49}
45 50
46static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) 51static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
47{ 52{
@@ -116,6 +121,12 @@ static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
116 return 0; 121 return 0;
117} 122}
118 123
/*
 * Inline variant of __percpu_counter_compare(): the batch argument is
 * ignored and the call is forwarded to percpu_counter_compare().
 * NOTE(review): presumably this is the !CONFIG_SMP side of the header,
 * where percpu_counter_compare() is itself a static inline -- confirm
 * against the surrounding #ifdef, which is outside this hunk.
 */
124static inline int
125__percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
126{
127 return percpu_counter_compare(fbc, rhs);
128}
129
119static inline void 130static inline void
120percpu_counter_add(struct percpu_counter *fbc, s64 amount) 131percpu_counter_add(struct percpu_counter *fbc, s64 amount)
121{ 132{
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 48144cdae819..f051d69f0910 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -197,13 +197,13 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
197 * Compare counter against given value. 197 * Compare counter against given value.
198 * Return 1 if greater, 0 if equal and -1 if less 198 * Return 1 if greater, 0 if equal and -1 if less
199 */ 199 */
200int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) 200int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
201{ 201{
202 s64 count; 202 s64 count;
203 203
204 count = percpu_counter_read(fbc); 204 count = percpu_counter_read(fbc);
205 /* Check to see if rough count will be sufficient for comparison */ 205 /* Check to see if rough count will be sufficient for comparison */
206 if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) { 206 if (abs(count - rhs) > (batch * num_online_cpus())) {
207 if (count > rhs) 207 if (count > rhs)
208 return 1; 208 return 1;
209 else 209 else
@@ -218,7 +218,7 @@ int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
218 else 218 else
219 return 0; 219 return 0;
220} 220}
221EXPORT_SYMBOL(percpu_counter_compare); 221EXPORT_SYMBOL(__percpu_counter_compare);
222 222
223static int __init percpu_counter_startup(void) 223static int __init percpu_counter_startup(void)
224{ 224{