-rw-r--r--  MAINTAINERS | 9
-rw-r--r--  fs/dax.c | 6
-rw-r--r--  fs/ext2/ext2.h | 2
-rw-r--r--  fs/ext2/inode.c | 4
-rw-r--r--  fs/ext4/ext4.h | 2
-rw-r--r--  fs/ext4/inode.c | 2
-rw-r--r--  fs/internal.h | 2
-rw-r--r--  fs/iomap.c | 18
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 109
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 199
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 48
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 8
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c | 51
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h | 1
-rw-r--r--  fs/xfs/xfs_aops.c | 6
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 81
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 5
-rw-r--r--  fs/xfs/xfs_buf_item.c | 1
-rw-r--r--  fs/xfs/xfs_discard.c | 29
-rw-r--r--  fs/xfs/xfs_discard.h | 1
-rw-r--r--  fs/xfs/xfs_extent_busy.c | 156
-rw-r--r--  fs/xfs/xfs_extent_busy.h | 11
-rw-r--r--  fs/xfs/xfs_file.c | 34
-rw-r--r--  fs/xfs/xfs_fsops.c | 39
-rw-r--r--  fs/xfs/xfs_icache.c | 59
-rw-r--r--  fs/xfs/xfs_icache.h | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 51
-rw-r--r--  fs/xfs/xfs_ioctl.c | 4
-rw-r--r--  fs/xfs/xfs_iomap.c | 75
-rw-r--r--  fs/xfs/xfs_iomap.h | 24
-rw-r--r--  fs/xfs/xfs_log.h | 1
-rw-r--r--  fs/xfs/xfs_log_cil.c | 84
-rw-r--r--  fs/xfs/xfs_log_priv.h | 1
-rw-r--r--  fs/xfs/xfs_mount.c | 33
-rw-r--r--  fs/xfs/xfs_mount.h | 17
-rw-r--r--  fs/xfs/xfs_reflink.c | 265
-rw-r--r--  fs/xfs/xfs_reflink.h | 6
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 24
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 3
-rw-r--r--  fs/xfs/xfs_super.c | 8
-rw-r--r--  fs/xfs/xfs_super.h | 2
-rw-r--r--  fs/xfs/xfs_sysfs.c | 14
-rw-r--r--  fs/xfs/xfs_trace.h | 13
-rw-r--r--  fs/xfs/xfs_trans.h | 1
-rw-r--r--  include/linux/dax.h | 8
-rw-r--r--  include/linux/iomap.h | 14
-rw-r--r--  include/linux/module.h | 6
-rw-r--r--  include/linux/printk.h | 21
-rw-r--r--  init/Kconfig | 16
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/kexec_core.c | 2
-rw-r--r--  kernel/module.c | 30
-rw-r--r--  kernel/panic.c | 4
-rw-r--r--  kernel/printk/Makefile | 2
-rw-r--r--  kernel/printk/internal.h | 79
-rw-r--r--  kernel/printk/printk.c | 232
-rw-r--r--  kernel/printk/printk_safe.c (renamed from kernel/printk/nmi.c) | 234
-rw-r--r--  lib/nmi_backtrace.c | 2
64 files changed, 1287 insertions, 910 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 545633d6663d..ca6f5f7a4752 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8364,6 +8364,7 @@ F:	drivers/media/dvb-frontends/mn88473*
 MODULE SUPPORT
 M:	Jessica Yu <jeyu@redhat.com>
 M:	Rusty Russell <rusty@rustcorp.com.au>
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jeyu/linux.git modules-next
 S:	Maintained
 F:	include/linux/module.h
 F:	kernel/module.c
@@ -9996,6 +9997,14 @@ S:	Supported
 F:	Documentation/preempt-locking.txt
 F:	include/linux/preempt.h
 
+PRINTK
+M:	Petr Mladek <pmladek@suse.com>
+M:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+R:	Steven Rostedt <rostedt@goodmis.org>
+S:	Maintained
+F:	kernel/printk/
+F:	include/linux/printk.h
+
 PRISM54 WIRELESS DRIVER
 M:	"Luis R. Rodriguez" <mcgrof@gmail.com>
 L:	linux-wireless@vger.kernel.org
diff --git a/fs/dax.c b/fs/dax.c
index e9cf8b4cd234..99b5b4458a78 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1079,7 +1079,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  */
 ssize_t
 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = mapping->host;
@@ -1127,7 +1127,7 @@ static int dax_fault_return(int error)
  * necessary locking for the page fault to proceed successfully.
  */
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			struct iomap_ops *ops)
+			const struct iomap_ops *ops)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
@@ -1326,7 +1326,7 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
 }
 
 int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+		pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = address & PMD_MASK;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 37e2be784ac7..5e64de9c5093 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,7 +814,7 @@ extern const struct file_operations ext2_file_operations;
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
-extern struct iomap_ops ext2_iomap_ops;
+extern const struct iomap_ops ext2_iomap_ops;
 
 /* namei.c */
 extern const struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index f073bfca694b..128cce540645 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -842,13 +842,13 @@ ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 	return 0;
 }
 
-struct iomap_ops ext2_iomap_ops = {
+const struct iomap_ops ext2_iomap_ops = {
 	.iomap_begin		= ext2_iomap_begin,
 	.iomap_end		= ext2_iomap_end,
 };
 #else
 /* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
-struct iomap_ops ext2_iomap_ops;
+const struct iomap_ops ext2_iomap_ops;
 #endif /* CONFIG_FS_DAX */
 
 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 01d52b98f9a7..cee23b684f47 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3244,7 +3244,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 	}
 }
 
-extern struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_ops;
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f622d4a577e3..75212a6e69f8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3450,7 +3450,7 @@ orphan_del:
 	return ret;
 }
 
-struct iomap_ops ext4_iomap_ops = {
+const struct iomap_ops ext4_iomap_ops = {
 	.iomap_begin		= ext4_iomap_begin,
 	.iomap_end		= ext4_iomap_end,
 };
diff --git a/fs/internal.h b/fs/internal.h
index b63cf3af2dc2..11c6d89dce9c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -182,7 +182,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
 		void *data, struct iomap *iomap);
 
 loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
-		unsigned flags, struct iomap_ops *ops, void *data,
+		unsigned flags, const struct iomap_ops *ops, void *data,
 		iomap_actor_t actor);
 
 /* direct-io.c: */
diff --git a/fs/iomap.c b/fs/iomap.c
index a51cb4c07d4d..d89f70bbb952 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -41,7 +41,7 @@
  */
 loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
-		struct iomap_ops *ops, void *data, iomap_actor_t actor)
+		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
 	struct iomap iomap = { 0 };
 	loff_t written = 0, ret;
@@ -235,7 +235,7 @@ again:
 
 ssize_t
 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
@@ -318,7 +318,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 int
 iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	loff_t ret;
 
@@ -398,7 +398,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
 
 int
 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	loff_t ret;
 
@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
 
 int
 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	unsigned blocksize = (1 << inode->i_blkbits);
 	unsigned off = pos & (blocksize - 1);
@@ -446,7 +446,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 }
 
 int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vma->vm_file);
@@ -545,7 +545,7 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 }
 
 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
-		loff_t start, loff_t len, struct iomap_ops *ops)
+		loff_t start, loff_t len, const struct iomap_ops *ops)
 {
 	struct fiemap_ctx ctx;
 	loff_t ret;
@@ -839,8 +839,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 }
 
 ssize_t
-iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
-		iomap_dio_end_io_t end_io)
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = file_inode(iocb->ki_filp);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 9f06a211e157..369adcc18c02 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -221,20 +221,22 @@ xfs_alloc_get_rec(
  * Compute aligned version of the found extent.
  * Takes alignment and min length into account.
  */
-STATIC void
+STATIC bool
 xfs_alloc_compute_aligned(
 	xfs_alloc_arg_t	*args,		/* allocation argument structure */
 	xfs_agblock_t	foundbno,	/* starting block in found extent */
 	xfs_extlen_t	foundlen,	/* length in found extent */
 	xfs_agblock_t	*resbno,	/* result block number */
-	xfs_extlen_t	*reslen)	/* result length */
+	xfs_extlen_t	*reslen,	/* result length */
+	unsigned	*busy_gen)
 {
-	xfs_agblock_t	bno;
-	xfs_extlen_t	len;
+	xfs_agblock_t	bno = foundbno;
+	xfs_extlen_t	len = foundlen;
 	xfs_extlen_t	diff;
+	bool		busy;
 
 	/* Trim busy sections out of found extent */
-	xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
 
 	/*
 	 * If we have a largish extent that happens to start before min_agbno,
@@ -259,6 +261,8 @@ xfs_alloc_compute_aligned(
 		*resbno = bno;
 		*reslen = len;
 	}
+
+	return busy;
 }
 
 /*
@@ -737,10 +741,11 @@ xfs_alloc_ag_vextent_exact(
 	int		error;
 	xfs_agblock_t	fbno;	/* start block of found extent */
 	xfs_extlen_t	flen;	/* length of found extent */
-	xfs_agblock_t	tbno;	/* start block of trimmed extent */
-	xfs_extlen_t	tlen;	/* length of trimmed extent */
-	xfs_agblock_t	tend;	/* end block of trimmed extent */
+	xfs_agblock_t	tbno;	/* start block of busy extent */
+	xfs_extlen_t	tlen;	/* length of busy extent */
+	xfs_agblock_t	tend;	/* end block of busy extent */
 	int		i;	/* success/failure of operation */
+	unsigned	busy_gen;
 
 	ASSERT(args->alignment == 1);
 
@@ -773,7 +778,9 @@ xfs_alloc_ag_vextent_exact(
 	/*
 	 * Check for overlapping busy extents.
 	 */
-	xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+	tbno = fbno;
+	tlen = flen;
+	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
 
 	/*
 	 * Give up if the start of the extent is busy, or the freespace isn't
@@ -853,6 +860,7 @@ xfs_alloc_find_best_extent(
 	xfs_agblock_t		sdiff;
 	int			error;
 	int			i;
+	unsigned		busy_gen;
 
 	/* The good extent is perfect, no need to search. */
 	if (!gdiff)
@@ -866,7 +874,8 @@ xfs_alloc_find_best_extent(
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-		xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+		xfs_alloc_compute_aligned(args, *sbno, *slen,
+				sbnoa, slena, &busy_gen);
 
 		/*
 		 * The good extent is closer than this one.
@@ -955,7 +964,8 @@ xfs_alloc_ag_vextent_near(
 	xfs_extlen_t	ltlena;		/* aligned ... */
 	xfs_agblock_t	ltnew;		/* useful start bno of left side */
 	xfs_extlen_t	rlen;		/* length of returned extent */
-	int		forced = 0;
+	bool		busy;
+	unsigned	busy_gen;
 #ifdef DEBUG
 	/*
 	 * Randomly don't execute the first algorithm.
@@ -982,6 +992,7 @@ restart:
 	ltlen = 0;
 	gtlena = 0;
 	ltlena = 0;
+	busy = false;
 
 	/*
 	 * Get a cursor for the by-size btree.
@@ -1064,8 +1075,8 @@ restart:
 		if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-		xfs_alloc_compute_aligned(args, ltbno, ltlen,
-				&ltbnoa, &ltlena);
+		busy = xfs_alloc_compute_aligned(args, ltbno, ltlen,
+				&ltbnoa, &ltlena, &busy_gen);
 		if (ltlena < args->minlen)
 			continue;
 		if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
@@ -1183,8 +1194,8 @@ restart:
 			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-			xfs_alloc_compute_aligned(args, ltbno, ltlen,
-					&ltbnoa, &ltlena);
+			busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
+					&ltbnoa, &ltlena, &busy_gen);
 			if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
 				break;
 			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1199,8 +1210,8 @@ restart:
 			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-			xfs_alloc_compute_aligned(args, gtbno, gtlen,
-					&gtbnoa, &gtlena);
+			busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
+					&gtbnoa, &gtlena, &busy_gen);
 			if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
 				break;
 			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1261,9 +1272,9 @@ restart:
 	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 
-		if (!forced++) {
+		if (busy) {
 			trace_xfs_alloc_near_busy(args);
-			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
 			goto restart;
 		}
 		trace_xfs_alloc_size_neither(args);
@@ -1344,7 +1355,8 @@ xfs_alloc_ag_vextent_size(
 	int		i;		/* temp status variable */
 	xfs_agblock_t	rbno;		/* returned block number */
 	xfs_extlen_t	rlen;		/* length of returned extent */
-	int		forced = 0;
+	bool		busy;
+	unsigned	busy_gen;
 
 restart:
 	/*
@@ -1353,6 +1365,7 @@ restart:
 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
 		args->agno, XFS_BTNUM_CNT);
 	bno_cur = NULL;
+	busy = false;
 
 	/*
 	 * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1362,14 +1375,13 @@ restart:
 		goto error0;
 
 	/*
-	 * If none or we have busy extents that we cannot allocate from, then
-	 * we have to settle for a smaller extent. In the case that there are
-	 * no large extents, this will return the last entry in the tree unless
-	 * the tree is empty. In the case that there are only busy large
-	 * extents, this will return the largest small extent unless there
+	 * If none then we have to settle for a smaller extent. In the case that
+	 * there are no large extents, this will return the last entry in the
+	 * tree unless the tree is empty. In the case that there are only busy
+	 * large extents, this will return the largest small extent unless there
 	 * are no smaller extents available.
 	 */
-	if (!i || forced > 1) {
+	if (!i) {
 		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
 				&fbno, &flen, &i);
 		if (error)
@@ -1380,13 +1392,11 @@ restart:
 			return 0;
 		}
 		ASSERT(i == 1);
-		xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
+				&rlen, &busy_gen);
 	} else {
 		/*
 		 * Search for a non-busy extent that is large enough.
-		 * If we are at low space, don't check, or if we fall of
-		 * the end of the btree, turn off the busy check and
-		 * restart.
 		 */
 		for (;;) {
 			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
@@ -1394,8 +1404,8 @@ restart:
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 
-			xfs_alloc_compute_aligned(args, fbno, flen,
-					&rbno, &rlen);
+			busy = xfs_alloc_compute_aligned(args, fbno, flen,
+					&rbno, &rlen, &busy_gen);
 
 			if (rlen >= args->maxlen)
 				break;
@@ -1407,18 +1417,13 @@ restart:
 			/*
 			 * Our only valid extents must have been busy.
 			 * Make it unbusy by forcing the log out and
-			 * retrying. If we've been here before, forcing
-			 * the log isn't making the extents available,
-			 * which means they have probably been freed in
-			 * this transaction. In that case, we have to
-			 * give up on them and we'll attempt a minlen
-			 * allocation the next time around.
+			 * retrying.
 			 */
 			xfs_btree_del_cursor(cnt_cur,
 					XFS_BTREE_NOERROR);
 			trace_xfs_alloc_size_busy(args);
-			if (!forced++)
-				xfs_log_force(args->mp, XFS_LOG_SYNC);
+			xfs_extent_busy_flush(args->mp,
+					args->pag, busy_gen);
 			goto restart;
 		}
 	}
@@ -1454,8 +1459,8 @@ restart:
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 			if (flen < bestrlen)
 				break;
-			xfs_alloc_compute_aligned(args, fbno, flen,
-					&rbno, &rlen);
+			busy = xfs_alloc_compute_aligned(args, fbno, flen,
+					&rbno, &rlen, &busy_gen);
 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
 			XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
 				(rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1484,10 +1489,10 @@ restart:
 	 */
 	args->len = rlen;
 	if (rlen < args->minlen) {
-		if (!forced++) {
+		if (busy) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 			trace_xfs_alloc_size_busy(args);
-			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
 			goto restart;
 		}
 		goto out_nominleft;
@@ -2659,21 +2664,11 @@ xfs_alloc_vextent(
 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
 		args->type = XFS_ALLOCTYPE_NEAR_BNO;
 		/* FALLTHROUGH */
-	case XFS_ALLOCTYPE_ANY_AG:
-	case XFS_ALLOCTYPE_START_AG:
 	case XFS_ALLOCTYPE_FIRST_AG:
 		/*
 		 * Rotate through the allocation groups looking for a winner.
 		 */
-		if (type == XFS_ALLOCTYPE_ANY_AG) {
-			/*
-			 * Start with the last place we left off.
-			 */
-			args->agno = sagno = (mp->m_agfrotor / rotorstep) %
-					mp->m_sb.sb_agcount;
-			args->type = XFS_ALLOCTYPE_THIS_AG;
-			flags = XFS_ALLOC_FLAG_TRYLOCK;
-		} else if (type == XFS_ALLOCTYPE_FIRST_AG) {
+		if (type == XFS_ALLOCTYPE_FIRST_AG) {
			/*
			 * Start with allocation group given by bno.
			 */
@@ -2682,8 +2677,6 @@ xfs_alloc_vextent(
 			sagno = 0;
 			flags = 0;
 		} else {
-			if (type == XFS_ALLOCTYPE_START_AG)
-				args->type = XFS_ALLOCTYPE_THIS_AG;
 			/*
 			 * Start with the given allocation group.
 			 */
@@ -2751,7 +2744,7 @@ xfs_alloc_vextent(
 			}
 			xfs_perag_put(args->pag);
 		}
-		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
+		if (bump_rotor) {
 			if (args->agno == sagno)
 				mp->m_agfrotor = (mp->m_agfrotor + 1) %
 					(mp->m_sb.sb_agcount * rotorstep);
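For illustration only (not part of the patch above): the xfs_alloc.c hunks replace the old "force the log once, then give up" dance with a pattern where the allocator remembers whether any candidate extent was trimmed by busy ranges, plus the busy generation it observed, and then flushes just that generation before restarting the search. The sketch below restates that control flow; xfs_find_candidate() is a hypothetical stand-in for the real by-size/by-bno btree walks.

/* Sketch only -- hypothetical helper modelled on the pattern above. */
static int allocate_with_busy_retry(struct xfs_alloc_arg *args)
{
	unsigned	busy_gen;
	bool		busy;

restart:
	busy = false;

	/* hypothetical: walk the free-space btrees, noting trimmed extents */
	busy |= xfs_find_candidate(args, &busy_gen);

	if (args->len >= args->minlen)
		return 0;		/* found a usable extent */

	if (busy) {
		/* wait for busy extents of that generation, then retry */
		xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
		goto restart;
	}
	return -ENOSPC;			/* nothing usable and nothing busy */
}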
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 1d0f48a501a3..2a8d0fa6fbbe 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -29,9 +29,7 @@ extern struct workqueue_struct *xfs_alloc_wq;
 /*
  * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
  */
-#define XFS_ALLOCTYPE_ANY_AG	0x01	/* allocate anywhere, use rotor */
 #define XFS_ALLOCTYPE_FIRST_AG	0x02	/* ... start at ag 0 */
-#define XFS_ALLOCTYPE_START_AG	0x04	/* anywhere, start in this a.g. */
 #define XFS_ALLOCTYPE_THIS_AG	0x08	/* anywhere in this a.g. */
 #define XFS_ALLOCTYPE_START_BNO	0x10	/* near this block else anywhere */
 #define XFS_ALLOCTYPE_NEAR_BNO	0x20	/* in this a.g. and near this block */
@@ -41,9 +39,7 @@ extern struct workqueue_struct *xfs_alloc_wq;
 typedef unsigned int xfs_alloctype_t;
 
 #define XFS_ALLOC_TYPES \
-	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" }, \
 	{ XFS_ALLOCTYPE_FIRST_AG,	"FIRST_AG" }, \
-	{ XFS_ALLOCTYPE_START_AG,	"START_AG" }, \
 	{ XFS_ALLOCTYPE_THIS_AG,	"THIS_AG" }, \
 	{ XFS_ALLOCTYPE_START_BNO,	"START_BNO" }, \
 	{ XFS_ALLOCTYPE_NEAR_BNO,	"NEAR_BNO" }, \
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index bfc00de5c6f1..a9c66d47757a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -740,15 +740,9 @@ xfs_bmap_extents_to_btree(
 	 * Fill in the root.
 	 */
 	block = ifp->if_broot;
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
-				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-	else
-		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+	xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BTNUM_BMAP, 1, 1, ip->i_ino,
 				 XFS_BTREE_LONG_PTRS);
-
 	/*
 	 * Need a cursor.  Can't allocate until bb_level is filled in.
 	 */
@@ -804,9 +798,7 @@ try_another_ag:
 	 */
 	ASSERT(args.fsbno != NULLFSBLOCK);
 	ASSERT(*firstblock == NULLFSBLOCK ||
-	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
-	       (dfops->dop_low &&
-		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+	       args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock));
 	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
 	cur->bc_private.b.allocated++;
 	ip->i_d.di_nblocks++;
@@ -817,13 +809,8 @@ try_another_ag:
 	 */
 	abp->b_ops = &xfs_bmbt_buf_ops;
 	ablock = XFS_BUF_TO_BLOCK(abp);
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
-				XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
-				XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-	else
-		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
-				XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+	xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
 				XFS_BTREE_LONG_PTRS);
 
 	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
@@ -1278,7 +1265,6 @@ xfs_bmap_read_extents(
 	/* REFERENCED */
 	xfs_extnum_t		room;	/* number of entries there's room for */
 
-	bno = NULLFSBLOCK;
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
@@ -1291,9 +1277,7 @@ xfs_bmap_read_extents(
 	ASSERT(level > 0);
 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
-	ASSERT(bno != NULLFSBLOCK);
-	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
 	/*
 	 * Go down the tree until leaf level is reached, following the first
 	 * pointer (leftmost) at each level.
@@ -1864,6 +1848,7 @@ xfs_bmap_add_extent_delay_real(
 		 */
 		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
+		xfs_bmbt_set_state(ep, new->br_state);
 		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
 		(*nextents)++;
@@ -2202,6 +2187,7 @@ STATIC int				/* error */
 xfs_bmap_add_extent_unwritten_real(
 	struct xfs_trans	*tp,
 	xfs_inode_t		*ip,	/* incore inode pointer */
+	int			whichfork,
 	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
@@ -2221,12 +2207,14 @@ xfs_bmap_add_extent_unwritten_real(
 					/* left is 0, right is 1, prev is 2 */
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
-	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_mount	*mp = ip->i_mount;
 
 	*logflagsp = 0;
 
 	cur = *curp;
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
 
 	ASSERT(*idx >= 0);
 	ASSERT(*idx <= xfs_iext_count(ifp));
@@ -2285,7 +2273,7 @@ xfs_bmap_add_extent_unwritten_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (*idx < xfs_iext_count(&ip->i_df) - 1) {
+	if (*idx < xfs_iext_count(ifp) - 1) {
 		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
 		if (isnullstartblock(RIGHT.br_startblock))
@@ -2325,7 +2313,8 @@ xfs_bmap_add_extent_unwritten_real(
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
 		xfs_iext_remove(ip, *idx + 1, 2, state);
-		ip->i_d.di_nextents -= 2;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 2);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2368,7 +2357,8 @@ xfs_bmap_add_extent_unwritten_real(
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
 		xfs_iext_remove(ip, *idx + 1, 1, state);
-		ip->i_d.di_nextents--;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2403,7 +2393,8 @@ xfs_bmap_add_extent_unwritten_real(
 		xfs_bmbt_set_state(ep, newext);
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 		xfs_iext_remove(ip, *idx + 1, 1, state);
-		ip->i_d.di_nextents--;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2515,7 +2506,8 @@ xfs_bmap_add_extent_unwritten_real(
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
 		xfs_iext_insert(ip, *idx, 1, new, state);
-		ip->i_d.di_nextents++;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2593,7 +2585,8 @@ xfs_bmap_add_extent_unwritten_real(
 		++*idx;
 		xfs_iext_insert(ip, *idx, 1, new, state);
 
-		ip->i_d.di_nextents++;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2641,7 +2634,8 @@ xfs_bmap_add_extent_unwritten_real(
 		++*idx;
 		xfs_iext_insert(ip, *idx, 2, &r[0], state);
 
-		ip->i_d.di_nextents += 2;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 2);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2695,17 +2689,17 @@ xfs_bmap_add_extent_unwritten_real(
 	}
 
 	/* update reverse mappings */
-	error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new);
+	error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new);
 	if (error)
 		goto done;
 
 	/* convert to a btree if necessary */
-	if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
+	if (xfs_bmap_needs_btree(ip, whichfork)) {
 		int	tmp_logflags;	/* partial log flag return val */
 
 		ASSERT(cur == NULL);
 		error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur,
-				0, &tmp_logflags, XFS_DATA_FORK);
+				0, &tmp_logflags, whichfork);
 		*logflagsp |= tmp_logflags;
 		if (error)
 			goto done;
@@ -2717,7 +2711,7 @@ xfs_bmap_add_extent_unwritten_real(
 		*curp = cur;
 	}
 
-	xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
+	xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
 done:
 	*logflagsp |= rval;
 	return error;
@@ -2809,7 +2803,8 @@ xfs_bmap_add_extent_hole_delay(
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock) +
 			startblockval(right.br_startblock);
-		newlen = xfs_bmap_worst_indlen(ip, temp);
+		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+					 oldlen);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
 			nullstartblock((int)newlen));
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
@@ -2830,7 +2825,8 @@ xfs_bmap_add_extent_hole_delay(
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock);
-		newlen = xfs_bmap_worst_indlen(ip, temp);
+		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+					 oldlen);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
 			nullstartblock((int)newlen));
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
@@ -2846,7 +2842,8 @@ xfs_bmap_add_extent_hole_delay(
 		temp = new->br_blockcount + right.br_blockcount;
 		oldlen = startblockval(new->br_startblock) +
 			startblockval(right.br_startblock);
-		newlen = xfs_bmap_worst_indlen(ip, temp);
+		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+					 oldlen);
 		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
 			new->br_startoff,
 			nullstartblock((int)newlen), temp, right.br_state);
@@ -2899,13 +2896,14 @@ xfs_bmap_add_extent_hole_real(
 	ASSERT(!isnullstartblock(new->br_startblock));
 	ASSERT(!bma->cur ||
 	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
-	ASSERT(whichfork != XFS_COW_FORK);
 
 	XFS_STATS_INC(mp, xs_add_exlist);
 
 	state = 0;
 	if (whichfork == XFS_ATTR_FORK)
 		state |= BMAP_ATTRFORK;
+	if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
 
 	/*
 	 * Check and set flags if this segment has a left neighbor.
@@ -3822,17 +3820,13 @@ xfs_bmap_btalloc(
 	 * the first block that was allocated.
 	 */
 	ASSERT(*ap->firstblock == NULLFSBLOCK ||
-	       XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
-	       XFS_FSB_TO_AGNO(mp, args.fsbno) ||
-	       (ap->dfops->dop_low &&
-		XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
-		XFS_FSB_TO_AGNO(mp, args.fsbno)));
+	       XFS_FSB_TO_AGNO(mp, *ap->firstblock) <=
+	       XFS_FSB_TO_AGNO(mp, args.fsbno));
 
 	ap->blkno = args.fsbno;
 	if (*ap->firstblock == NULLFSBLOCK)
 		*ap->firstblock = args.fsbno;
-	ASSERT(nullfb || fb_agno == args.agno ||
-	       (ap->dfops->dop_low && fb_agno < args.agno));
+	ASSERT(nullfb || fb_agno <= args.agno);
 	ap->length = args.len;
 	if (!(ap->flags & XFS_BMAPI_COWFORK))
 		ap->ip->i_d.di_nblocks += args.len;
@@ -4368,10 +4362,16 @@ xfs_bmapi_allocate(
 	bma->got.br_state = XFS_EXT_NORM;
 
 	/*
-	 * A wasdelay extent has been initialized, so shouldn't be flagged
-	 * as unwritten.
+	 * In the data fork, a wasdelay extent has been initialized, so
+	 * shouldn't be flagged as unwritten.
+	 *
+	 * For the cow fork, however, we convert delalloc reservations
+	 * (extents allocated for speculative preallocation) to
+	 * allocated unwritten extents, and only convert the unwritten
+	 * extents to real extents when we're about to write the data.
 	 */
-	if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
+	if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
+	    (bma->flags & XFS_BMAPI_PREALLOC) &&
 	    xfs_sb_version_hasextflgbit(&mp->m_sb))
 		bma->got.br_state = XFS_EXT_UNWRITTEN;
 
@@ -4422,8 +4422,6 @@ xfs_bmapi_convert_unwritten(
 			(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
 		return 0;
 
-	ASSERT(whichfork != XFS_COW_FORK);
-
 	/*
 	 * Modify (by adding) the state flag, if writing.
 	 */
@@ -4448,8 +4446,8 @@ xfs_bmapi_convert_unwritten(
 			return error;
 	}
 
-	error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
-			&bma->cur, mval, bma->firstblock, bma->dfops,
+	error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork,
+			&bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops,
 			&tmp_logflags);
 	/*
 	 * Log the inode core unconditionally in the unwritten extent conversion
@@ -4458,8 +4456,12 @@ xfs_bmapi_convert_unwritten(
 	 * in the transaction for the sake of fsync(), even if nothing has
 	 * changed, because fsync() will not force the log for this transaction
 	 * unless it sees the inode pinned.
+	 *
+	 * Note: If we're only converting cow fork extents, there aren't
+	 * any on-disk updates to make, so we don't need to log anything.
 	 */
-	bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
+	if (whichfork != XFS_COW_FORK)
+		bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
 	if (error)
 		return error;
 
@@ -4533,15 +4535,15 @@ xfs_bmapi_write(
 	ASSERT(*nmap >= 1);
 	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
 	ASSERT(!(flags & XFS_BMAPI_IGSTATE));
-	ASSERT(tp != NULL);
+	ASSERT(tp != NULL ||
+	       (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
+			(XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
 	ASSERT(len > 0);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
 	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
 	ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
-	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK);
-	ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK);
 
 	/* zeroing is for currently only for data extents, not metadata */
 	ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4746,13 +4748,9 @@ error0:
 	if (bma.cur) {
 		if (!error) {
 			ASSERT(*firstblock == NULLFSBLOCK ||
-			       XFS_FSB_TO_AGNO(mp, *firstblock) ==
+			       XFS_FSB_TO_AGNO(mp, *firstblock) <=
 			       XFS_FSB_TO_AGNO(mp,
-				       bma.cur->bc_private.b.firstblock) ||
-			       (dfops->dop_low &&
-				XFS_FSB_TO_AGNO(mp, *firstblock) <
-				XFS_FSB_TO_AGNO(mp,
-					bma.cur->bc_private.b.firstblock)));
+				       bma.cur->bc_private.b.firstblock));
 			*firstblock = bma.cur->bc_private.b.firstblock;
 		}
 		xfs_btree_del_cursor(bma.cur,
@@ -4787,34 +4785,59 @@ xfs_bmap_split_indlen(
 	xfs_filblks_t			len2 = *indlen2;
 	xfs_filblks_t			nres = len1 + len2; /* new total res. */
 	xfs_filblks_t			stolen = 0;
+	xfs_filblks_t			resfactor;
 
 	/*
 	 * Steal as many blocks as we can to try and satisfy the worst case
 	 * indlen for both new extents.
 	 */
-	while (nres > ores && avail) {
-		nres--;
-		avail--;
-		stolen++;
-	}
+	if (ores < nres && avail)
+		stolen = XFS_FILBLKS_MIN(nres - ores, avail);
+	ores += stolen;
+
+	/* nothing else to do if we've satisfied the new reservation */
+	if (ores >= nres)
+		return stolen;
+
+	/*
+	 * We can't meet the total required reservation for the two extents.
+	 * Calculate the percent of the overall shortage between both extents
+	 * and apply this percentage to each of the requested indlen values.
+	 * This distributes the shortage fairly and reduces the chances that one
+	 * of the two extents is left with nothing when extents are repeatedly
+	 * split.
+	 */
+	resfactor = (ores * 100);
+	do_div(resfactor, nres);
+	len1 *= resfactor;
+	do_div(len1, 100);
+	len2 *= resfactor;
+	do_div(len2, 100);
+	ASSERT(len1 + len2 <= ores);
+	ASSERT(len1 < *indlen1 && len2 < *indlen2);
 
 	/*
-	 * The only blocks available are those reserved for the original
-	 * extent and what we can steal from the extent being removed.
-	 * If this still isn't enough to satisfy the combined
-	 * requirements for the two new extents, skim blocks off of each
-	 * of the new reservations until they match what is available.
+	 * Hand out the remainder to each extent. If one of the two reservations
+	 * is zero, we want to make sure that one gets a block first. The loop
+	 * below starts with len1, so hand len2 a block right off the bat if it
+	 * is zero.
 	 */
-	while (nres > ores) {
-		if (len1) {
-			len1--;
-			nres--;
+	ores -= (len1 + len2);
+	ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores);
+	if (ores && !len2 && *indlen2) {
+		len2++;
+		ores--;
+	}
+	while (ores) {
+		if (len1 < *indlen1) {
+			len1++;
+			ores--;
 		}
-		if (nres == ores)
+		if (!ores)
 			break;
-		if (len2) {
-			len2--;
-			nres--;
+		if (len2 < *indlen2) {
+			len2++;
+			ores--;
 		}
 	}
 
@@ -5556,8 +5579,8 @@ __xfs_bunmapi(
 			}
 			del.br_state = XFS_EXT_UNWRITTEN;
 			error = xfs_bmap_add_extent_unwritten_real(tp, ip,
-					&lastx, &cur, &del, firstblock, dfops,
-					&logflags);
+					whichfork, &lastx, &cur, &del,
+					firstblock, dfops, &logflags);
 			if (error)
 				goto error0;
 			goto nodelete;
@@ -5610,8 +5633,9 @@ __xfs_bunmapi(
 				prev.br_state = XFS_EXT_UNWRITTEN;
 				lastx--;
 				error = xfs_bmap_add_extent_unwritten_real(tp,
-						ip, &lastx, &cur, &prev,
-						firstblock, dfops, &logflags);
+						ip, whichfork, &lastx, &cur,
+						&prev, firstblock, dfops,
+						&logflags);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5619,8 +5643,9 @@ __xfs_bunmapi(
 			ASSERT(del.br_state == XFS_EXT_NORM);
 			del.br_state = XFS_EXT_UNWRITTEN;
 			error = xfs_bmap_add_extent_unwritten_real(tp,
-					ip, &lastx, &cur, &del,
-					firstblock, dfops, &logflags);
+					ip, whichfork, &lastx, &cur,
+					&del, firstblock, dfops,
+					&logflags);
 			if (error)
 				goto error0;
 			goto nodelete;
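As a worked illustration of the new xfs_bmap_split_indlen() arithmetic above (not part of the patch): when the old reservation ores cannot cover both worst-case reservations, each side is first scaled to the percentage of the total that can actually be met, and the few blocks lost to integer rounding are then handed out one at a time. The helper below is a simplified, standalone sketch of that idea; the names and the exact hand-out order are illustrative, not the kernel's.

/* Simplified sketch of the proportional split; names are illustrative. */
static void split_reservation(unsigned long long *len1,
			      unsigned long long *len2,
			      unsigned long long ores)
{
	unsigned long long nres = *len1 + *len2;
	unsigned long long resfactor;

	if (ores >= nres)
		return;				/* both fit, nothing to trim */

	resfactor = (ores * 100) / nres;	/* percent that can be met */
	*len1 = (*len1 * resfactor) / 100;
	*len2 = (*len2 * resfactor) / 100;

	/* hand out blocks lost to rounding, one per side at a time */
	ores -= (*len1 + *len2);
	while (ores--) {
		if (*len1 <= *len2)
			(*len1)++;
		else
			(*len2)++;
	}
}

For example, with len1 = 8, len2 = 4 and ores = 9, resfactor is 75, which yields 6 and 3 with no rounding remainder to hand back.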
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index d9be241fc86f..f93072b58a58 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -71,15 +71,9 @@ xfs_bmdr_to_bmbt(
 	xfs_bmbt_key_t		*tkp;
 	__be64			*tpp;
 
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
-				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-	else
-		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+	xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+			 XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
 			 XFS_BTREE_LONG_PTRS);
-
 	rblock->bb_level = dblock->bb_level;
 	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
 	rblock->bb_numrecs = dblock->bb_numrecs;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 21e6a6ab6b9a..c3decedc9455 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -50,8 +50,18 @@ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
 	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
 	  XFS_REFC_CRC_MAGIC }
 };
-#define xfs_btree_magic(cur) \
-	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
+
+__uint32_t
+xfs_btree_magic(
+	int			crc,
+	xfs_btnum_t		btnum)
+{
+	__uint32_t		magic = xfs_magics[crc][btnum];
+
+	/* Ensure we asked for crc for crc-only magics. */
+	ASSERT(magic != 0);
+	return magic;
+}
 
 STATIC int				/* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lblock(
@@ -62,10 +72,13 @@ xfs_btree_check_lblock(
 {
 	int			lblock_ok = 1;	/* block passes checks */
 	struct xfs_mount	*mp;	/* file system mount point */
+	xfs_btnum_t		btnum = cur->bc_btnum;
+	int			crc;
 
 	mp = cur->bc_mp;
+	crc = xfs_sb_version_hascrc(&mp->m_sb);
 
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+	if (crc) {
 		lblock_ok = lblock_ok &&
 			uuid_equal(&block->bb_u.l.bb_uuid,
 				   &mp->m_sb.sb_meta_uuid) &&
@@ -74,7 +87,7 @@ xfs_btree_check_lblock(
 	}
 
 	lblock_ok = lblock_ok &&
-		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+		be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
 		be16_to_cpu(block->bb_level) == level &&
 		be16_to_cpu(block->bb_numrecs) <=
 			cur->bc_ops->get_maxrecs(cur, level) &&
@@ -110,13 +123,16 @@ xfs_btree_check_sblock(
 	struct xfs_agf	*agf;	/* ag. freespace structure */
 	xfs_agblock_t	agflen;	/* native ag. freespace length */
 	int		sblock_ok = 1; /* block passes checks */
+	xfs_btnum_t	btnum = cur->bc_btnum;
+	int		crc;
 
 	mp = cur->bc_mp;
+	crc = xfs_sb_version_hascrc(&mp->m_sb);
 	agbp = cur->bc_private.a.agbp;
 	agf = XFS_BUF_TO_AGF(agbp);
 	agflen = be32_to_cpu(agf->agf_length);
 
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+	if (crc) {
 		sblock_ok = sblock_ok &&
 			uuid_equal(&block->bb_u.s.bb_uuid,
 				   &mp->m_sb.sb_meta_uuid) &&
@@ -125,7 +141,7 @@ xfs_btree_check_sblock(
 	}
 
 	sblock_ok = sblock_ok &&
-		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+		be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
 		be16_to_cpu(block->bb_level) == level &&
 		be16_to_cpu(block->bb_numrecs) <=
 			cur->bc_ops->get_maxrecs(cur, level) &&
@@ -810,7 +826,8 @@ xfs_btree_read_bufl(
 	xfs_daddr_t		d;		/* real disk block address */
 	int			error;
 
-	ASSERT(fsbno != NULLFSBLOCK);
+	if (!XFS_FSB_SANITY_CHECK(mp, fsbno))
+		return -EFSCORRUPTED;
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 				   mp->m_bsize, lock, &bp, ops);
@@ -1084,12 +1101,15 @@ xfs_btree_init_block_int(
 	struct xfs_mount	*mp,
 	struct xfs_btree_block	*buf,
 	xfs_daddr_t		blkno,
-	__u32			magic,
+	xfs_btnum_t		btnum,
 	__u16			level,
 	__u16			numrecs,
 	__u64			owner,
 	unsigned int		flags)
 {
+	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
+	__u32			magic = xfs_btree_magic(crc, btnum);
+
 	buf->bb_magic = cpu_to_be32(magic);
 	buf->bb_level = cpu_to_be16(level);
 	buf->bb_numrecs = cpu_to_be16(numrecs);
@@ -1097,7 +1117,7 @@ xfs_btree_init_block_int(
 	if (flags & XFS_BTREE_LONG_PTRS) {
 		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
 		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
-		if (flags & XFS_BTREE_CRC_BLOCKS) {
+		if (crc) {
 			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
 			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
 			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
@@ -1110,7 +1130,7 @@ xfs_btree_init_block_int(
 
 	buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
 	buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-	if (flags & XFS_BTREE_CRC_BLOCKS) {
+	if (crc) {
 		buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1115 buf->bb_u.s.bb_owner = cpu_to_be32(__owner); 1135 buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1116 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); 1136 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
@@ -1123,14 +1143,14 @@ void
1123xfs_btree_init_block( 1143xfs_btree_init_block(
1124 struct xfs_mount *mp, 1144 struct xfs_mount *mp,
1125 struct xfs_buf *bp, 1145 struct xfs_buf *bp,
1126 __u32 magic, 1146 xfs_btnum_t btnum,
1127 __u16 level, 1147 __u16 level,
1128 __u16 numrecs, 1148 __u16 numrecs,
1129 __u64 owner, 1149 __u64 owner,
1130 unsigned int flags) 1150 unsigned int flags)
1131{ 1151{
1132 xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, 1152 xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1133 magic, level, numrecs, owner, flags); 1153 btnum, level, numrecs, owner, flags);
1134} 1154}
1135 1155
1136STATIC void 1156STATIC void
@@ -1140,7 +1160,7 @@ xfs_btree_init_block_cur(
1140 int level, 1160 int level,
1141 int numrecs) 1161 int numrecs)
1142{ 1162{
1143 __u64 owner; 1163 __u64 owner;
1144 1164
1145 /* 1165 /*
1146 * we can pull the owner from the cursor right now as the different 1166 * we can pull the owner from the cursor right now as the different
@@ -1154,7 +1174,7 @@ xfs_btree_init_block_cur(
1154 owner = cur->bc_private.a.agno; 1174 owner = cur->bc_private.a.agno;
1155 1175
1156 xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, 1176 xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1157 xfs_btree_magic(cur), level, numrecs, 1177 cur->bc_btnum, level, numrecs,
1158 owner, cur->bc_flags); 1178 owner, cur->bc_flags);
1159} 1179}
1160 1180
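The new xfs_btree_magic(crc, btnum) helper replaces the cursor-based macro so callers without a cursor (such as xfs_btree_init_block_int above) can still map a btree type to its on-disk magic, with an assert catching requests for a crc-only magic on a non-crc filesystem. A minimal standalone sketch of the same table-lookup-plus-assert idea, using hypothetical two-entry tables rather than the real XFS magic values:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical btree types and magics, standing in for the XFS tables. */
enum btnum { BTNUM_BNO, BTNUM_BMAP, BTNUM_MAX };

static const uint32_t magics[2][BTNUM_MAX] = {
	{ 0x41425442, 0x424d4150 },	/* !crc variants */
	{ 0x41423342, 0x424d4133 },	/* crc variants  */
};

static uint32_t btree_magic(int crc, enum btnum btnum)
{
	uint32_t magic = magics[crc][btnum];

	/* A zero entry would mean a crc-only magic was asked for without crc. */
	assert(magic != 0);
	return magic;
}

int main(void)
{
	printf("0x%x\n", btree_magic(1, BTNUM_BMAP));
	return 0;
}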
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index b69b947c4c1b..4bb62580a7fd 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -76,6 +76,8 @@ union xfs_btree_rec {
76#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) 76#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
77#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) 77#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi)
78 78
79__uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
80
79/* 81/*
80 * For logging record fields. 82 * For logging record fields.
81 */ 83 */
@@ -378,7 +380,7 @@ void
378xfs_btree_init_block( 380xfs_btree_init_block(
379 struct xfs_mount *mp, 381 struct xfs_mount *mp,
380 struct xfs_buf *bp, 382 struct xfs_buf *bp,
381 __u32 magic, 383 xfs_btnum_t btnum,
382 __u16 level, 384 __u16 level,
383 __u16 numrecs, 385 __u16 numrecs,
384 __u64 owner, 386 __u64 owner,
@@ -389,7 +391,7 @@ xfs_btree_init_block_int(
389 struct xfs_mount *mp, 391 struct xfs_mount *mp,
390 struct xfs_btree_block *buf, 392 struct xfs_btree_block *buf,
391 xfs_daddr_t blkno, 393 xfs_daddr_t blkno,
392 __u32 magic, 394 xfs_btnum_t btnum,
393 __u16 level, 395 __u16 level,
394 __u16 numrecs, 396 __u16 numrecs,
395 __u64 owner, 397 __u64 owner,
@@ -456,7 +458,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
456#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) 458#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
457 459
458#define XFS_FSB_SANITY_CHECK(mp,fsb) \ 460#define XFS_FSB_SANITY_CHECK(mp,fsb) \
459 (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ 461 (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
460 XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) 462 XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
461 463
462/* 464/*
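With the added "fsb &&" term, a zeroed block pointer read from a corrupt btree now fails XFS_FSB_SANITY_CHECK, and xfs_btree_read_bufl returns -EFSCORRUPTED instead of tripping an ASSERT. A hedged sketch of the same check as a standalone function, with made-up AG geometry in place of the superblock fields:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical geometry standing in for sb_agcount/sb_agblocks. */
#define AGCOUNT		4u
#define AGBLOCKS	1000000u	/* < (1 << AGBLKLOG) */
#define AGBLKLOG	20

static bool fsb_sanity_check(uint64_t fsb)
{
	uint32_t agno = (uint32_t)(fsb >> AGBLKLOG);	  /* FSB_TO_AGNO  */
	uint32_t agbno = fsb & ((1u << AGBLKLOG) - 1);	  /* FSB_TO_AGBNO */

	return fsb != 0 && agno < AGCOUNT && agbno < AGBLOCKS;
}

int main(void)
{
	printf("%d %d\n", fsb_sanity_check(0), fsb_sanity_check(42));
	return 0;
}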
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index f2dc1a950c85..1bdf2888295b 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2633,7 +2633,7 @@ out_free:
2633/* 2633/*
2634 * Readahead the dir/attr block. 2634 * Readahead the dir/attr block.
2635 */ 2635 */
2636xfs_daddr_t 2636int
2637xfs_da_reada_buf( 2637xfs_da_reada_buf(
2638 struct xfs_inode *dp, 2638 struct xfs_inode *dp,
2639 xfs_dablk_t bno, 2639 xfs_dablk_t bno,
@@ -2664,7 +2664,5 @@ out_free:
2664 if (mapp != &map) 2664 if (mapp != &map)
2665 kmem_free(mapp); 2665 kmem_free(mapp);
2666 2666
2667 if (error) 2667 return error;
2668 return -1;
2669 return mappedbno;
2670} 2668}
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 98c75cbe6ac2..4e29cb6a3627 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -201,7 +201,7 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
201 xfs_dablk_t bno, xfs_daddr_t mappedbno, 201 xfs_dablk_t bno, xfs_daddr_t mappedbno,
202 struct xfs_buf **bpp, int whichfork, 202 struct xfs_buf **bpp, int whichfork,
203 const struct xfs_buf_ops *ops); 203 const struct xfs_buf_ops *ops);
204xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, 204int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
205 xfs_daddr_t mapped_bno, int whichfork, 205 xfs_daddr_t mapped_bno, int whichfork,
206 const struct xfs_buf_ops *ops); 206 const struct xfs_buf_ops *ops);
207int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, 207int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 75a557432d0f..bbd1238852b3 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
155 .verify_write = xfs_dir3_free_write_verify, 155 .verify_write = xfs_dir3_free_write_verify,
156}; 156};
157 157
158/* Everything ok in the free block header? */
159static bool
160xfs_dir3_free_header_check(
161 struct xfs_inode *dp,
162 xfs_dablk_t fbno,
163 struct xfs_buf *bp)
164{
165 struct xfs_mount *mp = dp->i_mount;
166 unsigned int firstdb;
167 int maxbests;
168
169 maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo);
170 firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
171 xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
172 maxbests;
173 if (xfs_sb_version_hascrc(&mp->m_sb)) {
174 struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
175
176 if (be32_to_cpu(hdr3->firstdb) != firstdb)
177 return false;
178 if (be32_to_cpu(hdr3->nvalid) > maxbests)
179 return false;
180 if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
181 return false;
182 } else {
183 struct xfs_dir2_free_hdr *hdr = bp->b_addr;
184
185 if (be32_to_cpu(hdr->firstdb) != firstdb)
186 return false;
187 if (be32_to_cpu(hdr->nvalid) > maxbests)
188 return false;
189 if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused))
190 return false;
191 }
192 return true;
193}
158 194
159static int 195static int
160__xfs_dir3_free_read( 196__xfs_dir3_free_read(
@@ -168,11 +204,22 @@ __xfs_dir3_free_read(
168 204
169 err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, 205 err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
170 XFS_DATA_FORK, &xfs_dir3_free_buf_ops); 206 XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
207 if (err || !*bpp)
208 return err;
209
210 /* Check things that we can't do in the verifier. */
211 if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) {
212 xfs_buf_ioerror(*bpp, -EFSCORRUPTED);
213 xfs_verifier_error(*bpp);
214 xfs_trans_brelse(tp, *bpp);
215 return -EFSCORRUPTED;
216 }
171 217
172 /* try read returns without an error or *bpp if it lands in a hole */ 218 /* try read returns without an error or *bpp if it lands in a hole */
173 if (!err && tp && *bpp) 219 if (tp)
174 xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); 220 xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
175 return err; 221
222 return 0;
176} 223}
177 224
178int 225int
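The new header check validates fields the buffer verifier cannot see on its own: firstdb must match the free block's position within the directory free space, and nvalid/nused must be self-consistent. A simplified, hypothetical sketch of the same invariants (one fixed-size header, no v4/v5 split):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical free-block header, roughly mirroring xfs_dir2_free_hdr. */
struct free_hdr {
	uint32_t firstdb;	/* first data block covered by this free block */
	uint32_t nvalid;	/* number of best-free slots in use */
	uint32_t nused;		/* slots that refer to real data blocks */
};

/* free_index is the block's position in free space, maxbests the slots per block */
static bool free_header_check(const struct free_hdr *hdr,
			      unsigned int free_index, unsigned int maxbests)
{
	if (hdr->firstdb != free_index * maxbests)
		return false;
	if (hdr->nvalid > maxbests)
		return false;
	if (hdr->nvalid < hdr->nused)
		return false;
	return true;
}

int main(void)
{
	struct free_hdr hdr = { .firstdb = 60, .nvalid = 10, .nused = 4 };

	printf("%d\n", free_header_check(&hdr, 1, 60));	/* 1: consistent */
	return 0;
}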
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index f272abff11e1..d41ade5d293e 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment(
51 struct xfs_mount *mp) 51 struct xfs_mount *mp)
52{ 52{
53 if (xfs_sb_version_hasalign(&mp->m_sb) && 53 if (xfs_sb_version_hasalign(&mp->m_sb) &&
54 mp->m_sb.sb_inoalignmt >= 54 mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
55 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
56 return mp->m_sb.sb_inoalignmt; 55 return mp->m_sb.sb_inoalignmt;
57 return 1; 56 return 1;
58} 57}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 222e103356c6..25c1e078aef6 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -26,6 +26,7 @@
26#include "xfs_inode.h" 26#include "xfs_inode.h"
27#include "xfs_trans.h" 27#include "xfs_trans.h"
28#include "xfs_inode_item.h" 28#include "xfs_inode_item.h"
29#include "xfs_btree.h"
29#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
30#include "xfs_bmap.h" 31#include "xfs_bmap.h"
31#include "xfs_error.h" 32#include "xfs_error.h"
@@ -429,11 +430,13 @@ xfs_iformat_btree(
429 /* REFERENCED */ 430 /* REFERENCED */
430 int nrecs; 431 int nrecs;
431 int size; 432 int size;
433 int level;
432 434
433 ifp = XFS_IFORK_PTR(ip, whichfork); 435 ifp = XFS_IFORK_PTR(ip, whichfork);
434 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); 436 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
435 size = XFS_BMAP_BROOT_SPACE(mp, dfp); 437 size = XFS_BMAP_BROOT_SPACE(mp, dfp);
436 nrecs = be16_to_cpu(dfp->bb_numrecs); 438 nrecs = be16_to_cpu(dfp->bb_numrecs);
439 level = be16_to_cpu(dfp->bb_level);
437 440
438 /* 441 /*
439 * blow out if -- fork has less extents than can fit in 442 * blow out if -- fork has less extents than can fit in
@@ -446,7 +449,8 @@ xfs_iformat_btree(
446 XFS_IFORK_MAXEXT(ip, whichfork) || 449 XFS_IFORK_MAXEXT(ip, whichfork) ||
447 XFS_BMDR_SPACE_CALC(nrecs) > 450 XFS_BMDR_SPACE_CALC(nrecs) >
448 XFS_DFORK_SIZE(dip, mp, whichfork) || 451 XFS_DFORK_SIZE(dip, mp, whichfork) ||
449 XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 452 XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) ||
453 level == 0 || level > XFS_BTREE_MAXLEVELS) {
450 xfs_warn(mp, "corrupt inode %Lu (btree).", 454 xfs_warn(mp, "corrupt inode %Lu (btree).",
451 (unsigned long long) ip->i_ino); 455 (unsigned long long) ip->i_ino);
452 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 456 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -497,15 +501,14 @@ xfs_iread_extents(
497 * We know that the size is valid (it's checked in iformat_btree) 501 * We know that the size is valid (it's checked in iformat_btree)
498 */ 502 */
499 ifp->if_bytes = ifp->if_real_bytes = 0; 503 ifp->if_bytes = ifp->if_real_bytes = 0;
500 ifp->if_flags |= XFS_IFEXTENTS;
501 xfs_iext_add(ifp, 0, nextents); 504 xfs_iext_add(ifp, 0, nextents);
502 error = xfs_bmap_read_extents(tp, ip, whichfork); 505 error = xfs_bmap_read_extents(tp, ip, whichfork);
503 if (error) { 506 if (error) {
504 xfs_iext_destroy(ifp); 507 xfs_iext_destroy(ifp);
505 ifp->if_flags &= ~XFS_IFEXTENTS;
506 return error; 508 return error;
507 } 509 }
508 xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); 510 xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
511 ifp->if_flags |= XFS_IFEXTENTS;
509 return 0; 512 return 0;
510} 513}
511/* 514/*
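Two hardening changes here: the root level read from the on-disk fork is now bounds-checked (a level of 0 or above XFS_BTREE_MAXLEVELS marks the inode corrupt), and XFS_IFEXTENTS is only set after the extent list has actually been read, so a failed read no longer leaves the flag set on an empty fork. A tiny sketch of the added level check, with a hypothetical MAXLEVELS constant:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BTREE_MAXLEVELS	9	/* stand-in for XFS_BTREE_MAXLEVELS */

/* Reject obviously bogus root levels before walking the btree. */
static bool root_level_ok(uint16_t level)
{
	return level != 0 && level <= BTREE_MAXLEVELS;
}

int main(void)
{
	printf("%d %d %d\n", root_level_ok(0), root_level_ok(3),
	       root_level_ok(200));
	return 0;
}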
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index d9f65e2d5cc8..29a01ec89dd0 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -42,7 +42,6 @@ typedef struct xlog_recover_item {
42 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ 42 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
43} xlog_recover_item_t; 43} xlog_recover_item_t;
44 44
45struct xlog_tid;
46typedef struct xlog_recover { 45typedef struct xlog_recover {
47 struct hlist_node r_list; 46 struct hlist_node r_list;
48 xlog_tid_t r_log_tid; /* log's transaction id */ 47 xlog_tid_t r_log_tid; /* log's transaction id */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 631e7c0e0a29..1ff9df7a3ce8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -481,6 +481,12 @@ xfs_submit_ioend(
481 struct xfs_ioend *ioend, 481 struct xfs_ioend *ioend,
482 int status) 482 int status)
483{ 483{
484 /* Convert CoW extents to regular */
485 if (!status && ioend->io_type == XFS_IO_COW) {
486 status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
487 ioend->io_offset, ioend->io_size);
488 }
489
484 /* Reserve log space if we might write beyond the on-disk inode size. */ 490 /* Reserve log space if we might write beyond the on-disk inode size. */
485 if (!status && 491 if (!status &&
486 ioend->io_type != XFS_IO_UNWRITTEN && 492 ioend->io_type != XFS_IO_UNWRITTEN &&
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index c1417919ab0a..8b75dcea5966 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -88,7 +88,6 @@ int
88xfs_bmap_rtalloc( 88xfs_bmap_rtalloc(
89 struct xfs_bmalloca *ap) /* bmap alloc argument struct */ 89 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
90{ 90{
91 xfs_alloctype_t atype = 0; /* type for allocation routines */
92 int error; /* error return value */ 91 int error; /* error return value */
93 xfs_mount_t *mp; /* mount point structure */ 92 xfs_mount_t *mp; /* mount point structure */
94 xfs_extlen_t prod = 0; /* product factor for allocators */ 93 xfs_extlen_t prod = 0; /* product factor for allocators */
@@ -155,18 +154,14 @@ xfs_bmap_rtalloc(
155 /* 154 /*
156 * Realtime allocation, done through xfs_rtallocate_extent. 155 * Realtime allocation, done through xfs_rtallocate_extent.
157 */ 156 */
158 atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
159 do_div(ap->blkno, mp->m_sb.sb_rextsize); 157 do_div(ap->blkno, mp->m_sb.sb_rextsize);
160 rtb = ap->blkno; 158 rtb = ap->blkno;
161 ap->length = ralen; 159 ap->length = ralen;
162 if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, 160 error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
163 &ralen, atype, ap->wasdel, prod, &rtb))) 161 &ralen, ap->wasdel, prod, &rtb);
164 return error; 162 if (error)
165 if (rtb == NULLFSBLOCK && prod > 1 &&
166 (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
167 ap->length, &ralen, atype,
168 ap->wasdel, 1, &rtb)))
169 return error; 163 return error;
164
170 ap->blkno = rtb; 165 ap->blkno = rtb;
171 if (ap->blkno != NULLFSBLOCK) { 166 if (ap->blkno != NULLFSBLOCK) {
172 ap->blkno *= mp->m_sb.sb_rextsize; 167 ap->blkno *= mp->m_sb.sb_rextsize;
@@ -787,11 +782,9 @@ xfs_getbmap(
787 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 782 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
788 783
789 for (i = 0; i < cur_ext; i++) { 784 for (i = 0; i < cur_ext; i++) {
790 int full = 0; /* user array is full */
791
792 /* format results & advance arg */ 785 /* format results & advance arg */
793 error = formatter(&arg, &out[i], &full); 786 error = formatter(&arg, &out[i]);
794 if (error || full) 787 if (error)
795 break; 788 break;
796 } 789 }
797 790
@@ -917,17 +910,18 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
917 */ 910 */
918int 911int
919xfs_free_eofblocks( 912xfs_free_eofblocks(
920 xfs_mount_t *mp, 913 struct xfs_inode *ip)
921 xfs_inode_t *ip,
922 bool need_iolock)
923{ 914{
924 xfs_trans_t *tp; 915 struct xfs_trans *tp;
925 int error; 916 int error;
926 xfs_fileoff_t end_fsb; 917 xfs_fileoff_t end_fsb;
927 xfs_fileoff_t last_fsb; 918 xfs_fileoff_t last_fsb;
928 xfs_filblks_t map_len; 919 xfs_filblks_t map_len;
929 int nimaps; 920 int nimaps;
930 xfs_bmbt_irec_t imap; 921 struct xfs_bmbt_irec imap;
922 struct xfs_mount *mp = ip->i_mount;
923
924 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
931 925
932 /* 926 /*
933 * Figure out if there are any blocks beyond the end 927 * Figure out if there are any blocks beyond the end
@@ -944,6 +938,10 @@ xfs_free_eofblocks(
944 error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); 938 error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
945 xfs_iunlock(ip, XFS_ILOCK_SHARED); 939 xfs_iunlock(ip, XFS_ILOCK_SHARED);
946 940
941 /*
942 * If there are blocks after the end of file, truncate the file to its
943 * current size to free them up.
944 */
947 if (!error && (nimaps != 0) && 945 if (!error && (nimaps != 0) &&
948 (imap.br_startblock != HOLESTARTBLOCK || 946 (imap.br_startblock != HOLESTARTBLOCK ||
949 ip->i_delayed_blks)) { 947 ip->i_delayed_blks)) {
@@ -954,22 +952,13 @@ xfs_free_eofblocks(
954 if (error) 952 if (error)
955 return error; 953 return error;
956 954
957 /* 955 /* wait on dio to ensure i_size has settled */
958 * There are blocks after the end of file. 956 inode_dio_wait(VFS_I(ip));
959 * Free them up now by truncating the file to
960 * its current size.
961 */
962 if (need_iolock) {
963 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
964 return -EAGAIN;
965 }
966 957
967 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, 958 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
968 &tp); 959 &tp);
969 if (error) { 960 if (error) {
970 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 961 ASSERT(XFS_FORCED_SHUTDOWN(mp));
971 if (need_iolock)
972 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
973 return error; 962 return error;
974 } 963 }
975 964
@@ -997,8 +986,6 @@ xfs_free_eofblocks(
997 } 986 }
998 987
999 xfs_iunlock(ip, XFS_ILOCK_EXCL); 988 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1000 if (need_iolock)
1001 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1002 } 989 }
1003 return error; 990 return error;
1004} 991}
@@ -1393,10 +1380,16 @@ xfs_shift_file_space(
1393 xfs_fileoff_t stop_fsb; 1380 xfs_fileoff_t stop_fsb;
1394 xfs_fileoff_t next_fsb; 1381 xfs_fileoff_t next_fsb;
1395 xfs_fileoff_t shift_fsb; 1382 xfs_fileoff_t shift_fsb;
1383 uint resblks;
1396 1384
1397 ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); 1385 ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
1398 1386
1399 if (direction == SHIFT_LEFT) { 1387 if (direction == SHIFT_LEFT) {
1388 /*
1389 * Reserve blocks to cover potential extent merges after left
1390 * shift operations.
1391 */
1392 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1400 next_fsb = XFS_B_TO_FSB(mp, offset + len); 1393 next_fsb = XFS_B_TO_FSB(mp, offset + len);
1401 stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); 1394 stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
1402 } else { 1395 } else {
@@ -1404,6 +1397,7 @@ xfs_shift_file_space(
1404 * If right shift, delegate the work of initialization of 1397 * If right shift, delegate the work of initialization of
1405 * next_fsb to xfs_bmap_shift_extent as it has ilock held. 1398 * next_fsb to xfs_bmap_shift_extent as it has ilock held.
1406 */ 1399 */
1400 resblks = 0;
1407 next_fsb = NULLFSBLOCK; 1401 next_fsb = NULLFSBLOCK;
1408 stop_fsb = XFS_B_TO_FSB(mp, offset); 1402 stop_fsb = XFS_B_TO_FSB(mp, offset);
1409 } 1403 }
@@ -1415,7 +1409,7 @@ xfs_shift_file_space(
1415 * into the accessible region of the file. 1409 * into the accessible region of the file.
1416 */ 1410 */
1417 if (xfs_can_free_eofblocks(ip, true)) { 1411 if (xfs_can_free_eofblocks(ip, true)) {
1418 error = xfs_free_eofblocks(mp, ip, false); 1412 error = xfs_free_eofblocks(ip);
1419 if (error) 1413 if (error)
1420 return error; 1414 return error;
1421 } 1415 }
@@ -1445,21 +1439,14 @@ xfs_shift_file_space(
1445 } 1439 }
1446 1440
1447 while (!error && !done) { 1441 while (!error && !done) {
1448 /* 1442 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
1449 * We would need to reserve permanent block for transaction. 1443 &tp);
1450 * This will come into picture when after shifting extent into
1451 * hole we found that adjacent extents can be merged which
1452 * may lead to freeing of a block during record update.
1453 */
1454 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
1455 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
1456 if (error) 1444 if (error)
1457 break; 1445 break;
1458 1446
1459 xfs_ilock(ip, XFS_ILOCK_EXCL); 1447 xfs_ilock(ip, XFS_ILOCK_EXCL);
1460 error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, 1448 error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
1461 ip->i_gdquot, ip->i_pdquot, 1449 ip->i_gdquot, ip->i_pdquot, resblks, 0,
1462 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
1463 XFS_QMOPT_RES_REGBLKS); 1450 XFS_QMOPT_RES_REGBLKS);
1464 if (error) 1451 if (error)
1465 goto out_trans_cancel; 1452 goto out_trans_cancel;
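xfs_free_eofblocks() loses the need_iolock argument and instead asserts that the caller already holds XFS_IOLOCK_EXCL; the decision to block, trylock, or skip moves to the call sites, as the xfs_inode.c and xfs_icache.c hunks later in this diff show. A hedged userspace analogue of the two resulting calling patterns, using a pthread mutex in place of the inode iolock:

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for the inode iolock and for xfs_free_eofblocks() itself. */
static pthread_mutex_t iolock = PTHREAD_MUTEX_INITIALIZER;

static int free_eofblocks(void)
{
	/* caller must hold iolock, as the new ASSERT demands */
	return 0;
}

/* Blocking caller, e.g. the inactive path: always safe to wait for the lock. */
static int inactive_path(void)
{
	int error;

	pthread_mutex_lock(&iolock);
	error = free_eofblocks();
	pthread_mutex_unlock(&iolock);
	return error;
}

/* Opportunistic caller, e.g. the release path: skip the work if contended. */
static int release_path(void)
{
	int error = 0;

	if (pthread_mutex_trylock(&iolock) == 0) {
		error = free_eofblocks();
		pthread_mutex_unlock(&iolock);
	}
	return error;
}

int main(void)
{
	printf("%d %d\n", inactive_path(), release_path());
	return 0;
}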
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 68a621a8e0c0..135d8267e284 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -35,7 +35,7 @@ int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
35 xfs_fileoff_t start_fsb, xfs_fileoff_t length); 35 xfs_fileoff_t start_fsb, xfs_fileoff_t length);
36 36
37/* bmap to userspace formatter - copy to user & advance pointer */ 37/* bmap to userspace formatter - copy to user & advance pointer */
38typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); 38typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *);
39int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, 39int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
40 xfs_bmap_format_t formatter, void *arg); 40 xfs_bmap_format_t formatter, void *arg);
41 41
@@ -63,8 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
63 63
64/* EOF block manipulation functions */ 64/* EOF block manipulation functions */
65bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); 65bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
66int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, 66int xfs_free_eofblocks(struct xfs_inode *ip);
67 bool need_iolock);
68 67
69int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, 68int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
70 struct xfs_swapext *sx); 69 struct xfs_swapext *sx);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2975cb2319f4..0306168af332 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1162,6 +1162,7 @@ xfs_buf_iodone_callbacks(
1162 */ 1162 */
1163 bp->b_last_error = 0; 1163 bp->b_last_error = 0;
1164 bp->b_retries = 0; 1164 bp->b_retries = 0;
1165 bp->b_first_retry_time = 0;
1165 1166
1166 xfs_buf_do_callbacks(bp); 1167 xfs_buf_do_callbacks(bp);
1167 bp->b_fspriv = NULL; 1168 bp->b_fspriv = NULL;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 4ff499aa7338..d796ffac7296 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -208,32 +208,3 @@ xfs_ioc_trim(
208 return -EFAULT; 208 return -EFAULT;
209 return 0; 209 return 0;
210} 210}
211
212int
213xfs_discard_extents(
214 struct xfs_mount *mp,
215 struct list_head *list)
216{
217 struct xfs_extent_busy *busyp;
218 int error = 0;
219
220 list_for_each_entry(busyp, list, list) {
221 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
222 busyp->length);
223
224 error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
225 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
226 XFS_FSB_TO_BB(mp, busyp->length),
227 GFP_NOFS, 0);
228 if (error && error != -EOPNOTSUPP) {
229 xfs_info(mp,
230 "discard failed for extent [0x%llx,%u], error %d",
231 (unsigned long long)busyp->bno,
232 busyp->length,
233 error);
234 return error;
235 }
236 }
237
238 return 0;
239}
diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h
index 344879aea646..0f070f9e44e1 100644
--- a/fs/xfs/xfs_discard.h
+++ b/fs/xfs/xfs_discard.h
@@ -5,6 +5,5 @@ struct fstrim_range;
5struct list_head; 5struct list_head;
6 6
7extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); 7extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
8extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
9 8
10#endif /* XFS_DISCARD_H */ 9#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 162dc186cf04..77760dbf0242 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -45,18 +45,7 @@ xfs_extent_busy_insert(
45 struct rb_node **rbp; 45 struct rb_node **rbp;
46 struct rb_node *parent = NULL; 46 struct rb_node *parent = NULL;
47 47
48 new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); 48 new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
49 if (!new) {
50 /*
51 * No Memory! Since it is now not possible to track the free
52 * block, make this a synchronous transaction to insure that
53 * the block is not reused before this transaction commits.
54 */
55 trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
56 xfs_trans_set_sync(tp);
57 return;
58 }
59
60 new->agno = agno; 49 new->agno = agno;
61 new->bno = bno; 50 new->bno = bno;
62 new->length = len; 51 new->length = len;
@@ -345,25 +334,31 @@ restart:
345 * subset of the extent that is not busy. If *rlen is smaller than 334 * subset of the extent that is not busy. If *rlen is smaller than
346 * args->minlen no suitable extent could be found, and the higher level 335 * args->minlen no suitable extent could be found, and the higher level
347 * code needs to force out the log and retry the allocation. 336 * code needs to force out the log and retry the allocation.
337 *
338 * Return the current busy generation for the AG if the extent is busy. This
339 * value can be used to wait for at least one of the currently busy extents
340 * to be cleared. Note that the busy list is not guaranteed to be empty after
341 * the gen is woken. The state of a specific extent must always be confirmed
342 * with another call to xfs_extent_busy_trim() before it can be used.
348 */ 343 */
349void 344bool
350xfs_extent_busy_trim( 345xfs_extent_busy_trim(
351 struct xfs_alloc_arg *args, 346 struct xfs_alloc_arg *args,
352 xfs_agblock_t bno, 347 xfs_agblock_t *bno,
353 xfs_extlen_t len, 348 xfs_extlen_t *len,
354 xfs_agblock_t *rbno, 349 unsigned *busy_gen)
355 xfs_extlen_t *rlen)
356{ 350{
357 xfs_agblock_t fbno; 351 xfs_agblock_t fbno;
358 xfs_extlen_t flen; 352 xfs_extlen_t flen;
359 struct rb_node *rbp; 353 struct rb_node *rbp;
354 bool ret = false;
360 355
361 ASSERT(len > 0); 356 ASSERT(*len > 0);
362 357
363 spin_lock(&args->pag->pagb_lock); 358 spin_lock(&args->pag->pagb_lock);
364restart: 359restart:
365 fbno = bno; 360 fbno = *bno;
366 flen = len; 361 flen = *len;
367 rbp = args->pag->pagb_tree.rb_node; 362 rbp = args->pag->pagb_tree.rb_node;
368 while (rbp && flen >= args->minlen) { 363 while (rbp && flen >= args->minlen) {
369 struct xfs_extent_busy *busyp = 364 struct xfs_extent_busy *busyp =
@@ -515,24 +510,25 @@ restart:
515 510
516 flen = fend - fbno; 511 flen = fend - fbno;
517 } 512 }
518 spin_unlock(&args->pag->pagb_lock); 513out:
519 514
520 if (fbno != bno || flen != len) { 515 if (fbno != *bno || flen != *len) {
521 trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, 516 trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
522 fbno, flen); 517 fbno, flen);
518 *bno = fbno;
519 *len = flen;
520 *busy_gen = args->pag->pagb_gen;
521 ret = true;
523 } 522 }
524 *rbno = fbno; 523 spin_unlock(&args->pag->pagb_lock);
525 *rlen = flen; 524 return ret;
526 return;
527fail: 525fail:
528 /* 526 /*
529 * Return a zero extent length as failure indications. All callers 527 * Return a zero extent length as failure indications. All callers
530 * re-check if the trimmed extent satisfies the minlen requirement. 528 * re-check if the trimmed extent satisfies the minlen requirement.
531 */ 529 */
532 spin_unlock(&args->pag->pagb_lock); 530 flen = 0;
533 trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); 531 goto out;
534 *rbno = fbno;
535 *rlen = 0;
536} 532}
537 533
538STATIC void 534STATIC void
@@ -551,6 +547,21 @@ xfs_extent_busy_clear_one(
551 kmem_free(busyp); 547 kmem_free(busyp);
552} 548}
553 549
550static void
551xfs_extent_busy_put_pag(
552 struct xfs_perag *pag,
553 bool wakeup)
554 __releases(pag->pagb_lock)
555{
556 if (wakeup) {
557 pag->pagb_gen++;
558 wake_up_all(&pag->pagb_wait);
559 }
560
561 spin_unlock(&pag->pagb_lock);
562 xfs_perag_put(pag);
563}
564
554/* 565/*
555 * Remove all extents on the passed in list from the busy extents tree. 566 * Remove all extents on the passed in list from the busy extents tree.
556 * If do_discard is set skip extents that need to be discarded, and mark 567 * If do_discard is set skip extents that need to be discarded, and mark
@@ -565,27 +576,76 @@ xfs_extent_busy_clear(
565 struct xfs_extent_busy *busyp, *n; 576 struct xfs_extent_busy *busyp, *n;
566 struct xfs_perag *pag = NULL; 577 struct xfs_perag *pag = NULL;
567 xfs_agnumber_t agno = NULLAGNUMBER; 578 xfs_agnumber_t agno = NULLAGNUMBER;
579 bool wakeup = false;
568 580
569 list_for_each_entry_safe(busyp, n, list, list) { 581 list_for_each_entry_safe(busyp, n, list, list) {
570 if (busyp->agno != agno) { 582 if (busyp->agno != agno) {
571 if (pag) { 583 if (pag)
572 spin_unlock(&pag->pagb_lock); 584 xfs_extent_busy_put_pag(pag, wakeup);
573 xfs_perag_put(pag);
574 }
575 pag = xfs_perag_get(mp, busyp->agno);
576 spin_lock(&pag->pagb_lock);
577 agno = busyp->agno; 585 agno = busyp->agno;
586 pag = xfs_perag_get(mp, agno);
587 spin_lock(&pag->pagb_lock);
588 wakeup = false;
578 } 589 }
579 590
580 if (do_discard && busyp->length && 591 if (do_discard && busyp->length &&
581 !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) 592 !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
582 busyp->flags = XFS_EXTENT_BUSY_DISCARDED; 593 busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
583 else 594 } else {
584 xfs_extent_busy_clear_one(mp, pag, busyp); 595 xfs_extent_busy_clear_one(mp, pag, busyp);
596 wakeup = true;
597 }
585 } 598 }
586 599
587 if (pag) { 600 if (pag)
588 spin_unlock(&pag->pagb_lock); 601 xfs_extent_busy_put_pag(pag, wakeup);
602}
603
604/*
605 * Flush out all busy extents for this AG.
606 */
607void
608xfs_extent_busy_flush(
609 struct xfs_mount *mp,
610 struct xfs_perag *pag,
611 unsigned busy_gen)
612{
613 DEFINE_WAIT (wait);
614 int log_flushed = 0, error;
615
616 trace_xfs_log_force(mp, 0, _THIS_IP_);
617 error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
618 if (error)
619 return;
620
621 do {
622 prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
623 if (busy_gen != READ_ONCE(pag->pagb_gen))
624 break;
625 schedule();
626 } while (1);
627
628 finish_wait(&pag->pagb_wait, &wait);
629}
630
631void
632xfs_extent_busy_wait_all(
633 struct xfs_mount *mp)
634{
635 DEFINE_WAIT (wait);
636 xfs_agnumber_t agno;
637
638 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
639 struct xfs_perag *pag = xfs_perag_get(mp, agno);
640
641 do {
642 prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
643 if (RB_EMPTY_ROOT(&pag->pagb_tree))
644 break;
645 schedule();
646 } while (1);
647 finish_wait(&pag->pagb_wait, &wait);
648
589 xfs_perag_put(pag); 649 xfs_perag_put(pag);
590 } 650 }
591} 651}
@@ -596,9 +656,17 @@ xfs_extent_busy_clear(
596int 656int
597xfs_extent_busy_ag_cmp( 657xfs_extent_busy_ag_cmp(
598 void *priv, 658 void *priv,
599 struct list_head *a, 659 struct list_head *l1,
600 struct list_head *b) 660 struct list_head *l2)
601{ 661{
602 return container_of(a, struct xfs_extent_busy, list)->agno - 662 struct xfs_extent_busy *b1 =
603 container_of(b, struct xfs_extent_busy, list)->agno; 663 container_of(l1, struct xfs_extent_busy, list);
664 struct xfs_extent_busy *b2 =
665 container_of(l2, struct xfs_extent_busy, list);
666 s32 diff;
667
668 diff = b1->agno - b2->agno;
669 if (!diff)
670 diff = b1->bno - b2->bno;
671 return diff;
604} 672}
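Discarding moves out of this path; instead each per-AG busy tree gains a generation counter (pagb_gen) and a waitqueue (pagb_wait). Clearing busy extents bumps the generation and wakes waiters, and xfs_extent_busy_flush() forces the log and then sleeps until the generation it sampled has moved on. A userspace analogue of that generation-counter wait, using a pthread condition variable in place of the kernel waitqueue:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t gen_moved = PTHREAD_COND_INITIALIZER;
static unsigned busy_gen;	/* analogue of pag->pagb_gen */

/* Called when at least one busy extent has been cleared. */
static void busy_clear(void)
{
	pthread_mutex_lock(&lock);
	busy_gen++;
	pthread_cond_broadcast(&gen_moved);	/* wake_up_all() */
	pthread_mutex_unlock(&lock);
}

/* Wait for the generation sampled earlier to change. */
static void busy_flush(unsigned sampled_gen)
{
	pthread_mutex_lock(&lock);
	while (busy_gen == sampled_gen)
		pthread_cond_wait(&gen_moved, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	unsigned gen = busy_gen;

	busy_clear();		/* normally done by another thread */
	busy_flush(gen);	/* returns immediately: gen already moved */
	printf("gen=%u\n", busy_gen);
	return 0;
}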
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index bfff284d2dcc..60195ea1b84a 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -58,9 +58,16 @@ void
58xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, 58xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
59 xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); 59 xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
60 60
61bool
62xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno,
63 xfs_extlen_t *len, unsigned *busy_gen);
64
65void
66xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag,
67 unsigned busy_gen);
68
61void 69void
62xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, 70xfs_extent_busy_wait_all(struct xfs_mount *mp);
63 xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
64 71
65int 72int
66xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); 73xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index bbb9eb6811b2..086440e79b86 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -527,6 +527,15 @@ xfs_file_dio_aio_write(
527 if ((iocb->ki_pos & mp->m_blockmask) || 527 if ((iocb->ki_pos & mp->m_blockmask) ||
528 ((iocb->ki_pos + count) & mp->m_blockmask)) { 528 ((iocb->ki_pos + count) & mp->m_blockmask)) {
529 unaligned_io = 1; 529 unaligned_io = 1;
530
531 /*
532 * We can't properly handle unaligned direct I/O to reflink
533 * files yet, as we can't unshare a partial block.
534 */
535 if (xfs_is_reflink_inode(ip)) {
536 trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
537 return -EREMCHG;
538 }
530 iolock = XFS_IOLOCK_EXCL; 539 iolock = XFS_IOLOCK_EXCL;
531 } else { 540 } else {
532 iolock = XFS_IOLOCK_SHARED; 541 iolock = XFS_IOLOCK_SHARED;
@@ -552,14 +561,6 @@ xfs_file_dio_aio_write(
552 } 561 }
553 562
554 trace_xfs_file_direct_write(ip, count, iocb->ki_pos); 563 trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
555
556 /* If this is a block-aligned directio CoW, remap immediately. */
557 if (xfs_is_reflink_inode(ip) && !unaligned_io) {
558 ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
559 if (ret)
560 goto out;
561 }
562
563 ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); 564 ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
564out: 565out:
565 xfs_iunlock(ip, iolock); 566 xfs_iunlock(ip, iolock);
@@ -614,8 +615,10 @@ xfs_file_buffered_aio_write(
614 struct xfs_inode *ip = XFS_I(inode); 615 struct xfs_inode *ip = XFS_I(inode);
615 ssize_t ret; 616 ssize_t ret;
616 int enospc = 0; 617 int enospc = 0;
617 int iolock = XFS_IOLOCK_EXCL; 618 int iolock;
618 619
620write_retry:
621 iolock = XFS_IOLOCK_EXCL;
619 xfs_ilock(ip, iolock); 622 xfs_ilock(ip, iolock);
620 623
621 ret = xfs_file_aio_write_checks(iocb, from, &iolock); 624 ret = xfs_file_aio_write_checks(iocb, from, &iolock);
@@ -625,7 +628,6 @@ xfs_file_buffered_aio_write(
625 /* We can write back this queue in page reclaim */ 628 /* We can write back this queue in page reclaim */
626 current->backing_dev_info = inode_to_bdi(inode); 629 current->backing_dev_info = inode_to_bdi(inode);
627 630
628write_retry:
629 trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); 631 trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
630 ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); 632 ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
631 if (likely(ret >= 0)) 633 if (likely(ret >= 0))
@@ -641,18 +643,21 @@ write_retry:
641 * running at the same time. 643 * running at the same time.
642 */ 644 */
643 if (ret == -EDQUOT && !enospc) { 645 if (ret == -EDQUOT && !enospc) {
646 xfs_iunlock(ip, iolock);
644 enospc = xfs_inode_free_quota_eofblocks(ip); 647 enospc = xfs_inode_free_quota_eofblocks(ip);
645 if (enospc) 648 if (enospc)
646 goto write_retry; 649 goto write_retry;
647 enospc = xfs_inode_free_quota_cowblocks(ip); 650 enospc = xfs_inode_free_quota_cowblocks(ip);
648 if (enospc) 651 if (enospc)
649 goto write_retry; 652 goto write_retry;
653 iolock = 0;
650 } else if (ret == -ENOSPC && !enospc) { 654 } else if (ret == -ENOSPC && !enospc) {
651 struct xfs_eofblocks eofb = {0}; 655 struct xfs_eofblocks eofb = {0};
652 656
653 enospc = 1; 657 enospc = 1;
654 xfs_flush_inodes(ip->i_mount); 658 xfs_flush_inodes(ip->i_mount);
655 eofb.eof_scan_owner = ip->i_ino; /* for locking */ 659
660 xfs_iunlock(ip, iolock);
656 eofb.eof_flags = XFS_EOF_FLAGS_SYNC; 661 eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
657 xfs_icache_free_eofblocks(ip->i_mount, &eofb); 662 xfs_icache_free_eofblocks(ip->i_mount, &eofb);
658 goto write_retry; 663 goto write_retry;
@@ -660,7 +665,8 @@ write_retry:
660 665
661 current->backing_dev_info = NULL; 666 current->backing_dev_info = NULL;
662out: 667out:
663 xfs_iunlock(ip, iolock); 668 if (iolock)
669 xfs_iunlock(ip, iolock);
664 return ret; 670 return ret;
665} 671}
666 672
@@ -908,9 +914,9 @@ xfs_dir_open(
908 */ 914 */
909 mode = xfs_ilock_data_map_shared(ip); 915 mode = xfs_ilock_data_map_shared(ip);
910 if (ip->i_d.di_nextents > 0) 916 if (ip->i_d.di_nextents > 0)
911 xfs_dir3_data_readahead(ip, 0, -1); 917 error = xfs_dir3_data_readahead(ip, 0, -1);
912 xfs_iunlock(ip, mode); 918 xfs_iunlock(ip, mode);
913 return 0; 919 return error;
914} 920}
915 921
916STATIC int 922STATIC int
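Moving the write_retry label above xfs_ilock() means every retry re-acquires the iolock from scratch, so the EDQUOT/ENOSPC paths can drop it before running the eofblocks/cowblocks scans, which take inode locks themselves. A compact, hedged sketch of the resulting retry shape, with placeholder write and scan steps:

#include <errno.h>
#include <stdio.h>

/* Placeholders for the buffered write and the block-freeing scan. */
static int attempt;

static int do_write(void)
{
	return attempt++ ? 10 : -ENOSPC;	/* fail once, then succeed */
}

static void lock(void)   { }
static void unlock(void) { }
static void flush_and_free_eofblocks(void) { }

static int buffered_write(void)
{
	int tried_cleanup = 0;
	int ret;

write_retry:
	lock();
	ret = do_write();
	if (ret == -ENOSPC && !tried_cleanup) {
		tried_cleanup = 1;
		unlock();			/* drop before the scan */
		flush_and_free_eofblocks();
		goto write_retry;		/* re-acquire and retry once */
	}
	unlock();
	return ret;
}

int main(void)
{
	printf("%d\n", buffered_write());
	return 0;
}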
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 242e8091296d..6ccaae9eb0ee 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -352,12 +352,7 @@ xfs_growfs_data_private(
352 goto error0; 352 goto error0;
353 } 353 }
354 354
355 if (xfs_sb_version_hascrc(&mp->m_sb)) 355 xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0);
356 xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1,
357 agno, XFS_BTREE_CRC_BLOCKS);
358 else
359 xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1,
360 agno, 0);
361 356
362 arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); 357 arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
363 arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); 358 arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
@@ -381,12 +376,7 @@ xfs_growfs_data_private(
381 goto error0; 376 goto error0;
382 } 377 }
383 378
384 if (xfs_sb_version_hascrc(&mp->m_sb)) 379 xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0);
385 xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1,
386 agno, XFS_BTREE_CRC_BLOCKS);
387 else
388 xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1,
389 agno, 0);
390 380
391 arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); 381 arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
392 arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); 382 arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
@@ -413,8 +403,8 @@ xfs_growfs_data_private(
413 goto error0; 403 goto error0;
414 } 404 }
415 405
416 xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0, 406 xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0,
417 agno, XFS_BTREE_CRC_BLOCKS); 407 agno, 0);
418 block = XFS_BUF_TO_BLOCK(bp); 408 block = XFS_BUF_TO_BLOCK(bp);
419 409
420 410
@@ -488,12 +478,7 @@ xfs_growfs_data_private(
488 goto error0; 478 goto error0;
489 } 479 }
490 480
491 if (xfs_sb_version_hascrc(&mp->m_sb)) 481 xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0);
492 xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0,
493 agno, XFS_BTREE_CRC_BLOCKS);
494 else
495 xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0,
496 agno, 0);
497 482
498 error = xfs_bwrite(bp); 483 error = xfs_bwrite(bp);
499 xfs_buf_relse(bp); 484 xfs_buf_relse(bp);
@@ -513,13 +498,8 @@ xfs_growfs_data_private(
513 goto error0; 498 goto error0;
514 } 499 }
515 500
516 if (xfs_sb_version_hascrc(&mp->m_sb)) 501 xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO,
517 xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, 502 0, 0, agno, 0);
518 0, 0, agno,
519 XFS_BTREE_CRC_BLOCKS);
520 else
521 xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0,
522 0, agno, 0);
523 503
524 error = xfs_bwrite(bp); 504 error = xfs_bwrite(bp);
525 xfs_buf_relse(bp); 505 xfs_buf_relse(bp);
@@ -540,9 +520,8 @@ xfs_growfs_data_private(
540 goto error0; 520 goto error0;
541 } 521 }
542 522
543 xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC, 523 xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC,
544 0, 0, agno, 524 0, 0, agno, 0);
545 XFS_BTREE_CRC_BLOCKS);
546 525
547 error = xfs_bwrite(bp); 526 error = xfs_bwrite(bp);
548 xfs_buf_relse(bp); 527 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 70ca4f608321..7234b9748c36 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1322,13 +1322,10 @@ xfs_inode_free_eofblocks(
1322 int flags, 1322 int flags,
1323 void *args) 1323 void *args)
1324{ 1324{
1325 int ret; 1325 int ret = 0;
1326 struct xfs_eofblocks *eofb = args; 1326 struct xfs_eofblocks *eofb = args;
1327 bool need_iolock = true;
1328 int match; 1327 int match;
1329 1328
1330 ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
1331
1332 if (!xfs_can_free_eofblocks(ip, false)) { 1329 if (!xfs_can_free_eofblocks(ip, false)) {
1333 /* inode could be preallocated or append-only */ 1330 /* inode could be preallocated or append-only */
1334 trace_xfs_inode_free_eofblocks_invalid(ip); 1331 trace_xfs_inode_free_eofblocks_invalid(ip);
@@ -1356,21 +1353,19 @@ xfs_inode_free_eofblocks(
1356 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 1353 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1357 XFS_ISIZE(ip) < eofb->eof_min_file_size) 1354 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1358 return 0; 1355 return 0;
1359
1360 /*
1361 * A scan owner implies we already hold the iolock. Skip it in
1362 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
1363 * the possibility of EAGAIN being returned.
1364 */
1365 if (eofb->eof_scan_owner == ip->i_ino)
1366 need_iolock = false;
1367 } 1356 }
1368 1357
1369 ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); 1358 /*
1370 1359 * If the caller is waiting, return -EAGAIN to keep the background
1371 /* don't revisit the inode if we're not waiting */ 1360 * scanner moving and revisit the inode in a subsequent pass.
1372 if (ret == -EAGAIN && !(flags & SYNC_WAIT)) 1361 */
1373 ret = 0; 1362 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1363 if (flags & SYNC_WAIT)
1364 ret = -EAGAIN;
1365 return ret;
1366 }
1367 ret = xfs_free_eofblocks(ip);
1368 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1374 1369
1375 return ret; 1370 return ret;
1376} 1371}
@@ -1417,15 +1412,10 @@ __xfs_inode_free_quota_eofblocks(
1417 struct xfs_eofblocks eofb = {0}; 1412 struct xfs_eofblocks eofb = {0};
1418 struct xfs_dquot *dq; 1413 struct xfs_dquot *dq;
1419 1414
1420 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1421
1422 /* 1415 /*
1423 * Set the scan owner to avoid a potential livelock. Otherwise, the scan 1416 * Run a sync scan to increase effectiveness and use the union filter to
1424 * can repeatedly trylock on the inode we're currently processing. We
1425 * run a sync scan to increase effectiveness and use the union filter to
1426 * cover all applicable quotas in a single scan. 1417 * cover all applicable quotas in a single scan.
1427 */ 1418 */
1428 eofb.eof_scan_owner = ip->i_ino;
1429 eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; 1419 eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
1430 1420
1431 if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { 1421 if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
@@ -1577,12 +1567,9 @@ xfs_inode_free_cowblocks(
1577{ 1567{
1578 int ret; 1568 int ret;
1579 struct xfs_eofblocks *eofb = args; 1569 struct xfs_eofblocks *eofb = args;
1580 bool need_iolock = true;
1581 int match; 1570 int match;
1582 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); 1571 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
1583 1572
1584 ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
1585
1586 /* 1573 /*
1587 * Just clear the tag if we have an empty cow fork or none at all. It's 1574 * Just clear the tag if we have an empty cow fork or none at all. It's
1588 * possible the inode was fully unshared since it was originally tagged. 1575 * possible the inode was fully unshared since it was originally tagged.
@@ -1615,28 +1602,16 @@ xfs_inode_free_cowblocks(
1615 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 1602 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1616 XFS_ISIZE(ip) < eofb->eof_min_file_size) 1603 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1617 return 0; 1604 return 0;
1618
1619 /*
1620 * A scan owner implies we already hold the iolock. Skip it in
1621 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
1622 * the possibility of EAGAIN being returned.
1623 */
1624 if (eofb->eof_scan_owner == ip->i_ino)
1625 need_iolock = false;
1626 } 1605 }
1627 1606
1628 /* Free the CoW blocks */ 1607 /* Free the CoW blocks */
1629 if (need_iolock) { 1608 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1630 xfs_ilock(ip, XFS_IOLOCK_EXCL); 1609 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1631 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1632 }
1633 1610
1634 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); 1611 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
1635 1612
1636 if (need_iolock) { 1613 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
1637 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); 1614 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1638 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1639 }
1640 1615
1641 return ret; 1616 return ret;
1642} 1617}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index a1e02f4708ab..8a7c849b4dea 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -27,7 +27,6 @@ struct xfs_eofblocks {
27 kgid_t eof_gid; 27 kgid_t eof_gid;
28 prid_t eof_prid; 28 prid_t eof_prid;
29 __u64 eof_min_file_size; 29 __u64 eof_min_file_size;
30 xfs_ino_t eof_scan_owner;
31}; 30};
32 31
33#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
@@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user(
102 dst->eof_flags = src->eof_flags; 101 dst->eof_flags = src->eof_flags;
103 dst->eof_prid = src->eof_prid; 102 dst->eof_prid = src->eof_prid;
104 dst->eof_min_file_size = src->eof_min_file_size; 103 dst->eof_min_file_size = src->eof_min_file_size;
105 dst->eof_scan_owner = NULLFSINO;
106 104
107 dst->eof_uid = INVALID_UID; 105 dst->eof_uid = INVALID_UID;
108 if (src->eof_flags & XFS_EOF_FLAGS_UID) { 106 if (src->eof_flags & XFS_EOF_FLAGS_UID) {
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index de32f0fe47c8..edfa6a55b064 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1692,32 +1692,34 @@ xfs_release(
1692 if (xfs_can_free_eofblocks(ip, false)) { 1692 if (xfs_can_free_eofblocks(ip, false)) {
1693 1693
1694 /* 1694 /*
1695 * Check if the inode is being opened, written and closed
1696 * frequently and we have delayed allocation blocks outstanding
1697 * (e.g. streaming writes from the NFS server), truncating the
1698 * blocks past EOF will cause fragmentation to occur.
1699 *
1700 * In this case don't do the truncation, but we have to be
1701 * careful how we detect this case. Blocks beyond EOF show up as
1702 * i_delayed_blks even when the inode is clean, so we need to
1703 * truncate them away first before checking for a dirty release.
1704 * Hence on the first dirty close we will still remove the
1705 * speculative allocation, but after that we will leave it in
1706 * place.
1707 */
1708 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1709 return 0;
1710 /*
1695 * If we can't get the iolock just skip truncating the blocks 1711 * If we can't get the iolock just skip truncating the blocks
1696 * past EOF because we could deadlock with the mmap_sem 1712 * past EOF because we could deadlock with the mmap_sem
1697 * otherwise. We'll get another chance to drop them once the 1713 * otherwise. We'll get another chance to drop them once the
1698 * last reference to the inode is dropped, so we'll never leak 1714 * last reference to the inode is dropped, so we'll never leak
1699 * blocks permanently. 1715 * blocks permanently.
1700 *
1701 * Further, check if the inode is being opened, written and
1702 * closed frequently and we have delayed allocation blocks
1703 * outstanding (e.g. streaming writes from the NFS server),
1704 * truncating the blocks past EOF will cause fragmentation to
1705 * occur.
1706 *
1707 * In this case don't do the truncation, either, but we have to
1708 * be careful how we detect this case. Blocks beyond EOF show
1709 * up as i_delayed_blks even when the inode is clean, so we
1710 * need to truncate them away first before checking for a dirty
1711 * release. Hence on the first dirty close we will still remove
1712 * the speculative allocation, but after that we will leave it
1713 * in place.
1714 */ 1716 */
1715 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) 1717 if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1716 return 0; 1718 error = xfs_free_eofblocks(ip);
1717 1719 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1718 error = xfs_free_eofblocks(mp, ip, true); 1720 if (error)
1719 if (error && error != -EAGAIN) 1721 return error;
1720 return error; 1722 }
1721 1723
1722 /* delalloc blocks after truncation means it really is dirty */ 1724 /* delalloc blocks after truncation means it really is dirty */
1723 if (ip->i_delayed_blks) 1725 if (ip->i_delayed_blks)
@@ -1904,8 +1906,11 @@ xfs_inactive(
1904 * cache. Post-eof blocks must be freed, lest we end up with 1906 * cache. Post-eof blocks must be freed, lest we end up with
1905 * broken free space accounting. 1907 * broken free space accounting.
1906 */ 1908 */
1907 if (xfs_can_free_eofblocks(ip, true)) 1909 if (xfs_can_free_eofblocks(ip, true)) {
1908 xfs_free_eofblocks(mp, ip, false); 1910 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1911 xfs_free_eofblocks(ip);
1912 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1913 }
1909 1914
1910 return; 1915 return;
1911 } 1916 }
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c67cfb451fd3..cf1363dbf32b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1524,7 +1524,7 @@ out_drop_write:
1524} 1524}
1525 1525
1526STATIC int 1526STATIC int
1527xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) 1527xfs_getbmap_format(void **ap, struct getbmapx *bmv)
1528{ 1528{
1529 struct getbmap __user *base = (struct getbmap __user *)*ap; 1529 struct getbmap __user *base = (struct getbmap __user *)*ap;
1530 1530
@@ -1567,7 +1567,7 @@ xfs_ioc_getbmap(
1567} 1567}
1568 1568
1569STATIC int 1569STATIC int
1570xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) 1570xfs_getbmapx_format(void **ap, struct getbmapx *bmv)
1571{ 1571{
1572 struct getbmapx __user *base = (struct getbmapx __user *)*ap; 1572 struct getbmapx __user *base = (struct getbmapx __user *)*ap;
1573 1573
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1aa3abd67b36..41662fb14e87 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -162,7 +162,7 @@ xfs_iomap_write_direct(
162 xfs_fileoff_t last_fsb; 162 xfs_fileoff_t last_fsb;
163 xfs_filblks_t count_fsb, resaligned; 163 xfs_filblks_t count_fsb, resaligned;
164 xfs_fsblock_t firstfsb; 164 xfs_fsblock_t firstfsb;
165 xfs_extlen_t extsz, temp; 165 xfs_extlen_t extsz;
166 int nimaps; 166 int nimaps;
167 int quota_flag; 167 int quota_flag;
168 int rt; 168 int rt;
@@ -203,14 +203,7 @@ xfs_iomap_write_direct(
203 } 203 }
204 count_fsb = last_fsb - offset_fsb; 204 count_fsb = last_fsb - offset_fsb;
205 ASSERT(count_fsb > 0); 205 ASSERT(count_fsb > 0);
206 206 resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, extsz);
207 resaligned = count_fsb;
208 if (unlikely(extsz)) {
209 if ((temp = do_mod(offset_fsb, extsz)))
210 resaligned += temp;
211 if ((temp = do_mod(resaligned, extsz)))
212 resaligned += extsz - temp;
213 }
214 207
215 if (unlikely(rt)) { 208 if (unlikely(rt)) {
216 resrtextents = qblocks = resaligned; 209 resrtextents = qblocks = resaligned;
@@ -685,7 +678,7 @@ xfs_iomap_write_allocate(
685 int nres; 678 int nres;
686 679
687 if (whichfork == XFS_COW_FORK) 680 if (whichfork == XFS_COW_FORK)
688 flags |= XFS_BMAPI_COWFORK; 681 flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
689 682
690 /* 683 /*
691 * Make sure that the dquots are there. 684 * Make sure that the dquots are there.
@@ -1002,47 +995,31 @@ xfs_file_iomap_begin(
1002 offset_fsb = XFS_B_TO_FSBT(mp, offset); 995 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1003 end_fsb = XFS_B_TO_FSB(mp, offset + length); 996 end_fsb = XFS_B_TO_FSB(mp, offset + length);
1004 997
1005 if (xfs_is_reflink_inode(ip) &&
1006 (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) {
1007 shared = xfs_reflink_find_cow_mapping(ip, offset, &imap);
1008 if (shared) {
1009 xfs_iunlock(ip, lockmode);
1010 goto alloc_done;
1011 }
1012 ASSERT(!isnullstartblock(imap.br_startblock));
1013 }
1014
1015 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, 998 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1016 &nimaps, 0); 999 &nimaps, 0);
1017 if (error) 1000 if (error)
1018 goto out_unlock; 1001 goto out_unlock;
1019 1002
1020 if ((flags & IOMAP_REPORT) || 1003 if (flags & IOMAP_REPORT) {
1021 (xfs_is_reflink_inode(ip) &&
1022 (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) {
1023 /* Trim the mapping to the nearest shared extent boundary. */ 1004 /* Trim the mapping to the nearest shared extent boundary. */
1024 error = xfs_reflink_trim_around_shared(ip, &imap, &shared, 1005 error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
1025 &trimmed); 1006 &trimmed);
1026 if (error) 1007 if (error)
1027 goto out_unlock; 1008 goto out_unlock;
1028
1029 /*
1030 * We're here because we're trying to do a directio write to a
1031 * region that isn't aligned to a filesystem block. If the
1032 * extent is shared, fall back to buffered mode to handle the
1033 * RMW.
1034 */
1035 if (!(flags & IOMAP_REPORT) && shared) {
1036 trace_xfs_reflink_bounce_dio_write(ip, &imap);
1037 error = -EREMCHG;
1038 goto out_unlock;
1039 }
1040 } 1009 }
1041 1010
1042 if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { 1011 if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
1043 error = xfs_reflink_reserve_cow(ip, &imap, &shared); 1012 if (flags & IOMAP_DIRECT) {
1044 if (error) 1013 /* may drop and re-acquire the ilock */
1045 goto out_unlock; 1014 error = xfs_reflink_allocate_cow(ip, &imap, &shared,
1015 &lockmode);
1016 if (error)
1017 goto out_unlock;
1018 } else {
1019 error = xfs_reflink_reserve_cow(ip, &imap, &shared);
1020 if (error)
1021 goto out_unlock;
1022 }
1046 1023
1047 end_fsb = imap.br_startoff + imap.br_blockcount; 1024 end_fsb = imap.br_startoff + imap.br_blockcount;
1048 length = XFS_FSB_TO_B(mp, end_fsb) - offset; 1025 length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1071,7 +1048,6 @@ xfs_file_iomap_begin(
1071 if (error) 1048 if (error)
1072 return error; 1049 return error;
1073 1050
1074alloc_done:
1075 iomap->flags = IOMAP_F_NEW; 1051 iomap->flags = IOMAP_F_NEW;
1076 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); 1052 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
1077 } else { 1053 } else {
@@ -1102,7 +1078,19 @@ xfs_file_iomap_end_delalloc(
1102 xfs_fileoff_t end_fsb; 1078 xfs_fileoff_t end_fsb;
1103 int error = 0; 1079 int error = 0;
1104 1080
1105 start_fsb = XFS_B_TO_FSB(mp, offset + written); 1081 /* behave as if the write failed if drop writes is enabled */
1082 if (xfs_mp_drop_writes(mp))
1083 written = 0;
1084
1085 /*
1086 * start_fsb refers to the first unused block after a short write. If
1087 * nothing was written, round offset down to point at the first block in
1088 * the range.
1089 */
1090 if (unlikely(!written))
1091 start_fsb = XFS_B_TO_FSBT(mp, offset);
1092 else
1093 start_fsb = XFS_B_TO_FSB(mp, offset + written);
1106 end_fsb = XFS_B_TO_FSB(mp, offset + length); 1094 end_fsb = XFS_B_TO_FSB(mp, offset + length);
1107 1095
1108 /* 1096 /*
@@ -1114,6 +1102,9 @@ xfs_file_iomap_end_delalloc(
1114 * blocks in the range, they are ours. 1102 * blocks in the range, they are ours.
1115 */ 1103 */
1116 if (start_fsb < end_fsb) { 1104 if (start_fsb < end_fsb) {
1105 truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
1106 XFS_FSB_TO_B(mp, end_fsb) - 1);
1107
1117 xfs_ilock(ip, XFS_ILOCK_EXCL); 1108 xfs_ilock(ip, XFS_ILOCK_EXCL);
1118 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1109 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1119 end_fsb - start_fsb); 1110 end_fsb - start_fsb);
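
For reference, a minimal userspace sketch of the punch-range computation in the hunk above, assuming 4096-byte filesystem blocks; B_TO_FSBT/B_TO_FSB are simplified stand-ins for the XFS byte-to-block conversion macros (round down and round up respectively), and the offsets are made up:

#include <stdio.h>

#define BLKSZ		4096ULL
#define B_TO_FSBT(b)	((b) / BLKSZ)			/* round down */
#define B_TO_FSB(b)	(((b) + BLKSZ - 1) / BLKSZ)	/* round up   */

static void punch_range(unsigned long long offset, unsigned long long length,
			unsigned long long written)
{
	unsigned long long start_fsb, end_fsb;

	/*
	 * Failed write: punch from the first block of the range.
	 * Short write: punch from the first unused block after it.
	 */
	if (written == 0)
		start_fsb = B_TO_FSBT(offset);
	else
		start_fsb = B_TO_FSB(offset + written);
	end_fsb = B_TO_FSB(offset + length);

	if (start_fsb < end_fsb)
		printf("punch delalloc blocks [%llu, %llu)\n", start_fsb, end_fsb);
	else
		printf("nothing to punch\n");
}

int main(void)
{
	punch_range(10000, 20000, 0);		/* failed:      punch [2, 8) */
	punch_range(10000, 20000, 8000);	/* short write: punch [5, 8) */
	punch_range(10000, 20000, 20000);	/* full write:  nothing      */
	return 0;
}

The drop_writes debug knob introduced elsewhere in this diff forces the written == 0 case, so the whole delalloc reservation (and, with the added truncate_pagecache_range() call, the page cache over it) is torn down as if the write had failed.
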
@@ -1144,7 +1135,7 @@ xfs_file_iomap_end(
1144 return 0; 1135 return 0;
1145} 1136}
1146 1137
1147struct iomap_ops xfs_iomap_ops = { 1138const struct iomap_ops xfs_iomap_ops = {
1148 .iomap_begin = xfs_file_iomap_begin, 1139 .iomap_begin = xfs_file_iomap_begin,
1149 .iomap_end = xfs_file_iomap_end, 1140 .iomap_end = xfs_file_iomap_end,
1150}; 1141};
@@ -1190,6 +1181,6 @@ out_unlock:
1190 return error; 1181 return error;
1191} 1182}
1192 1183
1193struct iomap_ops xfs_xattr_iomap_ops = { 1184const struct iomap_ops xfs_xattr_iomap_ops = {
1194 .iomap_begin = xfs_xattr_iomap_begin, 1185 .iomap_begin = xfs_xattr_iomap_begin,
1195}; 1186};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 6d45cf01fcff..00db3ecea084 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -33,7 +33,27 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
33 struct xfs_bmbt_irec *); 33 struct xfs_bmbt_irec *);
34xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); 34xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
35 35
36extern struct iomap_ops xfs_iomap_ops; 36static inline xfs_filblks_t
37extern struct iomap_ops xfs_xattr_iomap_ops; 37xfs_aligned_fsb_count(
38 xfs_fileoff_t offset_fsb,
39 xfs_filblks_t count_fsb,
40 xfs_extlen_t extsz)
41{
42 if (extsz) {
43 xfs_extlen_t align;
44
45 align = do_mod(offset_fsb, extsz);
46 if (align)
47 count_fsb += align;
48 align = do_mod(count_fsb, extsz);
49 if (align)
50 count_fsb += extsz - align;
51 }
52
53 return count_fsb;
54}
55
56extern const struct iomap_ops xfs_iomap_ops;
57extern const struct iomap_ops xfs_xattr_iomap_ops;
38 58
39#endif /* __XFS_IOMAP_H__*/ 59#endif /* __XFS_IOMAP_H__*/
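
The xfs_aligned_fsb_count() helper added above rounds a file-block range outward so it starts and ends on an extent size hint boundary. Below is a standalone sketch of the same arithmetic, with simplified stand-ins for the kernel types and for do_mod() (which returns the remainder without modifying its argument); the numbers are purely illustrative:

#include <stdio.h>

/* Illustrative stand-ins; not the kernel definitions. */
typedef unsigned long long xfs_fileoff_t;
typedef unsigned long long xfs_filblks_t;
typedef unsigned int xfs_extlen_t;
#define do_mod(a, b)	((a) % (b))

static xfs_filblks_t
aligned_fsb_count(xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb,
		  xfs_extlen_t extsz)
{
	if (extsz) {
		xfs_extlen_t align;

		/* pad the front back to the previous extsz boundary */
		align = do_mod(offset_fsb, extsz);
		if (align)
			count_fsb += align;
		/* pad the back out to the next extsz boundary */
		align = do_mod(count_fsb, extsz);
		if (align)
			count_fsb += extsz - align;
	}
	return count_fsb;
}

int main(void)
{
	/*
	 * offset_fsb = 5, count_fsb = 10, extsz = 4:
	 * 5 % 4 = 1  -> count becomes 11 (front padding)
	 * 11 % 4 = 3 -> count becomes 12 (back padding)
	 * i.e. enough blocks to cover the aligned range 4..15
	 * rather than just 5..14.
	 */
	printf("%llu\n", aligned_fsb_count(5, 10, 4));	/* prints 12 */
	return 0;
}

This is the same rounding that xfs_iomap_write_direct() previously open-coded and that xfs_reflink_allocate_cow() now reuses to size its transaction reservation.
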
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index b5e71072fde5..cc5a9f1574e7 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -124,7 +124,6 @@ struct xlog_ticket;
124struct xfs_log_item; 124struct xfs_log_item;
125struct xfs_item_ops; 125struct xfs_item_ops;
126struct xfs_trans; 126struct xfs_trans;
127struct xfs_log_callback;
128 127
129xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 128xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
130 struct xlog_ticket *ticket, 129 struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index a4ab192e1792..82f1cbcc4de1 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -30,6 +30,9 @@
30#include "xfs_trans_priv.h" 30#include "xfs_trans_priv.h"
31#include "xfs_log.h" 31#include "xfs_log.h"
32#include "xfs_log_priv.h" 32#include "xfs_log_priv.h"
33#include "xfs_trace.h"
34
35struct workqueue_struct *xfs_discard_wq;
33 36
34/* 37/*
35 * Allocate a new ticket. Failing to get a new ticket makes it really hard to 38 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
@@ -491,6 +494,75 @@ xlog_cil_free_logvec(
491 } 494 }
492} 495}
493 496
497static void
498xlog_discard_endio_work(
499 struct work_struct *work)
500{
501 struct xfs_cil_ctx *ctx =
502 container_of(work, struct xfs_cil_ctx, discard_endio_work);
503 struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
504
505 xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
506 kmem_free(ctx);
507}
508
509/*
510 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
511 * pagb_lock. Note that we need an unbounded workqueue, otherwise we might
512 * get the execution delayed up to 30 seconds for weird reasons.
513 */
514static void
515xlog_discard_endio(
516 struct bio *bio)
517{
518 struct xfs_cil_ctx *ctx = bio->bi_private;
519
520 INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
521 queue_work(xfs_discard_wq, &ctx->discard_endio_work);
522}
523
524static void
525xlog_discard_busy_extents(
526 struct xfs_mount *mp,
527 struct xfs_cil_ctx *ctx)
528{
529 struct list_head *list = &ctx->busy_extents;
530 struct xfs_extent_busy *busyp;
531 struct bio *bio = NULL;
532 struct blk_plug plug;
533 int error = 0;
534
535 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
536
537 blk_start_plug(&plug);
538 list_for_each_entry(busyp, list, list) {
539 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
540 busyp->length);
541
542 error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
543 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
544 XFS_FSB_TO_BB(mp, busyp->length),
545 GFP_NOFS, 0, &bio);
546 if (error && error != -EOPNOTSUPP) {
547 xfs_info(mp,
548 "discard failed for extent [0x%llx,%u], error %d",
549 (unsigned long long)busyp->bno,
550 busyp->length,
551 error);
552 break;
553 }
554 }
555
556 if (bio) {
557 bio->bi_private = ctx;
558 bio->bi_end_io = xlog_discard_endio;
559 submit_bio(bio);
560 } else {
561 xlog_discard_endio_work(&ctx->discard_endio_work);
562 }
563 blk_finish_plug(&plug);
564}
565
494/* 566/*
495 * Mark all items committed and clear busy extents. We free the log vector 567 * Mark all items committed and clear busy extents. We free the log vector
496 * chains in a separate pass so that we unpin the log items as quickly as 568 * chains in a separate pass so that we unpin the log items as quickly as
@@ -525,14 +597,10 @@ xlog_cil_committed(
525 597
526 xlog_cil_free_logvec(ctx->lv_chain); 598 xlog_cil_free_logvec(ctx->lv_chain);
527 599
528 if (!list_empty(&ctx->busy_extents)) { 600 if (!list_empty(&ctx->busy_extents))
529 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); 601 xlog_discard_busy_extents(mp, ctx);
530 602 else
531 xfs_discard_extents(mp, &ctx->busy_extents); 603 kmem_free(ctx);
532 xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
533 }
534
535 kmem_free(ctx);
536} 604}
537 605
538/* 606/*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2b6eec52178e..c2604a5366f2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -257,6 +257,7 @@ struct xfs_cil_ctx {
257 struct xfs_log_vec *lv_chain; /* logvecs being pushed */ 257 struct xfs_log_vec *lv_chain; /* logvecs being pushed */
258 struct xfs_log_callback log_cb; /* completion callback hook. */ 258 struct xfs_log_callback log_cb; /* completion callback hook. */
259 struct list_head committing; /* ctx committing list */ 259 struct list_head committing; /* ctx committing list */
260 struct work_struct discard_endio_work;
260}; 261};
261 262
262/* 263/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 9b9540db17a6..450bde68bb75 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,6 +45,7 @@
45#include "xfs_rmap_btree.h" 45#include "xfs_rmap_btree.h"
46#include "xfs_refcount_btree.h" 46#include "xfs_refcount_btree.h"
47#include "xfs_reflink.h" 47#include "xfs_reflink.h"
48#include "xfs_extent_busy.h"
48 49
49 50
50static DEFINE_MUTEX(xfs_uuid_table_mutex); 51static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -187,7 +188,7 @@ xfs_initialize_perag(
187 xfs_agnumber_t *maxagi) 188 xfs_agnumber_t *maxagi)
188{ 189{
189 xfs_agnumber_t index; 190 xfs_agnumber_t index;
190 xfs_agnumber_t first_initialised = 0; 191 xfs_agnumber_t first_initialised = NULLAGNUMBER;
191 xfs_perag_t *pag; 192 xfs_perag_t *pag;
192 int error = -ENOMEM; 193 int error = -ENOMEM;
193 194
@@ -202,22 +203,21 @@ xfs_initialize_perag(
202 xfs_perag_put(pag); 203 xfs_perag_put(pag);
203 continue; 204 continue;
204 } 205 }
205 if (!first_initialised)
206 first_initialised = index;
207 206
208 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); 207 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
209 if (!pag) 208 if (!pag)
210 goto out_unwind; 209 goto out_unwind_new_pags;
211 pag->pag_agno = index; 210 pag->pag_agno = index;
212 pag->pag_mount = mp; 211 pag->pag_mount = mp;
213 spin_lock_init(&pag->pag_ici_lock); 212 spin_lock_init(&pag->pag_ici_lock);
214 mutex_init(&pag->pag_ici_reclaim_lock); 213 mutex_init(&pag->pag_ici_reclaim_lock);
215 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 214 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
216 if (xfs_buf_hash_init(pag)) 215 if (xfs_buf_hash_init(pag))
217 goto out_unwind; 216 goto out_free_pag;
217 init_waitqueue_head(&pag->pagb_wait);
218 218
219 if (radix_tree_preload(GFP_NOFS)) 219 if (radix_tree_preload(GFP_NOFS))
220 goto out_unwind; 220 goto out_hash_destroy;
221 221
222 spin_lock(&mp->m_perag_lock); 222 spin_lock(&mp->m_perag_lock);
223 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 223 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
@@ -225,10 +225,13 @@ xfs_initialize_perag(
225 spin_unlock(&mp->m_perag_lock); 225 spin_unlock(&mp->m_perag_lock);
226 radix_tree_preload_end(); 226 radix_tree_preload_end();
227 error = -EEXIST; 227 error = -EEXIST;
228 goto out_unwind; 228 goto out_hash_destroy;
229 } 229 }
230 spin_unlock(&mp->m_perag_lock); 230 spin_unlock(&mp->m_perag_lock);
231 radix_tree_preload_end(); 231 radix_tree_preload_end();
232 /* first new pag is fully initialized */
233 if (first_initialised == NULLAGNUMBER)
234 first_initialised = index;
232 } 235 }
233 236
234 index = xfs_set_inode_alloc(mp, agcount); 237 index = xfs_set_inode_alloc(mp, agcount);
@@ -239,11 +242,16 @@ xfs_initialize_perag(
239 mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); 242 mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
240 return 0; 243 return 0;
241 244
242out_unwind: 245out_hash_destroy:
243 xfs_buf_hash_destroy(pag); 246 xfs_buf_hash_destroy(pag);
247out_free_pag:
244 kmem_free(pag); 248 kmem_free(pag);
245 for (; index > first_initialised; index--) { 249out_unwind_new_pags:
250 /* unwind any prior newly initialized pags */
251 for (index = first_initialised; index < agcount; index++) {
246 pag = radix_tree_delete(&mp->m_perag_tree, index); 252 pag = radix_tree_delete(&mp->m_perag_tree, index);
253 if (!pag)
254 break;
247 xfs_buf_hash_destroy(pag); 255 xfs_buf_hash_destroy(pag);
248 kmem_free(pag); 256 kmem_free(pag);
249 } 257 }
@@ -1073,6 +1081,13 @@ xfs_unmountfs(
1073 xfs_log_force(mp, XFS_LOG_SYNC); 1081 xfs_log_force(mp, XFS_LOG_SYNC);
1074 1082
1075 /* 1083 /*
1084 * Wait for all busy extents to be freed, including completion of
1085 * any discard operation.
1086 */
1087 xfs_extent_busy_wait_all(mp);
1088 flush_workqueue(xfs_discard_wq);
1089
1090 /*
1076 * We now need to tell the world we are unmounting. This will allow 1091 * We now need to tell the world we are unmounting. This will allow
1077 * us to detect that the filesystem is going away and we should error 1092 * us to detect that the filesystem is going away and we should error
1078 * out anything that we have been retrying in the background. This will 1093 * out anything that we have been retrying in the background. This will
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7f351f706b7a..6db6fd6b82b0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -200,11 +200,12 @@ typedef struct xfs_mount {
200 /* 200 /*
201 * DEBUG mode instrumentation to test and/or trigger delayed allocation 201 * DEBUG mode instrumentation to test and/or trigger delayed allocation
202 * block killing in the event of failed writes. When enabled, all 202 * block killing in the event of failed writes. When enabled, all
203 * buffered writes are forced to fail. All delalloc blocks in the range 203 * buffered writes are silently dropped and handled as if they failed.
204 * of the write (including pre-existing delalloc blocks!) are tossed as 204 * All delalloc blocks in the range of the write (including pre-existing
205 * part of the write failure error handling sequence. 205 * delalloc blocks!) are tossed as part of the write failure error
206 * handling sequence.
206 */ 207 */
207 bool m_fail_writes; 208 bool m_drop_writes;
208#endif 209#endif
209} xfs_mount_t; 210} xfs_mount_t;
210 211
@@ -325,13 +326,13 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
325 326
326#ifdef DEBUG 327#ifdef DEBUG
327static inline bool 328static inline bool
328xfs_mp_fail_writes(struct xfs_mount *mp) 329xfs_mp_drop_writes(struct xfs_mount *mp)
329{ 330{
330 return mp->m_fail_writes; 331 return mp->m_drop_writes;
331} 332}
332#else 333#else
333static inline bool 334static inline bool
334xfs_mp_fail_writes(struct xfs_mount *mp) 335xfs_mp_drop_writes(struct xfs_mount *mp)
335{ 336{
336 return 0; 337 return 0;
337} 338}
@@ -384,6 +385,8 @@ typedef struct xfs_perag {
384 xfs_agino_t pagl_rightrec; 385 xfs_agino_t pagl_rightrec;
385 spinlock_t pagb_lock; /* lock for pagb_tree */ 386 spinlock_t pagb_lock; /* lock for pagb_tree */
386 struct rb_root pagb_tree; /* ordered tree of busy extents */ 387 struct rb_root pagb_tree; /* ordered tree of busy extents */
388 unsigned int pagb_gen; /* generation count for pagb_tree */
389 wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */
387 390
388 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 391 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
389 392
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 07593a362cd0..da6d08fb359c 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -82,11 +82,22 @@
82 * mappings are a reservation against the free space in the filesystem; 82 * mappings are a reservation against the free space in the filesystem;
83 * adjacent mappings can also be combined into fewer larger mappings. 83 * adjacent mappings can also be combined into fewer larger mappings.
84 * 84 *
85 * As an optimization, the CoW extent size hint (cowextsz) creates
86 * outsized aligned delalloc reservations in the hope of landing out of
87 * order nearby CoW writes in a single extent on disk, thereby reducing
88 * fragmentation and improving future performance.
89 *
90 * D: --RRRRRRSSSRRRRRRRR--- (data fork)
91 * C: ------DDDDDDD--------- (CoW fork)
92 *
85 * When dirty pages are being written out (typically in writepage), the 93 * When dirty pages are being written out (typically in writepage), the
86 * delalloc reservations are converted into real mappings by allocating 94 * delalloc reservations are converted into unwritten mappings by
87 * blocks and replacing the delalloc mapping with real ones. A delalloc 95 * allocating blocks and replacing the delalloc mapping with real ones.
88 * mapping can be replaced by several real ones if the free space is 96 * A delalloc mapping can be replaced by several unwritten ones if the
89 * fragmented. 97 * free space is fragmented.
98 *
99 * D: --RRRRRRSSSRRRRRRRR---
100 * C: ------UUUUUUU---------
90 * 101 *
91 * We want to adapt the delalloc mechanism for copy-on-write, since the 102 * We want to adapt the delalloc mechanism for copy-on-write, since the
92 * write paths are similar. The first two steps (creating the reservation 103 * write paths are similar. The first two steps (creating the reservation
@@ -101,13 +112,29 @@
101 * Block-aligned directio writes will use the same mechanism as buffered 112 * Block-aligned directio writes will use the same mechanism as buffered
102 * writes. 113 * writes.
103 * 114 *
115 * Just prior to submitting the actual disk write requests, we convert
116 * the extents representing the range of the file actually being written
117 * (as opposed to extra pieces created for the cowextsize hint) to real
118 * extents. This will become important in the next step:
119 *
120 * D: --RRRRRRSSSRRRRRRRR---
121 * C: ------UUrrUUU---------
122 *
104 * CoW remapping must be done after the data block write completes, 123 * CoW remapping must be done after the data block write completes,
105 * because we don't want to destroy the old data fork map until we're sure 124 * because we don't want to destroy the old data fork map until we're sure
106 * the new block has been written. Since the new mappings are kept in a 125 * the new block has been written. Since the new mappings are kept in a
107 * separate fork, we can simply iterate these mappings to find the ones 126 * separate fork, we can simply iterate these mappings to find the ones
108 * that cover the file blocks that we just CoW'd. For each extent, simply 127 * that cover the file blocks that we just CoW'd. For each extent, simply
109 * unmap the corresponding range in the data fork, map the new range into 128 * unmap the corresponding range in the data fork, map the new range into
110 * the data fork, and remove the extent from the CoW fork. 129 * the data fork, and remove the extent from the CoW fork. Because of
130 * the presence of the cowextsize hint, however, we must be careful
131 * only to remap the blocks that we've actually written out -- we must
132 * never remap delalloc reservations nor CoW staging blocks that have
133 * yet to be written. This corresponds exactly to the real extents in
134 * the CoW fork:
135 *
136 * D: --RRRRRRrrSRRRRRRRR---
137 * C: ------UU--UUU---------
111 * 138 *
112 * Since the remapping operation can be applied to an arbitrary file 139 * Since the remapping operation can be applied to an arbitrary file
113 * range, we record the need for the remap step as a flag in the ioend 140 * range, we record the need for the remap step as a flag in the ioend
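
The staging sequence laid out in this comment can be summarized as a small state machine. The sketch below is purely illustrative (the enum and function names are invented, not XFS API) and only restates the transitions described above: a delalloc reservation at write time, unwritten staging extents at writeback, conversion to real extents just before the disk write is issued, and remapping into the data fork at I/O completion.

#include <stdio.h>

/* Illustrative only; names are not part of XFS. */
enum cow_stage {
	COW_NONE,	/* block not staged in the CoW fork          */
	COW_DELALLOC,	/* reserved, no disk blocks allocated yet    */
	COW_UNWRITTEN,	/* blocks allocated, data not yet written    */
	COW_REAL,	/* data written, safe to remap               */
	COW_REMAPPED,	/* moved into the data fork, CoW fork clean  */
};

static enum cow_stage cow_advance(enum cow_stage s)
{
	switch (s) {
	case COW_NONE:		return COW_DELALLOC;	/* write hits a shared block  */
	case COW_DELALLOC:	return COW_UNWRITTEN;	/* writeback allocates space  */
	case COW_UNWRITTEN:	return COW_REAL;	/* converted just before I/O  */
	case COW_REAL:		return COW_REMAPPED;	/* remapped at I/O completion */
	default:		return s;
	}
}

int main(void)
{
	enum cow_stage s = COW_NONE;

	while (s != COW_REMAPPED) {
		s = cow_advance(s);
		printf("stage %d\n", s);
	}
	return 0;
}

Only extents in the "real" state may be remapped; the xfs_reflink_end_cow() hunk later in this diff skips unwritten extents for exactly this reason.
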
@@ -296,103 +323,165 @@ xfs_reflink_reserve_cow(
296 return 0; 323 return 0;
297} 324}
298 325
299/* Allocate all CoW reservations covering a range of blocks in a file. */ 326/* Convert part of an unwritten CoW extent to a real one. */
300static int 327STATIC int
301__xfs_reflink_allocate_cow( 328xfs_reflink_convert_cow_extent(
302 struct xfs_inode *ip, 329 struct xfs_inode *ip,
303 xfs_fileoff_t *offset_fsb, 330 struct xfs_bmbt_irec *imap,
304 xfs_fileoff_t end_fsb) 331 xfs_fileoff_t offset_fsb,
332 xfs_filblks_t count_fsb,
333 struct xfs_defer_ops *dfops)
305{ 334{
306 struct xfs_mount *mp = ip->i_mount; 335 xfs_fsblock_t first_block;
307 struct xfs_bmbt_irec imap; 336 int nimaps = 1;
308 struct xfs_defer_ops dfops;
309 struct xfs_trans *tp;
310 xfs_fsblock_t first_block;
311 int nimaps = 1, error;
312 bool shared;
313
314 xfs_defer_init(&dfops, &first_block);
315 337
316 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 338 if (imap->br_state == XFS_EXT_NORM)
317 XFS_TRANS_RESERVE, &tp); 339 return 0;
318 if (error)
319 return error;
320 340
321 xfs_ilock(ip, XFS_ILOCK_EXCL); 341 xfs_trim_extent(imap, offset_fsb, count_fsb);
342 trace_xfs_reflink_convert_cow(ip, imap);
343 if (imap->br_blockcount == 0)
344 return 0;
345 return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
346 XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block,
347 0, imap, &nimaps, dfops);
348}
322 349
323 /* Read extent from the source file. */ 350/* Convert all of the unwritten CoW extents in a file's range to real ones. */
324 nimaps = 1; 351int
325 error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, 352xfs_reflink_convert_cow(
326 &imap, &nimaps, 0); 353 struct xfs_inode *ip,
327 if (error) 354 xfs_off_t offset,
328 goto out_unlock; 355 xfs_off_t count)
329 ASSERT(nimaps == 1); 356{
357 struct xfs_bmbt_irec got;
358 struct xfs_defer_ops dfops;
359 struct xfs_mount *mp = ip->i_mount;
360 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
361 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
362 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
363 xfs_extnum_t idx;
364 bool found;
365 int error = 0;
330 366
331 error = xfs_reflink_reserve_cow(ip, &imap, &shared); 367 xfs_ilock(ip, XFS_ILOCK_EXCL);
332 if (error)
333 goto out_trans_cancel;
334 368
335 if (!shared) { 369 /* Convert all the extents to real from unwritten. */
336 *offset_fsb = imap.br_startoff + imap.br_blockcount; 370 for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
337 goto out_trans_cancel; 371 found && got.br_startoff < end_fsb;
372 found = xfs_iext_get_extent(ifp, ++idx, &got)) {
373 error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb,
374 end_fsb - offset_fsb, &dfops);
375 if (error)
376 break;
338 } 377 }
339 378
340 xfs_trans_ijoin(tp, ip, 0); 379 /* Finish up. */
341 error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
342 XFS_BMAPI_COWFORK, &first_block,
343 XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
344 &imap, &nimaps, &dfops);
345 if (error)
346 goto out_trans_cancel;
347
348 error = xfs_defer_finish(&tp, &dfops, NULL);
349 if (error)
350 goto out_trans_cancel;
351
352 error = xfs_trans_commit(tp);
353
354 *offset_fsb = imap.br_startoff + imap.br_blockcount;
355out_unlock:
356 xfs_iunlock(ip, XFS_ILOCK_EXCL); 380 xfs_iunlock(ip, XFS_ILOCK_EXCL);
357 return error; 381 return error;
358out_trans_cancel:
359 xfs_defer_cancel(&dfops);
360 xfs_trans_cancel(tp);
361 goto out_unlock;
362} 382}
363 383
364/* Allocate all CoW reservations covering a part of a file. */ 384/* Allocate all CoW reservations covering a range of blocks in a file. */
365int 385int
366xfs_reflink_allocate_cow_range( 386xfs_reflink_allocate_cow(
367 struct xfs_inode *ip, 387 struct xfs_inode *ip,
368 xfs_off_t offset, 388 struct xfs_bmbt_irec *imap,
369 xfs_off_t count) 389 bool *shared,
390 uint *lockmode)
370{ 391{
371 struct xfs_mount *mp = ip->i_mount; 392 struct xfs_mount *mp = ip->i_mount;
372 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 393 xfs_fileoff_t offset_fsb = imap->br_startoff;
373 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); 394 xfs_filblks_t count_fsb = imap->br_blockcount;
374 int error; 395 struct xfs_bmbt_irec got;
396 struct xfs_defer_ops dfops;
397 struct xfs_trans *tp = NULL;
398 xfs_fsblock_t first_block;
399 int nimaps, error = 0;
400 bool trimmed;
401 xfs_filblks_t resaligned;
402 xfs_extlen_t resblks = 0;
403 xfs_extnum_t idx;
375 404
405retry:
376 ASSERT(xfs_is_reflink_inode(ip)); 406 ASSERT(xfs_is_reflink_inode(ip));
377 407 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
378 trace_xfs_reflink_allocate_cow_range(ip, offset, count);
379 408
380 /* 409 /*
381 * Make sure that the dquots are there. 410 * Even if the extent is not shared we might have a preallocation for
411 * it in the COW fork. If so use it.
382 */ 412 */
383 error = xfs_qm_dqattach(ip, 0); 413 if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) &&
384 if (error) 414 got.br_startoff <= offset_fsb) {
385 return error; 415 *shared = true;
386 416
387 while (offset_fsb < end_fsb) { 417 /* If we have a real allocation in the COW fork we're done. */
388 error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb); 418 if (!isnullstartblock(got.br_startblock)) {
389 if (error) { 419 xfs_trim_extent(&got, offset_fsb, count_fsb);
390 trace_xfs_reflink_allocate_cow_range_error(ip, error, 420 *imap = got;
391 _RET_IP_); 421 goto convert;
392 break;
393 } 422 }
423
424 xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
425 } else {
426 error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
427 if (error || !*shared)
428 goto out;
429 }
430
431 if (!tp) {
432 resaligned = xfs_aligned_fsb_count(imap->br_startoff,
433 imap->br_blockcount, xfs_get_cowextsz_hint(ip));
434 resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
435
436 xfs_iunlock(ip, *lockmode);
437 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
438 *lockmode = XFS_ILOCK_EXCL;
439 xfs_ilock(ip, *lockmode);
440
441 if (error)
442 return error;
443
444 error = xfs_qm_dqattach_locked(ip, 0);
445 if (error)
446 goto out;
447 goto retry;
394 } 448 }
395 449
450 error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
451 XFS_QMOPT_RES_REGBLKS);
452 if (error)
453 goto out;
454
455 xfs_trans_ijoin(tp, ip, 0);
456
457 xfs_defer_init(&dfops, &first_block);
458 nimaps = 1;
459
460 /* Allocate the entire reservation as unwritten blocks. */
461 error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
462 XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block,
463 resblks, imap, &nimaps, &dfops);
464 if (error)
465 goto out_bmap_cancel;
466
467 /* Finish up. */
468 error = xfs_defer_finish(&tp, &dfops, NULL);
469 if (error)
470 goto out_bmap_cancel;
471
472 error = xfs_trans_commit(tp);
473 if (error)
474 return error;
475convert:
476 return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb,
477 &dfops);
478out_bmap_cancel:
479 xfs_defer_cancel(&dfops);
480 xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
481 XFS_QMOPT_RES_REGBLKS);
482out:
483 if (tp)
484 xfs_trans_cancel(tp);
396 return error; 485 return error;
397} 486}
398 487
@@ -641,6 +730,16 @@ xfs_reflink_end_cow(
641 730
642 ASSERT(!isnullstartblock(got.br_startblock)); 731 ASSERT(!isnullstartblock(got.br_startblock));
643 732
733 /*
734 * Don't remap unwritten extents; these are
735 * speculatively preallocated CoW extents that have been
736 * allocated but have not yet been involved in a write.
737 */
738 if (got.br_state == XFS_EXT_UNWRITTEN) {
739 idx--;
740 goto next_extent;
741 }
742
644 /* Unmap the old blocks in the data fork. */ 743 /* Unmap the old blocks in the data fork. */
645 xfs_defer_init(&dfops, &firstfsb); 744 xfs_defer_init(&dfops, &firstfsb);
646 rlen = del.br_blockcount; 745 rlen = del.br_blockcount;
@@ -855,13 +954,14 @@ STATIC int
855xfs_reflink_update_dest( 954xfs_reflink_update_dest(
856 struct xfs_inode *dest, 955 struct xfs_inode *dest,
857 xfs_off_t newlen, 956 xfs_off_t newlen,
858 xfs_extlen_t cowextsize) 957 xfs_extlen_t cowextsize,
958 bool is_dedupe)
859{ 959{
860 struct xfs_mount *mp = dest->i_mount; 960 struct xfs_mount *mp = dest->i_mount;
861 struct xfs_trans *tp; 961 struct xfs_trans *tp;
862 int error; 962 int error;
863 963
864 if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) 964 if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
865 return 0; 965 return 0;
866 966
867 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); 967 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
@@ -882,6 +982,10 @@ xfs_reflink_update_dest(
882 dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; 982 dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
883 } 983 }
884 984
985 if (!is_dedupe) {
986 xfs_trans_ichgtime(tp, dest,
987 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
988 }
885 xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); 989 xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
886 990
887 error = xfs_trans_commit(tp); 991 error = xfs_trans_commit(tp);
@@ -1195,7 +1299,8 @@ xfs_reflink_remap_range(
1195 !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) 1299 !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
1196 cowextsize = src->i_d.di_cowextsize; 1300 cowextsize = src->i_d.di_cowextsize;
1197 1301
1198 ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); 1302 ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1303 is_dedupe);
1199 1304
1200out_unlock: 1305out_unlock:
1201 xfs_iunlock(src, XFS_MMAPLOCK_EXCL); 1306 xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index aa6a4d64bd35..33ac9b8db683 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -28,8 +28,10 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
28 28
29extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, 29extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
30 struct xfs_bmbt_irec *imap, bool *shared); 30 struct xfs_bmbt_irec *imap, bool *shared);
31extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, 31extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
32 xfs_off_t offset, xfs_off_t count); 32 struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
33extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
34 xfs_off_t count);
33extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, 35extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
34 struct xfs_bmbt_irec *imap); 36 struct xfs_bmbt_irec *imap);
35extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, 37extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 802bcc326d9f..c57aa7f18087 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1093,7 +1093,6 @@ xfs_rtallocate_extent(
1093 xfs_extlen_t minlen, /* minimum length to allocate */ 1093 xfs_extlen_t minlen, /* minimum length to allocate */
1094 xfs_extlen_t maxlen, /* maximum length to allocate */ 1094 xfs_extlen_t maxlen, /* maximum length to allocate */
1095 xfs_extlen_t *len, /* out: actual length allocated */ 1095 xfs_extlen_t *len, /* out: actual length allocated */
1096 xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */
1097 int wasdel, /* was a delayed allocation extent */ 1096 int wasdel, /* was a delayed allocation extent */
1098 xfs_extlen_t prod, /* extent product factor */ 1097 xfs_extlen_t prod, /* extent product factor */
1099 xfs_rtblock_t *rtblock) /* out: start block allocated */ 1098 xfs_rtblock_t *rtblock) /* out: start block allocated */
@@ -1123,27 +1122,16 @@ xfs_rtallocate_extent(
1123 } 1122 }
1124 } 1123 }
1125 1124
1125retry:
1126 sumbp = NULL; 1126 sumbp = NULL;
1127 /* 1127 if (bno == 0) {
1128 * Allocate by size, or near another block, or exactly at some block.
1129 */
1130 switch (type) {
1131 case XFS_ALLOCTYPE_ANY_AG:
1132 error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, 1128 error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
1133 &sumbp, &sb, prod, &r); 1129 &sumbp, &sb, prod, &r);
1134 break; 1130 } else {
1135 case XFS_ALLOCTYPE_NEAR_BNO:
1136 error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, 1131 error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
1137 len, &sumbp, &sb, prod, &r); 1132 len, &sumbp, &sb, prod, &r);
1138 break;
1139 case XFS_ALLOCTYPE_THIS_BNO:
1140 error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen,
1141 len, &sumbp, &sb, prod, &r);
1142 break;
1143 default:
1144 error = -EIO;
1145 ASSERT(0);
1146 } 1133 }
1134
1147 if (error) 1135 if (error)
1148 return error; 1136 return error;
1149 1137
@@ -1158,7 +1146,11 @@ xfs_rtallocate_extent(
1158 xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); 1146 xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
1159 else 1147 else
1160 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); 1148 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
1149 } else if (prod > 1) {
1150 prod = 1;
1151 goto retry;
1161 } 1152 }
1153
1162 *rtblock = r; 1154 *rtblock = r;
1163 return 0; 1155 return 0;
1164} 1156}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 355dd9e1cb64..51dd3c726608 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -40,7 +40,6 @@ xfs_rtallocate_extent(
40 xfs_extlen_t minlen, /* minimum length to allocate */ 40 xfs_extlen_t minlen, /* minimum length to allocate */
41 xfs_extlen_t maxlen, /* maximum length to allocate */ 41 xfs_extlen_t maxlen, /* maximum length to allocate */
42 xfs_extlen_t *len, /* out: actual length allocated */ 42 xfs_extlen_t *len, /* out: actual length allocated */
43 xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */
44 int wasdel, /* was a delayed allocation extent */ 43 int wasdel, /* was a delayed allocation extent */
45 xfs_extlen_t prod, /* extent product factor */ 44 xfs_extlen_t prod, /* extent product factor */
46 xfs_rtblock_t *rtblock); /* out: start block allocated */ 45 xfs_rtblock_t *rtblock); /* out: start block allocated */
@@ -122,7 +121,7 @@ int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
122 121
123 122
124#else 123#else
125# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) 124# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
126# define xfs_rtfree_extent(t,b,l) (ENOSYS) 125# define xfs_rtfree_extent(t,b,l) (ENOSYS)
127# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) 126# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
128# define xfs_growfs_rt(mp,in) (ENOSYS) 127# define xfs_growfs_rt(mp,in) (ENOSYS)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index eecbaac08eba..890862f2447c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1956,12 +1956,20 @@ xfs_init_workqueues(void)
1956 if (!xfs_alloc_wq) 1956 if (!xfs_alloc_wq)
1957 return -ENOMEM; 1957 return -ENOMEM;
1958 1958
1959 xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0);
1960 if (!xfs_discard_wq)
1961 goto out_free_alloc_wq;
1962
1959 return 0; 1963 return 0;
1964out_free_alloc_wq:
1965 destroy_workqueue(xfs_alloc_wq);
1966 return -ENOMEM;
1960} 1967}
1961 1968
1962STATIC void 1969STATIC void
1963xfs_destroy_workqueues(void) 1970xfs_destroy_workqueues(void)
1964{ 1971{
1972 destroy_workqueue(xfs_discard_wq);
1965 destroy_workqueue(xfs_alloc_wq); 1973 destroy_workqueue(xfs_alloc_wq);
1966} 1974}
1967 1975
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index b6418abd85ad..5f2f32408011 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -73,6 +73,8 @@ extern const struct quotactl_ops xfs_quotactl_operations;
73 73
74extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); 74extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
75 75
76extern struct workqueue_struct *xfs_discard_wq;
77
76#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 78#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
77 79
78#endif /* __XFS_SUPER_H__ */ 80#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index de6195e38910..80ac15fb9638 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -93,7 +93,7 @@ to_mp(struct kobject *kobject)
93#ifdef DEBUG 93#ifdef DEBUG
94 94
95STATIC ssize_t 95STATIC ssize_t
96fail_writes_store( 96drop_writes_store(
97 struct kobject *kobject, 97 struct kobject *kobject,
98 const char *buf, 98 const char *buf,
99 size_t count) 99 size_t count)
@@ -107,9 +107,9 @@ fail_writes_store(
107 return ret; 107 return ret;
108 108
109 if (val == 1) 109 if (val == 1)
110 mp->m_fail_writes = true; 110 mp->m_drop_writes = true;
111 else if (val == 0) 111 else if (val == 0)
112 mp->m_fail_writes = false; 112 mp->m_drop_writes = false;
113 else 113 else
114 return -EINVAL; 114 return -EINVAL;
115 115
@@ -117,21 +117,21 @@ fail_writes_store(
117} 117}
118 118
119STATIC ssize_t 119STATIC ssize_t
120fail_writes_show( 120drop_writes_show(
121 struct kobject *kobject, 121 struct kobject *kobject,
122 char *buf) 122 char *buf)
123{ 123{
124 struct xfs_mount *mp = to_mp(kobject); 124 struct xfs_mount *mp = to_mp(kobject);
125 125
126 return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0); 126 return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_drop_writes ? 1 : 0);
127} 127}
128XFS_SYSFS_ATTR_RW(fail_writes); 128XFS_SYSFS_ATTR_RW(drop_writes);
129 129
130#endif /* DEBUG */ 130#endif /* DEBUG */
131 131
132static struct attribute *xfs_mp_attrs[] = { 132static struct attribute *xfs_mp_attrs[] = {
133#ifdef DEBUG 133#ifdef DEBUG
134 ATTR_LIST(fail_writes), 134 ATTR_LIST(drop_writes),
135#endif 135#endif
136 NULL, 136 NULL,
137}; 137};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 69c5bcd9a51b..fb7555e73a62 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2245,7 +2245,6 @@ DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);
2245 2245
2246/* deferred ops */ 2246/* deferred ops */
2247struct xfs_defer_pending; 2247struct xfs_defer_pending;
2248struct xfs_defer_intake;
2249struct xfs_defer_ops; 2248struct xfs_defer_ops;
2250 2249
2251DECLARE_EVENT_CLASS(xfs_defer_class, 2250DECLARE_EVENT_CLASS(xfs_defer_class,
@@ -3089,6 +3088,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
3089 __field(xfs_fileoff_t, lblk) 3088 __field(xfs_fileoff_t, lblk)
3090 __field(xfs_extlen_t, len) 3089 __field(xfs_extlen_t, len)
3091 __field(xfs_fsblock_t, pblk) 3090 __field(xfs_fsblock_t, pblk)
3091 __field(int, state)
3092 ), 3092 ),
3093 TP_fast_assign( 3093 TP_fast_assign(
3094 __entry->dev = VFS_I(ip)->i_sb->s_dev; 3094 __entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -3096,13 +3096,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
3096 __entry->lblk = irec->br_startoff; 3096 __entry->lblk = irec->br_startoff;
3097 __entry->len = irec->br_blockcount; 3097 __entry->len = irec->br_blockcount;
3098 __entry->pblk = irec->br_startblock; 3098 __entry->pblk = irec->br_startblock;
3099 __entry->state = irec->br_state;
3099 ), 3100 ),
3100 TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", 3101 TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d",
3101 MAJOR(__entry->dev), MINOR(__entry->dev), 3102 MAJOR(__entry->dev), MINOR(__entry->dev),
3102 __entry->ino, 3103 __entry->ino,
3103 __entry->lblk, 3104 __entry->lblk,
3104 __entry->len, 3105 __entry->len,
3105 __entry->pblk) 3106 __entry->pblk,
3107 __entry->state)
3106); 3108);
3107#define DEFINE_INODE_IREC_EVENT(name) \ 3109#define DEFINE_INODE_IREC_EVENT(name) \
3108DEFINE_EVENT(xfs_inode_irec_class, name, \ 3110DEFINE_EVENT(xfs_inode_irec_class, name, \
@@ -3242,11 +3244,11 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
3242DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); 3244DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
3243DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); 3245DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
3244DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); 3246DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
3247DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
3245 3248
3246DEFINE_RW_EVENT(xfs_reflink_reserve_cow); 3249DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
3247DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
3248 3250
3249DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); 3251DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
3250DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); 3252DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
3251DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); 3253DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec);
3252 3254
@@ -3254,7 +3256,6 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
3254DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); 3256DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
3255DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); 3257DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
3256 3258
3257DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
3258DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); 3259DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
3259DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); 3260DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
3260 3261
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 61b7fbdd3ebd..1646f659b60f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -32,7 +32,6 @@ struct xfs_mount;
32struct xfs_trans; 32struct xfs_trans;
33struct xfs_trans_res; 33struct xfs_trans_res;
34struct xfs_dquot_acct; 34struct xfs_dquot_acct;
35struct xfs_busy_extent;
36struct xfs_rud_log_item; 35struct xfs_rud_log_item;
37struct xfs_rui_log_item; 36struct xfs_rui_log_item;
38struct xfs_btree_cur; 37struct xfs_btree_cur;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 24ad71173995..2983e52efd07 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -37,9 +37,9 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
37} 37}
38 38
39ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, 39ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
40 struct iomap_ops *ops); 40 const struct iomap_ops *ops);
41int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 41int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
42 struct iomap_ops *ops); 42 const struct iomap_ops *ops);
43int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); 43int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
44int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); 44int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
45int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 45int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
@@ -72,7 +72,7 @@ static inline unsigned int dax_radix_order(void *entry)
72 return 0; 72 return 0;
73} 73}
74int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, 74int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
75 pmd_t *pmd, unsigned int flags, struct iomap_ops *ops); 75 pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops);
76#else 76#else
77static inline unsigned int dax_radix_order(void *entry) 77static inline unsigned int dax_radix_order(void *entry)
78{ 78{
@@ -80,7 +80,7 @@ static inline unsigned int dax_radix_order(void *entry)
80} 80}
81static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, 81static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
82 unsigned long address, pmd_t *pmd, unsigned int flags, 82 unsigned long address, pmd_t *pmd, unsigned int flags,
83 struct iomap_ops *ops) 83 const struct iomap_ops *ops)
84{ 84{
85 return VM_FAULT_FALLBACK; 85 return VM_FAULT_FALLBACK;
86} 86}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index a4c94b86401e..891459caa278 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -72,17 +72,17 @@ struct iomap_ops {
72}; 72};
73 73
74ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, 74ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
75 struct iomap_ops *ops); 75 const struct iomap_ops *ops);
76int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, 76int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
77 struct iomap_ops *ops); 77 const struct iomap_ops *ops);
78int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, 78int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
79 bool *did_zero, struct iomap_ops *ops); 79 bool *did_zero, const struct iomap_ops *ops);
80int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, 80int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
81 struct iomap_ops *ops); 81 const struct iomap_ops *ops);
82int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 82int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
83 struct iomap_ops *ops); 83 const struct iomap_ops *ops);
84int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 84int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
85 loff_t start, loff_t len, struct iomap_ops *ops); 85 loff_t start, loff_t len, const struct iomap_ops *ops);
86 86
87/* 87/*
88 * Flags for direct I/O ->end_io: 88 * Flags for direct I/O ->end_io:
@@ -92,6 +92,6 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
92typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret, 92typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret,
93 unsigned flags); 93 unsigned flags);
94ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 94ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
95 struct iomap_ops *ops, iomap_dio_end_io_t end_io); 95 const struct iomap_ops *ops, iomap_dio_end_io_t end_io);
96 96
97#endif /* LINUX_IOMAP_H */ 97#endif /* LINUX_IOMAP_H */
diff --git a/include/linux/module.h b/include/linux/module.h
index f4f542ed3d92..0297c5cd7cdf 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -126,13 +126,13 @@ extern void cleanup_module(void);
126 126
127/* Each module must use one module_init(). */ 127/* Each module must use one module_init(). */
128#define module_init(initfn) \ 128#define module_init(initfn) \
129 static inline initcall_t __inittest(void) \ 129 static inline initcall_t __maybe_unused __inittest(void) \
130 { return initfn; } \ 130 { return initfn; } \
131 int init_module(void) __attribute__((alias(#initfn))); 131 int init_module(void) __attribute__((alias(#initfn)));
132 132
133/* This is only required if you want to be unloadable. */ 133/* This is only required if you want to be unloadable. */
134#define module_exit(exitfn) \ 134#define module_exit(exitfn) \
135 static inline exitcall_t __exittest(void) \ 135 static inline exitcall_t __maybe_unused __exittest(void) \
136 { return exitfn; } \ 136 { return exitfn; } \
137 void cleanup_module(void) __attribute__((alias(#exitfn))); 137 void cleanup_module(void) __attribute__((alias(#exitfn)));
138 138
@@ -281,8 +281,6 @@ enum module_state {
281 MODULE_STATE_UNFORMED, /* Still setting it up. */ 281 MODULE_STATE_UNFORMED, /* Still setting it up. */
282}; 282};
283 283
284struct module;
285
286struct mod_tree_node { 284struct mod_tree_node {
287 struct module *mod; 285 struct module *mod;
288 struct latch_tree_node node; 286 struct latch_tree_node node;
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 3472cc6b7a60..571257e0f53d 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -147,17 +147,11 @@ void early_printk(const char *s, ...) { }
147#endif 147#endif
148 148
149#ifdef CONFIG_PRINTK_NMI 149#ifdef CONFIG_PRINTK_NMI
150extern void printk_nmi_init(void);
151extern void printk_nmi_enter(void); 150extern void printk_nmi_enter(void);
152extern void printk_nmi_exit(void); 151extern void printk_nmi_exit(void);
153extern void printk_nmi_flush(void);
154extern void printk_nmi_flush_on_panic(void);
155#else 152#else
156static inline void printk_nmi_init(void) { }
157static inline void printk_nmi_enter(void) { } 153static inline void printk_nmi_enter(void) { }
158static inline void printk_nmi_exit(void) { } 154static inline void printk_nmi_exit(void) { }
159static inline void printk_nmi_flush(void) { }
160static inline void printk_nmi_flush_on_panic(void) { }
161#endif /* PRINTK_NMI */ 155#endif /* PRINTK_NMI */
162 156
163#ifdef CONFIG_PRINTK 157#ifdef CONFIG_PRINTK
@@ -209,6 +203,9 @@ void __init setup_log_buf(int early);
209__printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); 203__printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
210void dump_stack_print_info(const char *log_lvl); 204void dump_stack_print_info(const char *log_lvl);
211void show_regs_print_info(const char *log_lvl); 205void show_regs_print_info(const char *log_lvl);
206extern void printk_safe_init(void);
207extern void printk_safe_flush(void);
208extern void printk_safe_flush_on_panic(void);
212#else 209#else
213static inline __printf(1, 0) 210static inline __printf(1, 0)
214int vprintk(const char *s, va_list args) 211int vprintk(const char *s, va_list args)
@@ -268,6 +265,18 @@ static inline void dump_stack_print_info(const char *log_lvl)
268static inline void show_regs_print_info(const char *log_lvl) 265static inline void show_regs_print_info(const char *log_lvl)
269{ 266{
270} 267}
268
269static inline void printk_safe_init(void)
270{
271}
272
273static inline void printk_safe_flush(void)
274{
275}
276
277static inline void printk_safe_flush_on_panic(void)
278{
279}
271#endif 280#endif
272 281
273extern asmlinkage void dump_stack(void) __cold; 282extern asmlinkage void dump_stack(void) __cold;
diff --git a/init/Kconfig b/init/Kconfig
index 55bb6fbc294e..483ad679aa37 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -861,17 +861,19 @@ config LOG_CPU_MAX_BUF_SHIFT
861 13 => 8 KB for each CPU 861 13 => 8 KB for each CPU
862 12 => 4 KB for each CPU 862 12 => 4 KB for each CPU
863 863
864config NMI_LOG_BUF_SHIFT 864config PRINTK_SAFE_LOG_BUF_SHIFT
865 int "Temporary per-CPU NMI log buffer size (12 => 4KB, 13 => 8KB)" 865 int "Temporary per-CPU printk log buffer size (12 => 4KB, 13 => 8KB)"
866 range 10 21 866 range 10 21
867 default 13 867 default 13
868 depends on PRINTK_NMI 868 depends on PRINTK
869 help 869 help
870 Select the size of a per-CPU buffer where NMI messages are temporary 870 Select the size of an alternate printk per-CPU buffer where messages
871 stored. They are copied to the main log buffer in a safe context 871 printed from unsafe contexts are temporarily stored. One example would
872 to avoid a deadlock. The value defines the size as a power of 2. 872 be NMI messages, another is printk recursion. The messages are
873 copied to the main log buffer in a safe context to avoid a deadlock.
874 The value defines the size as a power of 2.
873 875
874 NMI messages are rare and limited. The largest one is when 876 Those messages are rare and limited. The largest one is when
875 a backtrace is printed. It usually fits into 4KB. Select 877 a backtrace is printed. It usually fits into 4KB. Select
876 8KB if you want to be on the safe side. 878 8KB if you want to be on the safe side.
877 879
diff --git a/init/main.c b/init/main.c
index c8a00f0f10ff..24ea48745061 100644
--- a/init/main.c
+++ b/init/main.c
@@ -581,7 +581,7 @@ asmlinkage __visible void __init start_kernel(void)
581 timekeeping_init(); 581 timekeeping_init();
582 time_init(); 582 time_init();
583 sched_clock_postinit(); 583 sched_clock_postinit();
584 printk_nmi_init(); 584 printk_safe_init();
585 perf_event_init(); 585 perf_event_init();
586 profile_init(); 586 profile_init();
587 call_function_init(); 587 call_function_init();
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index a01974e1bf6b..bfe62d5b3872 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -916,7 +916,7 @@ void crash_kexec(struct pt_regs *regs)
916 old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); 916 old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
917 if (old_cpu == PANIC_CPU_INVALID) { 917 if (old_cpu == PANIC_CPU_INVALID) {
918 /* This is the 1st CPU which comes here, so go ahead. */ 918 /* This is the 1st CPU which comes here, so go ahead. */
919 printk_nmi_flush_on_panic(); 919 printk_safe_flush_on_panic();
920 __crash_kexec(regs); 920 __crash_kexec(regs);
921 921
922 /* 922 /*
diff --git a/kernel/module.c b/kernel/module.c
index a3889169a3ae..7eba6dea4f41 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2811,6 +2811,8 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
2811 if (get_modinfo(info, "livepatch")) { 2811 if (get_modinfo(info, "livepatch")) {
2812 mod->klp = true; 2812 mod->klp = true;
2813 add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); 2813 add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
2814 pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
2815 mod->name);
2814 } 2816 }
2815 2817
2816 return 0; 2818 return 0;
@@ -3723,6 +3725,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3723 mod_sysfs_teardown(mod); 3725 mod_sysfs_teardown(mod);
3724 coming_cleanup: 3726 coming_cleanup:
3725 mod->state = MODULE_STATE_GOING; 3727 mod->state = MODULE_STATE_GOING;
3728 destroy_params(mod->kp, mod->num_kp);
3726 blocking_notifier_call_chain(&module_notify_list, 3729 blocking_notifier_call_chain(&module_notify_list,
3727 MODULE_STATE_GOING, mod); 3730 MODULE_STATE_GOING, mod);
3728 klp_module_going(mod); 3731 klp_module_going(mod);
@@ -4169,22 +4172,23 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
4169 struct module *mod; 4172 struct module *mod;
4170 4173
4171 preempt_disable(); 4174 preempt_disable();
4172 list_for_each_entry_rcu(mod, &modules, list) { 4175 mod = __module_address(addr);
4173 if (mod->state == MODULE_STATE_UNFORMED) 4176 if (!mod)
4174 continue; 4177 goto out;
4175 if (mod->num_exentries == 0)
4176 continue;
4177 4178
4178 e = search_extable(mod->extable, 4179 if (!mod->num_exentries)
4179 mod->extable + mod->num_exentries - 1, 4180 goto out;
4180 addr); 4181
4181 if (e) 4182 e = search_extable(mod->extable,
4182 break; 4183 mod->extable + mod->num_exentries - 1,
4183 } 4184 addr);
4185out:
4184 preempt_enable(); 4186 preempt_enable();
4185 4187
4186 /* Now, if we found one, we are running inside it now, hence 4188 /*
4187 we cannot unload the module, hence no refcnt needed. */ 4189 * Now, if we found one, we are running inside it now, hence
4190 * we cannot unload the module, hence no refcnt needed.
4191 */
4188 return e; 4192 return e;
4189} 4193}
4190 4194
diff --git a/kernel/panic.c b/kernel/panic.c
index 08aa88dde7de..b95959733ce0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -188,7 +188,7 @@ void panic(const char *fmt, ...)
188 * Bypass the panic_cpu check and call __crash_kexec directly. 188 * Bypass the panic_cpu check and call __crash_kexec directly.
189 */ 189 */
190 if (!_crash_kexec_post_notifiers) { 190 if (!_crash_kexec_post_notifiers) {
191 printk_nmi_flush_on_panic(); 191 printk_safe_flush_on_panic();
192 __crash_kexec(NULL); 192 __crash_kexec(NULL);
193 193
194 /* 194 /*
@@ -213,7 +213,7 @@ void panic(const char *fmt, ...)
213 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 213 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
214 214
215 /* Call flush even twice. It tries harder with a single online CPU */ 215 /* Call flush even twice. It tries harder with a single online CPU */
216 printk_nmi_flush_on_panic(); 216 printk_safe_flush_on_panic();
217 kmsg_dump(KMSG_DUMP_PANIC); 217 kmsg_dump(KMSG_DUMP_PANIC);
218 218
219 /* 219 /*
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index abb0042a427b..4a2ffc39eb95 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,3 +1,3 @@
1obj-y = printk.o 1obj-y = printk.o
2obj-$(CONFIG_PRINTK_NMI) += nmi.o 2obj-$(CONFIG_PRINTK) += printk_safe.o
3obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o 3obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7fd2838fa417..1db044f808b7 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -16,42 +16,55 @@
16 */ 16 */
17#include <linux/percpu.h> 17#include <linux/percpu.h>
18 18
19typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); 19#ifdef CONFIG_PRINTK
20 20
21int __printf(1, 0) vprintk_default(const char *fmt, va_list args); 21#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff
22 22#define PRINTK_NMI_CONTEXT_MASK 0x80000000
23#ifdef CONFIG_PRINTK_NMI
24 23
25extern raw_spinlock_t logbuf_lock; 24extern raw_spinlock_t logbuf_lock;
26 25
26__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
27__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
28void __printk_safe_enter(void);
29void __printk_safe_exit(void);
30
31#define printk_safe_enter_irqsave(flags) \
32 do { \
33 local_irq_save(flags); \
34 __printk_safe_enter(); \
35 } while (0)
36
37#define printk_safe_exit_irqrestore(flags) \
38 do { \
39 __printk_safe_exit(); \
40 local_irq_restore(flags); \
41 } while (0)
42
43#define printk_safe_enter_irq() \
44 do { \
45 local_irq_disable(); \
46 __printk_safe_enter(); \
47 } while (0)
48
49#define printk_safe_exit_irq() \
50 do { \
51 __printk_safe_exit(); \
52 local_irq_enable(); \
53 } while (0)
54
55#else
56
57__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
58
27/* 59/*
28 * printk() could not take logbuf_lock in NMI context. Instead, 60 * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
29 * it temporary stores the strings into a per-CPU buffer. 61 * semaphore and some of console functions (console_unlock()/etc.), so
30 * The alternative implementation is chosen transparently 62 * printk-safe must preserve the existing local IRQ guarantees.
31 * via per-CPU variable.
32 */ 63 */
33DECLARE_PER_CPU(printk_func_t, printk_func); 64#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
34static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) 65#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
35{ 66
36 return this_cpu_read(printk_func)(fmt, args); 67#define printk_safe_enter_irq() local_irq_disable()
37} 68#define printk_safe_exit_irq() local_irq_enable()
38 69
39extern atomic_t nmi_message_lost; 70#endif /* CONFIG_PRINTK */
40static inline int get_nmi_message_lost(void)
41{
42 return atomic_xchg(&nmi_message_lost, 0);
43}
44
45#else /* CONFIG_PRINTK_NMI */
46
47static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
48{
49 return vprintk_default(fmt, args);
50}
51
52static inline int get_nmi_message_lost(void)
53{
54 return 0;
55}
56
57#endif /* CONFIG_PRINTK_NMI */
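
The new printk_safe_enter_*/printk_safe_exit_* helpers pair local IRQ control with the per-CPU printk_context counter: while the counter is non-zero, a recursive printk() on the same CPU is redirected into a per-CPU buffer instead of re-taking whatever locks the caller already holds. A minimal usage sketch follows; the function and lock are hypothetical, only the helpers come from this patch:

	static DEFINE_RAW_SPINLOCK(example_lock);

	static void example_update(void)
	{
		unsigned long flags;

		printk_safe_enter_irqsave(flags);	/* printk() now lands in the per-CPU buffer */
		raw_spin_lock(&example_lock);

		/* code here may WARN()/printk() without deadlocking on example_lock */

		raw_spin_unlock(&example_lock);
		printk_safe_exit_irqrestore(flags);	/* buffered messages are flushed via irq_work */
	}
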
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 4ba3d34938c0..34da86e73d00 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -213,17 +213,36 @@ static int nr_ext_console_drivers;
213 213
214static int __down_trylock_console_sem(unsigned long ip) 214static int __down_trylock_console_sem(unsigned long ip)
215{ 215{
216 if (down_trylock(&console_sem)) 216 int lock_failed;
217 unsigned long flags;
218
219 /*
220 * Here and in __up_console_sem() we need to be in safe mode,
221 * because spindump/WARN/etc from under console ->lock will
222 * deadlock in printk()->down_trylock_console_sem() otherwise.
223 */
224 printk_safe_enter_irqsave(flags);
225 lock_failed = down_trylock(&console_sem);
226 printk_safe_exit_irqrestore(flags);
227
228 if (lock_failed)
217 return 1; 229 return 1;
218 mutex_acquire(&console_lock_dep_map, 0, 1, ip); 230 mutex_acquire(&console_lock_dep_map, 0, 1, ip);
219 return 0; 231 return 0;
220} 232}
221#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) 233#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
222 234
223#define up_console_sem() do { \ 235static void __up_console_sem(unsigned long ip)
224 mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ 236{
225 up(&console_sem);\ 237 unsigned long flags;
226} while (0) 238
239 mutex_release(&console_lock_dep_map, 1, ip);
240
241 printk_safe_enter_irqsave(flags);
242 up(&console_sem);
243 printk_safe_exit_irqrestore(flags);
244}
245#define up_console_sem() __up_console_sem(_RET_IP_)
227 246
228/* 247/*
229 * This is used for debugging the mess that is the VT code by 248 * This is used for debugging the mess that is the VT code by
@@ -351,6 +370,34 @@ __packed __aligned(4)
351 */ 370 */
352DEFINE_RAW_SPINLOCK(logbuf_lock); 371DEFINE_RAW_SPINLOCK(logbuf_lock);
353 372
373/*
374 * Helper macros to lock/unlock logbuf_lock and switch between
375 * printk-safe/unsafe modes.
376 */
377#define logbuf_lock_irq() \
378 do { \
379 printk_safe_enter_irq(); \
380 raw_spin_lock(&logbuf_lock); \
381 } while (0)
382
383#define logbuf_unlock_irq() \
384 do { \
385 raw_spin_unlock(&logbuf_lock); \
386 printk_safe_exit_irq(); \
387 } while (0)
388
389#define logbuf_lock_irqsave(flags) \
390 do { \
391 printk_safe_enter_irqsave(flags); \
392 raw_spin_lock(&logbuf_lock); \
393 } while (0)
394
395#define logbuf_unlock_irqrestore(flags) \
396 do { \
397 raw_spin_unlock(&logbuf_lock); \
398 printk_safe_exit_irqrestore(flags); \
399 } while (0)
400
354#ifdef CONFIG_PRINTK 401#ifdef CONFIG_PRINTK
355DECLARE_WAIT_QUEUE_HEAD(log_wait); 402DECLARE_WAIT_QUEUE_HEAD(log_wait);
356/* the next printk record to read by syslog(READ) or /proc/kmsg */ 403/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -782,20 +829,21 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
782 ret = mutex_lock_interruptible(&user->lock); 829 ret = mutex_lock_interruptible(&user->lock);
783 if (ret) 830 if (ret)
784 return ret; 831 return ret;
785 raw_spin_lock_irq(&logbuf_lock); 832
833 logbuf_lock_irq();
786 while (user->seq == log_next_seq) { 834 while (user->seq == log_next_seq) {
787 if (file->f_flags & O_NONBLOCK) { 835 if (file->f_flags & O_NONBLOCK) {
788 ret = -EAGAIN; 836 ret = -EAGAIN;
789 raw_spin_unlock_irq(&logbuf_lock); 837 logbuf_unlock_irq();
790 goto out; 838 goto out;
791 } 839 }
792 840
793 raw_spin_unlock_irq(&logbuf_lock); 841 logbuf_unlock_irq();
794 ret = wait_event_interruptible(log_wait, 842 ret = wait_event_interruptible(log_wait,
795 user->seq != log_next_seq); 843 user->seq != log_next_seq);
796 if (ret) 844 if (ret)
797 goto out; 845 goto out;
798 raw_spin_lock_irq(&logbuf_lock); 846 logbuf_lock_irq();
799 } 847 }
800 848
801 if (user->seq < log_first_seq) { 849 if (user->seq < log_first_seq) {
@@ -803,7 +851,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
803 user->idx = log_first_idx; 851 user->idx = log_first_idx;
804 user->seq = log_first_seq; 852 user->seq = log_first_seq;
805 ret = -EPIPE; 853 ret = -EPIPE;
806 raw_spin_unlock_irq(&logbuf_lock); 854 logbuf_unlock_irq();
807 goto out; 855 goto out;
808 } 856 }
809 857
@@ -816,7 +864,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
816 864
817 user->idx = log_next(user->idx); 865 user->idx = log_next(user->idx);
818 user->seq++; 866 user->seq++;
819 raw_spin_unlock_irq(&logbuf_lock); 867 logbuf_unlock_irq();
820 868
821 if (len > count) { 869 if (len > count) {
822 ret = -EINVAL; 870 ret = -EINVAL;
@@ -843,7 +891,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
843 if (offset) 891 if (offset)
844 return -ESPIPE; 892 return -ESPIPE;
845 893
846 raw_spin_lock_irq(&logbuf_lock); 894 logbuf_lock_irq();
847 switch (whence) { 895 switch (whence) {
848 case SEEK_SET: 896 case SEEK_SET:
849 /* the first record */ 897 /* the first record */
@@ -867,7 +915,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
867 default: 915 default:
868 ret = -EINVAL; 916 ret = -EINVAL;
869 } 917 }
870 raw_spin_unlock_irq(&logbuf_lock); 918 logbuf_unlock_irq();
871 return ret; 919 return ret;
872} 920}
873 921
@@ -881,7 +929,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
881 929
882 poll_wait(file, &log_wait, wait); 930 poll_wait(file, &log_wait, wait);
883 931
884 raw_spin_lock_irq(&logbuf_lock); 932 logbuf_lock_irq();
885 if (user->seq < log_next_seq) { 933 if (user->seq < log_next_seq) {
886 /* return error when data has vanished underneath us */ 934 /* return error when data has vanished underneath us */
887 if (user->seq < log_first_seq) 935 if (user->seq < log_first_seq)
@@ -889,7 +937,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
889 else 937 else
890 ret = POLLIN|POLLRDNORM; 938 ret = POLLIN|POLLRDNORM;
891 } 939 }
892 raw_spin_unlock_irq(&logbuf_lock); 940 logbuf_unlock_irq();
893 941
894 return ret; 942 return ret;
895} 943}
@@ -919,10 +967,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
919 967
920 mutex_init(&user->lock); 968 mutex_init(&user->lock);
921 969
922 raw_spin_lock_irq(&logbuf_lock); 970 logbuf_lock_irq();
923 user->idx = log_first_idx; 971 user->idx = log_first_idx;
924 user->seq = log_first_seq; 972 user->seq = log_first_seq;
925 raw_spin_unlock_irq(&logbuf_lock); 973 logbuf_unlock_irq();
926 974
927 file->private_data = user; 975 file->private_data = user;
928 return 0; 976 return 0;
@@ -1064,13 +1112,13 @@ void __init setup_log_buf(int early)
1064 return; 1112 return;
1065 } 1113 }
1066 1114
1067 raw_spin_lock_irqsave(&logbuf_lock, flags); 1115 logbuf_lock_irqsave(flags);
1068 log_buf_len = new_log_buf_len; 1116 log_buf_len = new_log_buf_len;
1069 log_buf = new_log_buf; 1117 log_buf = new_log_buf;
1070 new_log_buf_len = 0; 1118 new_log_buf_len = 0;
1071 free = __LOG_BUF_LEN - log_next_idx; 1119 free = __LOG_BUF_LEN - log_next_idx;
1072 memcpy(log_buf, __log_buf, __LOG_BUF_LEN); 1120 memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
1073 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 1121 logbuf_unlock_irqrestore(flags);
1074 1122
1075 pr_info("log_buf_len: %d bytes\n", log_buf_len); 1123 pr_info("log_buf_len: %d bytes\n", log_buf_len);
1076 pr_info("early log buf free: %d(%d%%)\n", 1124 pr_info("early log buf free: %d(%d%%)\n",
@@ -1248,7 +1296,7 @@ static int syslog_print(char __user *buf, int size)
1248 size_t n; 1296 size_t n;
1249 size_t skip; 1297 size_t skip;
1250 1298
1251 raw_spin_lock_irq(&logbuf_lock); 1299 logbuf_lock_irq();
1252 if (syslog_seq < log_first_seq) { 1300 if (syslog_seq < log_first_seq) {
1253 /* messages are gone, move to first one */ 1301 /* messages are gone, move to first one */
1254 syslog_seq = log_first_seq; 1302 syslog_seq = log_first_seq;
@@ -1256,7 +1304,7 @@ static int syslog_print(char __user *buf, int size)
1256 syslog_partial = 0; 1304 syslog_partial = 0;
1257 } 1305 }
1258 if (syslog_seq == log_next_seq) { 1306 if (syslog_seq == log_next_seq) {
1259 raw_spin_unlock_irq(&logbuf_lock); 1307 logbuf_unlock_irq();
1260 break; 1308 break;
1261 } 1309 }
1262 1310
@@ -1275,7 +1323,7 @@ static int syslog_print(char __user *buf, int size)
1275 syslog_partial += n; 1323 syslog_partial += n;
1276 } else 1324 } else
1277 n = 0; 1325 n = 0;
1278 raw_spin_unlock_irq(&logbuf_lock); 1326 logbuf_unlock_irq();
1279 1327
1280 if (!n) 1328 if (!n)
1281 break; 1329 break;
@@ -1304,7 +1352,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1304 if (!text) 1352 if (!text)
1305 return -ENOMEM; 1353 return -ENOMEM;
1306 1354
1307 raw_spin_lock_irq(&logbuf_lock); 1355 logbuf_lock_irq();
1308 if (buf) { 1356 if (buf) {
1309 u64 next_seq; 1357 u64 next_seq;
1310 u64 seq; 1358 u64 seq;
@@ -1352,12 +1400,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1352 idx = log_next(idx); 1400 idx = log_next(idx);
1353 seq++; 1401 seq++;
1354 1402
1355 raw_spin_unlock_irq(&logbuf_lock); 1403 logbuf_unlock_irq();
1356 if (copy_to_user(buf + len, text, textlen)) 1404 if (copy_to_user(buf + len, text, textlen))
1357 len = -EFAULT; 1405 len = -EFAULT;
1358 else 1406 else
1359 len += textlen; 1407 len += textlen;
1360 raw_spin_lock_irq(&logbuf_lock); 1408 logbuf_lock_irq();
1361 1409
1362 if (seq < log_first_seq) { 1410 if (seq < log_first_seq) {
1363 /* messages are gone, move to next one */ 1411 /* messages are gone, move to next one */
@@ -1371,7 +1419,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1371 clear_seq = log_next_seq; 1419 clear_seq = log_next_seq;
1372 clear_idx = log_next_idx; 1420 clear_idx = log_next_idx;
1373 } 1421 }
1374 raw_spin_unlock_irq(&logbuf_lock); 1422 logbuf_unlock_irq();
1375 1423
1376 kfree(text); 1424 kfree(text);
1377 return len; 1425 return len;
@@ -1458,7 +1506,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
1458 break; 1506 break;
1459 /* Number of chars in the log buffer */ 1507 /* Number of chars in the log buffer */
1460 case SYSLOG_ACTION_SIZE_UNREAD: 1508 case SYSLOG_ACTION_SIZE_UNREAD:
1461 raw_spin_lock_irq(&logbuf_lock); 1509 logbuf_lock_irq();
1462 if (syslog_seq < log_first_seq) { 1510 if (syslog_seq < log_first_seq) {
1463 /* messages are gone, move to first one */ 1511 /* messages are gone, move to first one */
1464 syslog_seq = log_first_seq; 1512 syslog_seq = log_first_seq;
@@ -1486,7 +1534,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
1486 } 1534 }
1487 error -= syslog_partial; 1535 error -= syslog_partial;
1488 } 1536 }
1489 raw_spin_unlock_irq(&logbuf_lock); 1537 logbuf_unlock_irq();
1490 break; 1538 break;
1491 /* Size of the log buffer */ 1539 /* Size of the log buffer */
1492 case SYSLOG_ACTION_SIZE_BUFFER: 1540 case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1510,8 +1558,7 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1510 * log_buf[start] to log_buf[end - 1]. 1558 * log_buf[start] to log_buf[end - 1].
1511 * The console_lock must be held. 1559 * The console_lock must be held.
1512 */ 1560 */
1513static void call_console_drivers(int level, 1561static void call_console_drivers(const char *ext_text, size_t ext_len,
1514 const char *ext_text, size_t ext_len,
1515 const char *text, size_t len) 1562 const char *text, size_t len)
1516{ 1563{
1517 struct console *con; 1564 struct console *con;
@@ -1538,28 +1585,6 @@ static void call_console_drivers(int level,
1538 } 1585 }
1539} 1586}
1540 1587
1541/*
1542 * Zap console related locks when oopsing.
1543 * To leave time for slow consoles to print a full oops,
1544 * only zap at most once every 30 seconds.
1545 */
1546static void zap_locks(void)
1547{
1548 static unsigned long oops_timestamp;
1549
1550 if (time_after_eq(jiffies, oops_timestamp) &&
1551 !time_after(jiffies, oops_timestamp + 30 * HZ))
1552 return;
1553
1554 oops_timestamp = jiffies;
1555
1556 debug_locks_off();
1557 /* If a crash is occurring, make sure we can't deadlock */
1558 raw_spin_lock_init(&logbuf_lock);
1559 /* And make sure that we print immediately */
1560 sema_init(&console_sem, 1);
1561}
1562
1563int printk_delay_msec __read_mostly; 1588int printk_delay_msec __read_mostly;
1564 1589
1565static inline void printk_delay(void) 1590static inline void printk_delay(void)
@@ -1669,18 +1694,13 @@ asmlinkage int vprintk_emit(int facility, int level,
1669 const char *dict, size_t dictlen, 1694 const char *dict, size_t dictlen,
1670 const char *fmt, va_list args) 1695 const char *fmt, va_list args)
1671{ 1696{
1672 static bool recursion_bug;
1673 static char textbuf[LOG_LINE_MAX]; 1697 static char textbuf[LOG_LINE_MAX];
1674 char *text = textbuf; 1698 char *text = textbuf;
1675 size_t text_len = 0; 1699 size_t text_len = 0;
1676 enum log_flags lflags = 0; 1700 enum log_flags lflags = 0;
1677 unsigned long flags; 1701 unsigned long flags;
1678 int this_cpu;
1679 int printed_len = 0; 1702 int printed_len = 0;
1680 int nmi_message_lost;
1681 bool in_sched = false; 1703 bool in_sched = false;
1682 /* cpu currently holding logbuf_lock in this function */
1683 static unsigned int logbuf_cpu = UINT_MAX;
1684 1704
1685 if (level == LOGLEVEL_SCHED) { 1705 if (level == LOGLEVEL_SCHED) {
1686 level = LOGLEVEL_DEFAULT; 1706 level = LOGLEVEL_DEFAULT;
@@ -1690,53 +1710,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1690 boot_delay_msec(level); 1710 boot_delay_msec(level);
1691 printk_delay(); 1711 printk_delay();
1692 1712
1693 local_irq_save(flags);
1694 this_cpu = smp_processor_id();
1695
1696 /*
1697 * Ouch, printk recursed into itself!
1698 */
1699 if (unlikely(logbuf_cpu == this_cpu)) {
1700 /*
1701 * If a crash is occurring during printk() on this CPU,
1702 * then try to get the crash message out but make sure
1703 * we can't deadlock. Otherwise just return to avoid the
1704 * recursion and return - but flag the recursion so that
1705 * it can be printed at the next appropriate moment:
1706 */
1707 if (!oops_in_progress && !lockdep_recursing(current)) {
1708 recursion_bug = true;
1709 local_irq_restore(flags);
1710 return 0;
1711 }
1712 zap_locks();
1713 }
1714
1715 lockdep_off();
1716 /* This stops the holder of console_sem just where we want him */ 1713 /* This stops the holder of console_sem just where we want him */
1717 raw_spin_lock(&logbuf_lock); 1714 logbuf_lock_irqsave(flags);
1718 logbuf_cpu = this_cpu;
1719
1720 if (unlikely(recursion_bug)) {
1721 static const char recursion_msg[] =
1722 "BUG: recent printk recursion!";
1723
1724 recursion_bug = false;
1725 /* emit KERN_CRIT message */
1726 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1727 NULL, 0, recursion_msg,
1728 strlen(recursion_msg));
1729 }
1730
1731 nmi_message_lost = get_nmi_message_lost();
1732 if (unlikely(nmi_message_lost)) {
1733 text_len = scnprintf(textbuf, sizeof(textbuf),
1734 "BAD LUCK: lost %d message(s) from NMI context!",
1735 nmi_message_lost);
1736 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1737 NULL, 0, textbuf, text_len);
1738 }
1739
1740 /* 1715 /*
1741 * The printf needs to come first; we need the syslog 1716 * The printf needs to come first; we need the syslog
1742 * prefix which might be passed-in as a parameter. 1717 * prefix which might be passed-in as a parameter.
@@ -1779,14 +1754,10 @@ asmlinkage int vprintk_emit(int facility, int level,
1779 1754
1780 printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); 1755 printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len);
1781 1756
1782 logbuf_cpu = UINT_MAX; 1757 logbuf_unlock_irqrestore(flags);
1783 raw_spin_unlock(&logbuf_lock);
1784 lockdep_on();
1785 local_irq_restore(flags);
1786 1758
1787 /* If called from the scheduler, we can not call up(). */ 1759 /* If called from the scheduler, we can not call up(). */
1788 if (!in_sched) { 1760 if (!in_sched) {
1789 lockdep_off();
1790 /* 1761 /*
1791 * Try to acquire and then immediately release the console 1762 * Try to acquire and then immediately release the console
1792 * semaphore. The release will print out buffers and wake up 1763 * semaphore. The release will print out buffers and wake up
@@ -1794,7 +1765,6 @@ asmlinkage int vprintk_emit(int facility, int level,
1794 */ 1765 */
1795 if (console_trylock()) 1766 if (console_trylock())
1796 console_unlock(); 1767 console_unlock();
1797 lockdep_on();
1798 } 1768 }
1799 1769
1800 return printed_len; 1770 return printed_len;
@@ -1803,7 +1773,7 @@ EXPORT_SYMBOL(vprintk_emit);
1803 1773
1804asmlinkage int vprintk(const char *fmt, va_list args) 1774asmlinkage int vprintk(const char *fmt, va_list args)
1805{ 1775{
1806 return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); 1776 return vprintk_func(fmt, args);
1807} 1777}
1808EXPORT_SYMBOL(vprintk); 1778EXPORT_SYMBOL(vprintk);
1809 1779
@@ -1895,16 +1865,12 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
1895static ssize_t msg_print_ext_body(char *buf, size_t size, 1865static ssize_t msg_print_ext_body(char *buf, size_t size,
1896 char *dict, size_t dict_len, 1866 char *dict, size_t dict_len,
1897 char *text, size_t text_len) { return 0; } 1867 char *text, size_t text_len) { return 0; }
1898static void call_console_drivers(int level, 1868static void call_console_drivers(const char *ext_text, size_t ext_len,
1899 const char *ext_text, size_t ext_len,
1900 const char *text, size_t len) {} 1869 const char *text, size_t len) {}
1901static size_t msg_print_text(const struct printk_log *msg, 1870static size_t msg_print_text(const struct printk_log *msg,
1902 bool syslog, char *buf, size_t size) { return 0; } 1871 bool syslog, char *buf, size_t size) { return 0; }
1903static bool suppress_message_printing(int level) { return false; } 1872static bool suppress_message_printing(int level) { return false; }
1904 1873
1905/* Still needs to be defined for users */
1906DEFINE_PER_CPU(printk_func_t, printk_func);
1907
1908#endif /* CONFIG_PRINTK */ 1874#endif /* CONFIG_PRINTK */
1909 1875
1910#ifdef CONFIG_EARLY_PRINTK 1876#ifdef CONFIG_EARLY_PRINTK
@@ -2220,9 +2186,9 @@ again:
2220 struct printk_log *msg; 2186 struct printk_log *msg;
2221 size_t ext_len = 0; 2187 size_t ext_len = 0;
2222 size_t len; 2188 size_t len;
2223 int level;
2224 2189
2225 raw_spin_lock_irqsave(&logbuf_lock, flags); 2190 printk_safe_enter_irqsave(flags);
2191 raw_spin_lock(&logbuf_lock);
2226 if (seen_seq != log_next_seq) { 2192 if (seen_seq != log_next_seq) {
2227 wake_klogd = true; 2193 wake_klogd = true;
2228 seen_seq = log_next_seq; 2194 seen_seq = log_next_seq;
@@ -2243,8 +2209,7 @@ skip:
2243 break; 2209 break;
2244 2210
2245 msg = log_from_idx(console_idx); 2211 msg = log_from_idx(console_idx);
2246 level = msg->level; 2212 if (suppress_message_printing(msg->level)) {
2247 if (suppress_message_printing(level)) {
2248 /* 2213 /*
2249 * Skip record we have buffered and already printed 2214 * Skip record we have buffered and already printed
2250 * directly to the console when we received it, and 2215 * directly to the console when we received it, and
@@ -2270,9 +2235,9 @@ skip:
2270 raw_spin_unlock(&logbuf_lock); 2235 raw_spin_unlock(&logbuf_lock);
2271 2236
2272 stop_critical_timings(); /* don't trace print latency */ 2237 stop_critical_timings(); /* don't trace print latency */
2273 call_console_drivers(level, ext_text, ext_len, text, len); 2238 call_console_drivers(ext_text, ext_len, text, len);
2274 start_critical_timings(); 2239 start_critical_timings();
2275 local_irq_restore(flags); 2240 printk_safe_exit_irqrestore(flags);
2276 2241
2277 if (do_cond_resched) 2242 if (do_cond_resched)
2278 cond_resched(); 2243 cond_resched();
@@ -2295,7 +2260,8 @@ skip:
2295 */ 2260 */
2296 raw_spin_lock(&logbuf_lock); 2261 raw_spin_lock(&logbuf_lock);
2297 retry = console_seq != log_next_seq; 2262 retry = console_seq != log_next_seq;
2298 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2263 raw_spin_unlock(&logbuf_lock);
2264 printk_safe_exit_irqrestore(flags);
2299 2265
2300 if (retry && console_trylock()) 2266 if (retry && console_trylock())
2301 goto again; 2267 goto again;
@@ -2558,10 +2524,10 @@ void register_console(struct console *newcon)
2558 * console_unlock(); will print out the buffered messages 2524 * console_unlock(); will print out the buffered messages
2559 * for us. 2525 * for us.
2560 */ 2526 */
2561 raw_spin_lock_irqsave(&logbuf_lock, flags); 2527 logbuf_lock_irqsave(flags);
2562 console_seq = syslog_seq; 2528 console_seq = syslog_seq;
2563 console_idx = syslog_idx; 2529 console_idx = syslog_idx;
2564 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2530 logbuf_unlock_irqrestore(flags);
2565 /* 2531 /*
2566 * We're about to replay the log buffer. Only do this to the 2532 * We're about to replay the log buffer. Only do this to the
2567 * just-registered console to avoid excessive message spam to 2533 * just-registered console to avoid excessive message spam to
@@ -2860,12 +2826,12 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2860 /* initialize iterator with data about the stored records */ 2826 /* initialize iterator with data about the stored records */
2861 dumper->active = true; 2827 dumper->active = true;
2862 2828
2863 raw_spin_lock_irqsave(&logbuf_lock, flags); 2829 logbuf_lock_irqsave(flags);
2864 dumper->cur_seq = clear_seq; 2830 dumper->cur_seq = clear_seq;
2865 dumper->cur_idx = clear_idx; 2831 dumper->cur_idx = clear_idx;
2866 dumper->next_seq = log_next_seq; 2832 dumper->next_seq = log_next_seq;
2867 dumper->next_idx = log_next_idx; 2833 dumper->next_idx = log_next_idx;
2868 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2834 logbuf_unlock_irqrestore(flags);
2869 2835
2870 /* invoke dumper which will iterate over records */ 2836 /* invoke dumper which will iterate over records */
2871 dumper->dump(dumper, reason); 2837 dumper->dump(dumper, reason);
@@ -2950,9 +2916,9 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2950 unsigned long flags; 2916 unsigned long flags;
2951 bool ret; 2917 bool ret;
2952 2918
2953 raw_spin_lock_irqsave(&logbuf_lock, flags); 2919 logbuf_lock_irqsave(flags);
2954 ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); 2920 ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
2955 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2921 logbuf_unlock_irqrestore(flags);
2956 2922
2957 return ret; 2923 return ret;
2958} 2924}
@@ -2991,7 +2957,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2991 if (!dumper->active) 2957 if (!dumper->active)
2992 goto out; 2958 goto out;
2993 2959
2994 raw_spin_lock_irqsave(&logbuf_lock, flags); 2960 logbuf_lock_irqsave(flags);
2995 if (dumper->cur_seq < log_first_seq) { 2961 if (dumper->cur_seq < log_first_seq) {
2996 /* messages are gone, move to first available one */ 2962 /* messages are gone, move to first available one */
2997 dumper->cur_seq = log_first_seq; 2963 dumper->cur_seq = log_first_seq;
@@ -3000,7 +2966,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
3000 2966
3001 /* last entry */ 2967 /* last entry */
3002 if (dumper->cur_seq >= dumper->next_seq) { 2968 if (dumper->cur_seq >= dumper->next_seq) {
3003 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2969 logbuf_unlock_irqrestore(flags);
3004 goto out; 2970 goto out;
3005 } 2971 }
3006 2972
@@ -3042,7 +3008,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
3042 dumper->next_seq = next_seq; 3008 dumper->next_seq = next_seq;
3043 dumper->next_idx = next_idx; 3009 dumper->next_idx = next_idx;
3044 ret = true; 3010 ret = true;
3045 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 3011 logbuf_unlock_irqrestore(flags);
3046out: 3012out:
3047 if (len) 3013 if (len)
3048 *len = l; 3014 *len = l;
@@ -3080,9 +3046,9 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
3080{ 3046{
3081 unsigned long flags; 3047 unsigned long flags;
3082 3048
3083 raw_spin_lock_irqsave(&logbuf_lock, flags); 3049 logbuf_lock_irqsave(flags);
3084 kmsg_dump_rewind_nolock(dumper); 3050 kmsg_dump_rewind_nolock(dumper);
3085 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 3051 logbuf_unlock_irqrestore(flags);
3086} 3052}
3087EXPORT_SYMBOL_GPL(kmsg_dump_rewind); 3053EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
3088 3054
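
With zap_locks() and the ad-hoc recursion detection gone, every logbuf_lock user goes through the logbuf_lock_*()/logbuf_unlock_*() wrappers, so a WARN or lockdep splat raised inside the critical section is diverted to the printk-safe buffer rather than recursing onto logbuf_lock. A hedged sketch of the resulting pattern for a log reader (hypothetical helper, modelled on the devkmsg/syslog hunks above):

	/* Hypothetical reader, illustrating the locking pattern only. */
	static u64 example_count_unread(u64 *seq)
	{
		u64 unread;

		logbuf_lock_irq();		/* printk-safe mode + IRQs off + logbuf_lock */
		if (*seq < log_first_seq)	/* records were overwritten meanwhile */
			*seq = log_first_seq;
		unread = log_next_seq - *seq;
		logbuf_unlock_irq();

		return unread;
	}
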
diff --git a/kernel/printk/nmi.c b/kernel/printk/printk_safe.c
index f011aaef583c..033e50a7d706 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/printk_safe.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * nmi.c - Safe printk in NMI context 2 * printk_safe.c - Safe printk for printk-deadlock-prone contexts
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License 5 * modify it under the terms of the GNU General Public License
@@ -32,36 +32,58 @@
32 * is later flushed into the main ring buffer via IRQ work. 32 * is later flushed into the main ring buffer via IRQ work.
33 * 33 *
34 * The alternative implementation is chosen transparently 34 * The alternative implementation is chosen transparently
35 * via @printk_func per-CPU variable. 35 * by examining current printk() context mask stored in @printk_context
36 * per-CPU variable.
36 * 37 *
37 * The implementation allows to flush the strings also from another CPU. 38 * The implementation allows to flush the strings also from another CPU.
38 * There are situations when we want to make sure that all buffers 39 * There are situations when we want to make sure that all buffers
39 * were handled or when IRQs are blocked. 40 * were handled or when IRQs are blocked.
40 */ 41 */
41DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; 42static int printk_safe_irq_ready;
42static int printk_nmi_irq_ready;
43atomic_t nmi_message_lost;
44 43
45#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \ 44#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
46 sizeof(atomic_t) - sizeof(struct irq_work)) 45 sizeof(atomic_t) - \
46 sizeof(atomic_t) - \
47 sizeof(struct irq_work))
47 48
48struct nmi_seq_buf { 49struct printk_safe_seq_buf {
49 atomic_t len; /* length of written data */ 50 atomic_t len; /* length of written data */
51 atomic_t message_lost;
50 struct irq_work work; /* IRQ work that flushes the buffer */ 52 struct irq_work work; /* IRQ work that flushes the buffer */
51 unsigned char buffer[NMI_LOG_BUF_LEN]; 53 unsigned char buffer[SAFE_LOG_BUF_LEN];
52}; 54};
53static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); 55
56static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
57static DEFINE_PER_CPU(int, printk_context);
58
59#ifdef CONFIG_PRINTK_NMI
60static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
61#endif
62
63/* Get flushed in a more safe context. */
64static void queue_flush_work(struct printk_safe_seq_buf *s)
65{
66 if (printk_safe_irq_ready) {
67 /* Make sure that IRQ work is really initialized. */
68 smp_rmb();
69 irq_work_queue(&s->work);
70 }
71}
54 72
55/* 73/*
56 * Safe printk() for NMI context. It uses a per-CPU buffer to 74 * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
57 * store the message. NMIs are not nested, so there is always only 75 * have dedicated buffers, because otherwise printk-safe preempted by
58 * one writer running. But the buffer might get flushed from another 76 * NMI-printk would have overwritten the NMI messages.
59 * CPU, so we need to be careful. 77 *
78 * The messages are flushed from irq work (or from panic()), possibly,
79 * from other CPU, concurrently with printk_safe_log_store(). Should this
80 * happen, printk_safe_log_store() will notice the buffer->len mismatch
81 * and repeat the write.
60 */ 82 */
61static int vprintk_nmi(const char *fmt, va_list args) 83static int printk_safe_log_store(struct printk_safe_seq_buf *s,
84 const char *fmt, va_list args)
62{ 85{
63 struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); 86 int add;
64 int add = 0;
65 size_t len; 87 size_t len;
66 88
67again: 89again:
@@ -69,18 +91,21 @@ again:
69 91
70 /* The trailing '\0' is not counted into len. */ 92 /* The trailing '\0' is not counted into len. */
71 if (len >= sizeof(s->buffer) - 1) { 93 if (len >= sizeof(s->buffer) - 1) {
72 atomic_inc(&nmi_message_lost); 94 atomic_inc(&s->message_lost);
95 queue_flush_work(s);
73 return 0; 96 return 0;
74 } 97 }
75 98
76 /* 99 /*
77 * Make sure that all old data have been read before the buffer was 100 * Make sure that all old data have been read before the buffer
78 * reseted. This is not needed when we just append data. 101 * was reset. This is not needed when we just append data.
79 */ 102 */
80 if (!len) 103 if (!len)
81 smp_rmb(); 104 smp_rmb();
82 105
83 add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); 106 add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
107 if (!add)
108 return 0;
84 109
85 /* 110 /*
86 * Do it once again if the buffer has been flushed in the meantime. 111 * Do it once again if the buffer has been flushed in the meantime.
@@ -90,32 +115,23 @@ again:
90 if (atomic_cmpxchg(&s->len, len, len + add) != len) 115 if (atomic_cmpxchg(&s->len, len, len + add) != len)
91 goto again; 116 goto again;
92 117
93 /* Get flushed in a more safe context. */ 118 queue_flush_work(s);
94 if (add && printk_nmi_irq_ready) {
95 /* Make sure that IRQ work is really initialized. */
96 smp_rmb();
97 irq_work_queue(&s->work);
98 }
99
100 return add; 119 return add;
101} 120}
102 121
103static void printk_nmi_flush_line(const char *text, int len) 122static inline void printk_safe_flush_line(const char *text, int len)
104{ 123{
105 /* 124 /*
106 * The buffers are flushed in NMI only on panic. The messages must 125 * Avoid any console drivers calls from here, because we may be
107 * go only into the ring buffer at this stage. Consoles will get 126 * in NMI or printk_safe context (when in panic). The messages
108 * explicitly called later when a crashdump is not generated. 127 * must go only into the ring buffer at this stage. Consoles will
128 * get explicitly called later when a crashdump is not generated.
109 */ 129 */
110 if (in_nmi()) 130 printk_deferred("%.*s", len, text);
111 printk_deferred("%.*s", len, text);
112 else
113 printk("%.*s", len, text);
114
115} 131}
116 132
117/* printk part of the temporary buffer line by line */ 133/* printk part of the temporary buffer line by line */
118static int printk_nmi_flush_buffer(const char *start, size_t len) 134static int printk_safe_flush_buffer(const char *start, size_t len)
119{ 135{
120 const char *c, *end; 136 const char *c, *end;
121 bool header; 137 bool header;
@@ -127,7 +143,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len)
127 /* Print line by line. */ 143 /* Print line by line. */
128 while (c < end) { 144 while (c < end) {
129 if (*c == '\n') { 145 if (*c == '\n') {
130 printk_nmi_flush_line(start, c - start + 1); 146 printk_safe_flush_line(start, c - start + 1);
131 start = ++c; 147 start = ++c;
132 header = true; 148 header = true;
133 continue; 149 continue;
@@ -140,7 +156,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len)
140 continue; 156 continue;
141 } 157 }
142 158
143 printk_nmi_flush_line(start, c - start); 159 printk_safe_flush_line(start, c - start);
144 start = c++; 160 start = c++;
145 header = true; 161 header = true;
146 continue; 162 continue;
@@ -154,22 +170,31 @@ static int printk_nmi_flush_buffer(const char *start, size_t len)
154 if (start < end && !header) { 170 if (start < end && !header) {
155 static const char newline[] = KERN_CONT "\n"; 171 static const char newline[] = KERN_CONT "\n";
156 172
157 printk_nmi_flush_line(start, end - start); 173 printk_safe_flush_line(start, end - start);
158 printk_nmi_flush_line(newline, strlen(newline)); 174 printk_safe_flush_line(newline, strlen(newline));
159 } 175 }
160 176
161 return len; 177 return len;
162} 178}
163 179
180static void report_message_lost(struct printk_safe_seq_buf *s)
181{
182 int lost = atomic_xchg(&s->message_lost, 0);
183
184 if (lost)
185 printk_deferred("Lost %d message(s)!\n", lost);
186}
187
164/* 188/*
165 * Flush data from the associated per_CPU buffer. The function 189 * Flush data from the associated per-CPU buffer. The function
166 * can be called either via IRQ work or independently. 190 * can be called either via IRQ work or independently.
167 */ 191 */
168static void __printk_nmi_flush(struct irq_work *work) 192static void __printk_safe_flush(struct irq_work *work)
169{ 193{
170 static raw_spinlock_t read_lock = 194 static raw_spinlock_t read_lock =
171 __RAW_SPIN_LOCK_INITIALIZER(read_lock); 195 __RAW_SPIN_LOCK_INITIALIZER(read_lock);
172 struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); 196 struct printk_safe_seq_buf *s =
197 container_of(work, struct printk_safe_seq_buf, work);
173 unsigned long flags; 198 unsigned long flags;
174 size_t len; 199 size_t len;
175 int i; 200 int i;
@@ -194,9 +219,9 @@ more:
194 * buffer size. 219 * buffer size.
195 */ 220 */
196 if ((i && i >= len) || len > sizeof(s->buffer)) { 221 if ((i && i >= len) || len > sizeof(s->buffer)) {
197 const char *msg = "printk_nmi_flush: internal error\n"; 222 const char *msg = "printk_safe_flush: internal error\n";
198 223
199 printk_nmi_flush_line(msg, strlen(msg)); 224 printk_safe_flush_line(msg, strlen(msg));
200 len = 0; 225 len = 0;
201 } 226 }
202 227
@@ -205,7 +230,7 @@ more:
205 230
206 /* Make sure that data has been written up to the @len */ 231 /* Make sure that data has been written up to the @len */
207 smp_rmb(); 232 smp_rmb();
208 i += printk_nmi_flush_buffer(s->buffer + i, len - i); 233 i += printk_safe_flush_buffer(s->buffer + i, len - i);
209 234
210 /* 235 /*
211 * Check that nothing has got added in the meantime and truncate 236 * Check that nothing has got added in the meantime and truncate
@@ -217,35 +242,40 @@ more:
217 goto more; 242 goto more;
218 243
219out: 244out:
245 report_message_lost(s);
220 raw_spin_unlock_irqrestore(&read_lock, flags); 246 raw_spin_unlock_irqrestore(&read_lock, flags);
221} 247}
222 248
223/** 249/**
224 * printk_nmi_flush - flush all per-cpu nmi buffers. 250 * printk_safe_flush - flush all per-cpu nmi buffers.
225 * 251 *
226 * The buffers are flushed automatically via IRQ work. This function 252 * The buffers are flushed automatically via IRQ work. This function
227 * is useful only when someone wants to be sure that all buffers have 253 * is useful only when someone wants to be sure that all buffers have
228 * been flushed at some point. 254 * been flushed at some point.
229 */ 255 */
230void printk_nmi_flush(void) 256void printk_safe_flush(void)
231{ 257{
232 int cpu; 258 int cpu;
233 259
234 for_each_possible_cpu(cpu) 260 for_each_possible_cpu(cpu) {
235 __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); 261#ifdef CONFIG_PRINTK_NMI
262 __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
263#endif
264 __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
265 }
236} 266}
237 267
238/** 268/**
239 * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system 269 * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
240 * goes down. 270 * goes down.
241 * 271 *
242 * Similar to printk_nmi_flush() but it can be called even in NMI context when 272 * Similar to printk_safe_flush() but it can be called even in NMI context when
243 * the system goes down. It does the best effort to get NMI messages into 273 * the system goes down. It does the best effort to get NMI messages into
244 * the main ring buffer. 274 * the main ring buffer.
245 * 275 *
246 * Note that it could try harder when there is only one CPU online. 276 * Note that it could try harder when there is only one CPU online.
247 */ 277 */
248void printk_nmi_flush_on_panic(void) 278void printk_safe_flush_on_panic(void)
249{ 279{
250 /* 280 /*
251 * Make sure that we could access the main ring buffer. 281 * Make sure that we could access the main ring buffer.
@@ -259,33 +289,97 @@ void printk_nmi_flush_on_panic(void)
259 raw_spin_lock_init(&logbuf_lock); 289 raw_spin_lock_init(&logbuf_lock);
260 } 290 }
261 291
262 printk_nmi_flush(); 292 printk_safe_flush();
263} 293}
264 294
265void __init printk_nmi_init(void) 295#ifdef CONFIG_PRINTK_NMI
296/*
297 * Safe printk() for NMI context. It uses a per-CPU buffer to
298 * store the message. NMIs are not nested, so there is always only
299 * one writer running. But the buffer might get flushed from another
300 * CPU, so we need to be careful.
301 */
302static int vprintk_nmi(const char *fmt, va_list args)
266{ 303{
267 int cpu; 304 struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
268 305
269 for_each_possible_cpu(cpu) { 306 return printk_safe_log_store(s, fmt, args);
270 struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); 307}
271 308
272 init_irq_work(&s->work, __printk_nmi_flush); 309void printk_nmi_enter(void)
273 } 310{
311 this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
312}
274 313
275 /* Make sure that IRQ works are initialized before enabling. */ 314void printk_nmi_exit(void)
276 smp_wmb(); 315{
277 printk_nmi_irq_ready = 1; 316 this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
317}
278 318
279 /* Flush pending messages that did not have scheduled IRQ works. */ 319#else
280 printk_nmi_flush(); 320
321static int vprintk_nmi(const char *fmt, va_list args)
322{
323 return 0;
281} 324}
282 325
283void printk_nmi_enter(void) 326#endif /* CONFIG_PRINTK_NMI */
327
328/*
329 * Lock-less printk(), to avoid deadlocks should the printk() recurse
330 * into itself. It uses a per-CPU buffer to store the message, just like
331 * NMI.
332 */
333static int vprintk_safe(const char *fmt, va_list args)
284{ 334{
285 this_cpu_write(printk_func, vprintk_nmi); 335 struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
336
337 return printk_safe_log_store(s, fmt, args);
286} 338}
287 339
288void printk_nmi_exit(void) 340/* Can be preempted by NMI. */
341void __printk_safe_enter(void)
342{
343 this_cpu_inc(printk_context);
344}
345
346/* Can be preempted by NMI. */
347void __printk_safe_exit(void)
289{ 348{
290 this_cpu_write(printk_func, vprintk_default); 349 this_cpu_dec(printk_context);
350}
351
352__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
353{
354 if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
355 return vprintk_nmi(fmt, args);
356
357 if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
358 return vprintk_safe(fmt, args);
359
360 return vprintk_default(fmt, args);
361}
362
363void __init printk_safe_init(void)
364{
365 int cpu;
366
367 for_each_possible_cpu(cpu) {
368 struct printk_safe_seq_buf *s;
369
370 s = &per_cpu(safe_print_seq, cpu);
371 init_irq_work(&s->work, __printk_safe_flush);
372
373#ifdef CONFIG_PRINTK_NMI
374 s = &per_cpu(nmi_print_seq, cpu);
375 init_irq_work(&s->work, __printk_safe_flush);
376#endif
377 }
378
379 /* Make sure that IRQ works are initialized before enabling. */
380 smp_wmb();
381 printk_safe_irq_ready = 1;
382
383 /* Flush pending messages that did not have scheduled IRQ works. */
384 printk_safe_flush();
291} 385}
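
vprintk_func() picks the backend from the per-CPU printk_context word: the NMI bit wins, then the safe-context counter, then vprintk_default(). The NMI bit is set and cleared by printk_nmi_enter()/printk_nmi_exit(), which the kernel invokes from the generic NMI entry/exit path, so the handler below is only an illustration of the effect:

	/* Hypothetical handler; only printk_nmi_enter()/printk_nmi_exit() and the
	 * buffering behaviour described in the comments come from this patch. */
	static void example_nmi_handler(struct pt_regs *regs)
	{
		printk_nmi_enter();		/* set PRINTK_NMI_CONTEXT_MASK for this CPU */

		/* stored in nmi_print_seq, flushed to log_buf later via irq_work */
		pr_warn("example NMI on CPU%d\n", smp_processor_id());

		printk_nmi_exit();		/* clear the NMI bit again */
	}
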
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 75554754eadf..5f7999eacad5 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -77,7 +77,7 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
77 * Force flush any remote buffers that might be stuck in IRQ context 77 * Force flush any remote buffers that might be stuck in IRQ context
78 * and therefore could not run their irq_work. 78 * and therefore could not run their irq_work.
79 */ 79 */
80 printk_nmi_flush(); 80 printk_safe_flush();
81 81
82 clear_bit_unlock(0, &backtrace_flag); 82 clear_bit_unlock(0, &backtrace_flag);
83 put_cpu(); 83 put_cpu();