aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/binfmt_elf.c238
-rw-r--r--fs/btrfs/check-integrity.c163
-rw-r--r--fs/btrfs/compression.c18
-rw-r--r--fs/btrfs/ctree.c2
-rw-r--r--fs/btrfs/ctree.h85
-rw-r--r--fs/btrfs/dev-replace.c32
-rw-r--r--fs/btrfs/dir-item.c10
-rw-r--r--fs/btrfs/disk-io.c49
-rw-r--r--fs/btrfs/extent-tree.c211
-rw-r--r--fs/btrfs/extent_io.c41
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/file.c51
-rw-r--r--fs/btrfs/free-space-cache.c117
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode-map.c4
-rw-r--r--fs/btrfs/inode.c152
-rw-r--r--fs/btrfs/ioctl.c36
-rw-r--r--fs/btrfs/ordered-data.c49
-rw-r--r--fs/btrfs/ordered-data.h12
-rw-r--r--fs/btrfs/raid56.c763
-rw-r--r--fs/btrfs/raid56.h16
-rw-r--r--fs/btrfs/scrub.c893
-rw-r--r--fs/btrfs/send.c49
-rw-r--r--fs/btrfs/super.c94
-rw-r--r--fs/btrfs/sysfs.c34
-rw-r--r--fs/btrfs/transaction.c166
-rw-r--r--fs/btrfs/transaction.h6
-rw-r--r--fs/btrfs/tree-log.c50
-rw-r--r--fs/btrfs/volumes.c90
-rw-r--r--fs/btrfs/volumes.h32
-rw-r--r--fs/btrfs/xattr.c150
-rw-r--r--fs/ext4/ext4.h41
-rw-r--r--fs/ext4/extents.c223
-rw-r--r--fs/ext4/extents_status.c321
-rw-r--r--fs/ext4/extents_status.h82
-rw-r--r--fs/ext4/file.c220
-rw-r--r--fs/ext4/inline.c35
-rw-r--r--fs/ext4/inode.c37
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c15
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c8
-rw-r--r--fs/ext4/namei.c1
-rw-r--r--fs/ext4/resize.c6
-rw-r--r--fs/ext4/super.c51
-rw-r--r--fs/jbd2/journal.c3
-rw-r--r--fs/pstore/ram.c13
-rw-r--r--fs/pstore/ram_core.c31
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/ubifs/file.c1
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/xfs/libxfs/xfs_ag.h281
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c1
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c1
-rw-r--r--fs/xfs/libxfs/xfs_attr.c3
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c77
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c2
-rw-r--r--fs/xfs/libxfs/xfs_dinode.h243
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c20
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h140
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c11
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h140
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c13
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c2
-rw-r--r--fs/xfs/libxfs/xfs_format.h1107
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c43
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c3
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c4
-rw-r--r--fs/xfs/libxfs/xfs_inum.h60
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h2
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c3
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.h584
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c2
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_acl.h36
-rw-r--r--fs/xfs/xfs_aops.c3
-rw-r--r--fs/xfs/xfs_attr_inactive.c3
-rw-r--r--fs/xfs/xfs_attr_list.c3
-rw-r--r--fs/xfs/xfs_bmap_util.c3
-rw-r--r--fs/xfs/xfs_buf.c27
-rw-r--r--fs/xfs/xfs_buf.h3
-rw-r--r--fs/xfs/xfs_buf_item.c2
-rw-r--r--fs/xfs/xfs_dir2_readdir.c21
-rw-r--r--fs/xfs/xfs_discard.c1
-rw-r--r--fs/xfs/xfs_dquot.c2
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_export.c3
-rw-r--r--fs/xfs/xfs_extent_busy.c1
-rw-r--r--fs/xfs/xfs_extfree_item.c3
-rw-r--r--fs/xfs/xfs_file.c9
-rw-r--r--fs/xfs/xfs_filestream.c3
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_icache.h8
-rw-r--r--fs/xfs/xfs_icreate_item.c3
-rw-r--r--fs/xfs/xfs_inode.c29
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.c3
-rw-r--r--fs/xfs/xfs_ioctl.c3
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c18
-rw-r--r--fs/xfs/xfs_iops.c5
-rw-r--r--fs/xfs/xfs_itable.c6
-rw-r--r--fs/xfs/xfs_linux.h6
-rw-r--r--fs/xfs/xfs_log.c8
-rw-r--r--fs/xfs/xfs_log_cil.c3
-rw-r--r--fs/xfs/xfs_log_recover.c4
-rw-r--r--fs/xfs/xfs_message.c3
-rw-r--r--fs/xfs/xfs_mount.c33
-rw-r--r--fs/xfs/xfs_mount.h8
-rw-r--r--fs/xfs/xfs_qm.c14
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c27
-rw-r--r--fs/xfs/xfs_quotaops.c2
-rw-r--r--fs/xfs/xfs_rtalloc.c3
-rw-r--r--fs/xfs/xfs_super.c19
-rw-r--r--fs/xfs/xfs_symlink.c3
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans_ail.c3
-rw-r--r--fs/xfs/xfs_trans_buf.c137
-rw-r--r--fs/xfs/xfs_trans_dquot.c2
-rw-r--r--fs/xfs/xfs_trans_extfree.c3
-rw-r--r--fs/xfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/xfs_xattr.c2
142 files changed, 4953 insertions, 3071 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 370b24cee4d8..c055d56ec63d 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -30,6 +30,9 @@ config COMPAT_BINFMT_ELF
30config ARCH_BINFMT_ELF_RANDOMIZE_PIE 30config ARCH_BINFMT_ELF_RANDOMIZE_PIE
31 bool 31 bool
32 32
33config ARCH_BINFMT_ELF_STATE
34 bool
35
33config BINFMT_ELF_FDPIC 36config BINFMT_ELF_FDPIC
34 bool "Kernel support for FDPIC ELF binaries" 37 bool "Kernel support for FDPIC ELF binaries"
35 default y 38 default y
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3a6175fe10c0..02b16910f4c9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -386,6 +386,127 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
386 ELF_PAGESTART(cmds[first_idx].p_vaddr); 386 ELF_PAGESTART(cmds[first_idx].p_vaddr);
387} 387}
388 388
389/**
390 * load_elf_phdrs() - load ELF program headers
391 * @elf_ex: ELF header of the binary whose program headers should be loaded
392 * @elf_file: the opened ELF binary file
393 *
394 * Loads ELF program headers from the binary file elf_file, which has the ELF
395 * header pointed to by elf_ex, into a newly allocated array. The caller is
396 * responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
397 */
398static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
399 struct file *elf_file)
400{
401 struct elf_phdr *elf_phdata = NULL;
402 int retval, size, err = -1;
403
404 /*
405 * If the size of this structure has changed, then punt, since
406 * we will be doing the wrong thing.
407 */
408 if (elf_ex->e_phentsize != sizeof(struct elf_phdr))
409 goto out;
410
411 /* Sanity check the number of program headers... */
412 if (elf_ex->e_phnum < 1 ||
413 elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
414 goto out;
415
416 /* ...and their total size. */
417 size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
418 if (size > ELF_MIN_ALIGN)
419 goto out;
420
421 elf_phdata = kmalloc(size, GFP_KERNEL);
422 if (!elf_phdata)
423 goto out;
424
425 /* Read in the program headers */
426 retval = kernel_read(elf_file, elf_ex->e_phoff,
427 (char *)elf_phdata, size);
428 if (retval != size) {
429 err = (retval < 0) ? retval : -EIO;
430 goto out;
431 }
432
433 /* Success! */
434 err = 0;
435out:
436 if (err) {
437 kfree(elf_phdata);
438 elf_phdata = NULL;
439 }
440 return elf_phdata;
441}
442
443#ifndef CONFIG_ARCH_BINFMT_ELF_STATE
444
445/**
446 * struct arch_elf_state - arch-specific ELF loading state
447 *
448 * This structure is used to preserve architecture specific data during
449 * the loading of an ELF file, throughout the checking of architecture
450 * specific ELF headers & through to the point where the ELF load is
451 * known to be proceeding (ie. SET_PERSONALITY).
452 *
453 * This implementation is a dummy for architectures which require no
454 * specific state.
455 */
456struct arch_elf_state {
457};
458
459#define INIT_ARCH_ELF_STATE {}
460
461/**
462 * arch_elf_pt_proc() - check a PT_LOPROC..PT_HIPROC ELF program header
463 * @ehdr: The main ELF header
464 * @phdr: The program header to check
465 * @elf: The open ELF file
466 * @is_interp: True if the phdr is from the interpreter of the ELF being
467 * loaded, else false.
468 * @state: Architecture-specific state preserved throughout the process
469 * of loading the ELF.
470 *
471 * Inspects the program header phdr to validate its correctness and/or
472 * suitability for the system. Called once per ELF program header in the
473 * range PT_LOPROC to PT_HIPROC, for both the ELF being loaded and its
474 * interpreter.
475 *
476 * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load
477 * with that return code.
478 */
479static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
480 struct elf_phdr *phdr,
481 struct file *elf, bool is_interp,
482 struct arch_elf_state *state)
483{
484 /* Dummy implementation, always proceed */
485 return 0;
486}
487
488/**
489 * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header
490 * @ehdr: The main ELF header
491 * @has_interp: True if the ELF has an interpreter, else false.
492 * @state: Architecture-specific state preserved throughout the process
493 * of loading the ELF.
494 *
495 * Provides a final opportunity for architecture code to reject the loading
496 * of the ELF & cause an exec syscall to return an error. This is called after
497 * all program headers to be checked by arch_elf_pt_proc have been.
498 *
499 * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load
500 * with that return code.
501 */
502static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp,
503 struct arch_elf_state *state)
504{
505 /* Dummy implementation, always proceed */
506 return 0;
507}
508
509#endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */
389 510
390/* This is much more generalized than the library routine read function, 511/* This is much more generalized than the library routine read function,
391 so we keep this separate. Technically the library read function 512 so we keep this separate. Technically the library read function
@@ -394,16 +515,15 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
394 515
395static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, 516static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
396 struct file *interpreter, unsigned long *interp_map_addr, 517 struct file *interpreter, unsigned long *interp_map_addr,
397 unsigned long no_base) 518 unsigned long no_base, struct elf_phdr *interp_elf_phdata)
398{ 519{
399 struct elf_phdr *elf_phdata;
400 struct elf_phdr *eppnt; 520 struct elf_phdr *eppnt;
401 unsigned long load_addr = 0; 521 unsigned long load_addr = 0;
402 int load_addr_set = 0; 522 int load_addr_set = 0;
403 unsigned long last_bss = 0, elf_bss = 0; 523 unsigned long last_bss = 0, elf_bss = 0;
404 unsigned long error = ~0UL; 524 unsigned long error = ~0UL;
405 unsigned long total_size; 525 unsigned long total_size;
406 int retval, i, size; 526 int i;
407 527
408 /* First of all, some simple consistency checks */ 528 /* First of all, some simple consistency checks */
409 if (interp_elf_ex->e_type != ET_EXEC && 529 if (interp_elf_ex->e_type != ET_EXEC &&
@@ -414,40 +534,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
414 if (!interpreter->f_op->mmap) 534 if (!interpreter->f_op->mmap)
415 goto out; 535 goto out;
416 536
417 /* 537 total_size = total_mapping_size(interp_elf_phdata,
418 * If the size of this structure has changed, then punt, since 538 interp_elf_ex->e_phnum);
419 * we will be doing the wrong thing.
420 */
421 if (interp_elf_ex->e_phentsize != sizeof(struct elf_phdr))
422 goto out;
423 if (interp_elf_ex->e_phnum < 1 ||
424 interp_elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
425 goto out;
426
427 /* Now read in all of the header information */
428 size = sizeof(struct elf_phdr) * interp_elf_ex->e_phnum;
429 if (size > ELF_MIN_ALIGN)
430 goto out;
431 elf_phdata = kmalloc(size, GFP_KERNEL);
432 if (!elf_phdata)
433 goto out;
434
435 retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
436 (char *)elf_phdata, size);
437 error = -EIO;
438 if (retval != size) {
439 if (retval < 0)
440 error = retval;
441 goto out_close;
442 }
443
444 total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
445 if (!total_size) { 539 if (!total_size) {
446 error = -EINVAL; 540 error = -EINVAL;
447 goto out_close; 541 goto out;
448 } 542 }
449 543
450 eppnt = elf_phdata; 544 eppnt = interp_elf_phdata;
451 for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { 545 for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
452 if (eppnt->p_type == PT_LOAD) { 546 if (eppnt->p_type == PT_LOAD) {
453 int elf_type = MAP_PRIVATE | MAP_DENYWRITE; 547 int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
@@ -474,7 +568,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
474 *interp_map_addr = map_addr; 568 *interp_map_addr = map_addr;
475 error = map_addr; 569 error = map_addr;
476 if (BAD_ADDR(map_addr)) 570 if (BAD_ADDR(map_addr))
477 goto out_close; 571 goto out;
478 572
479 if (!load_addr_set && 573 if (!load_addr_set &&
480 interp_elf_ex->e_type == ET_DYN) { 574 interp_elf_ex->e_type == ET_DYN) {
@@ -493,7 +587,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
493 eppnt->p_memsz > TASK_SIZE || 587 eppnt->p_memsz > TASK_SIZE ||
494 TASK_SIZE - eppnt->p_memsz < k) { 588 TASK_SIZE - eppnt->p_memsz < k) {
495 error = -ENOMEM; 589 error = -ENOMEM;
496 goto out_close; 590 goto out;
497 } 591 }
498 592
499 /* 593 /*
@@ -523,7 +617,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
523 */ 617 */
524 if (padzero(elf_bss)) { 618 if (padzero(elf_bss)) {
525 error = -EFAULT; 619 error = -EFAULT;
526 goto out_close; 620 goto out;
527 } 621 }
528 622
529 /* What we have mapped so far */ 623 /* What we have mapped so far */
@@ -532,13 +626,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
532 /* Map the last of the bss segment */ 626 /* Map the last of the bss segment */
533 error = vm_brk(elf_bss, last_bss - elf_bss); 627 error = vm_brk(elf_bss, last_bss - elf_bss);
534 if (BAD_ADDR(error)) 628 if (BAD_ADDR(error))
535 goto out_close; 629 goto out;
536 } 630 }
537 631
538 error = load_addr; 632 error = load_addr;
539
540out_close:
541 kfree(elf_phdata);
542out: 633out:
543 return error; 634 return error;
544} 635}
@@ -575,10 +666,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
575 int load_addr_set = 0; 666 int load_addr_set = 0;
576 char * elf_interpreter = NULL; 667 char * elf_interpreter = NULL;
577 unsigned long error; 668 unsigned long error;
578 struct elf_phdr *elf_ppnt, *elf_phdata; 669 struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
579 unsigned long elf_bss, elf_brk; 670 unsigned long elf_bss, elf_brk;
580 int retval, i; 671 int retval, i;
581 unsigned int size;
582 unsigned long elf_entry; 672 unsigned long elf_entry;
583 unsigned long interp_load_addr = 0; 673 unsigned long interp_load_addr = 0;
584 unsigned long start_code, end_code, start_data, end_data; 674 unsigned long start_code, end_code, start_data, end_data;
@@ -589,6 +679,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
589 struct elfhdr elf_ex; 679 struct elfhdr elf_ex;
590 struct elfhdr interp_elf_ex; 680 struct elfhdr interp_elf_ex;
591 } *loc; 681 } *loc;
682 struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
592 683
593 loc = kmalloc(sizeof(*loc), GFP_KERNEL); 684 loc = kmalloc(sizeof(*loc), GFP_KERNEL);
594 if (!loc) { 685 if (!loc) {
@@ -611,26 +702,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
611 if (!bprm->file->f_op->mmap) 702 if (!bprm->file->f_op->mmap)
612 goto out; 703 goto out;
613 704
614 /* Now read in all of the header information */ 705 elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file);
615 if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr))
616 goto out;
617 if (loc->elf_ex.e_phnum < 1 ||
618 loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr))
619 goto out;
620 size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
621 retval = -ENOMEM;
622 elf_phdata = kmalloc(size, GFP_KERNEL);
623 if (!elf_phdata) 706 if (!elf_phdata)
624 goto out; 707 goto out;
625 708
626 retval = kernel_read(bprm->file, loc->elf_ex.e_phoff,
627 (char *)elf_phdata, size);
628 if (retval != size) {
629 if (retval >= 0)
630 retval = -EIO;
631 goto out_free_ph;
632 }
633
634 elf_ppnt = elf_phdata; 709 elf_ppnt = elf_phdata;
635 elf_bss = 0; 710 elf_bss = 0;
636 elf_brk = 0; 711 elf_brk = 0;
@@ -699,12 +774,21 @@ static int load_elf_binary(struct linux_binprm *bprm)
699 774
700 elf_ppnt = elf_phdata; 775 elf_ppnt = elf_phdata;
701 for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) 776 for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
702 if (elf_ppnt->p_type == PT_GNU_STACK) { 777 switch (elf_ppnt->p_type) {
778 case PT_GNU_STACK:
703 if (elf_ppnt->p_flags & PF_X) 779 if (elf_ppnt->p_flags & PF_X)
704 executable_stack = EXSTACK_ENABLE_X; 780 executable_stack = EXSTACK_ENABLE_X;
705 else 781 else
706 executable_stack = EXSTACK_DISABLE_X; 782 executable_stack = EXSTACK_DISABLE_X;
707 break; 783 break;
784
785 case PT_LOPROC ... PT_HIPROC:
786 retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt,
787 bprm->file, false,
788 &arch_state);
789 if (retval)
790 goto out_free_dentry;
791 break;
708 } 792 }
709 793
710 /* Some simple consistency checks for the interpreter */ 794 /* Some simple consistency checks for the interpreter */
@@ -716,8 +800,36 @@ static int load_elf_binary(struct linux_binprm *bprm)
716 /* Verify the interpreter has a valid arch */ 800 /* Verify the interpreter has a valid arch */
717 if (!elf_check_arch(&loc->interp_elf_ex)) 801 if (!elf_check_arch(&loc->interp_elf_ex))
718 goto out_free_dentry; 802 goto out_free_dentry;
803
804 /* Load the interpreter program headers */
805 interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex,
806 interpreter);
807 if (!interp_elf_phdata)
808 goto out_free_dentry;
809
810 /* Pass PT_LOPROC..PT_HIPROC headers to arch code */
811 elf_ppnt = interp_elf_phdata;
812 for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++)
813 switch (elf_ppnt->p_type) {
814 case PT_LOPROC ... PT_HIPROC:
815 retval = arch_elf_pt_proc(&loc->interp_elf_ex,
816 elf_ppnt, interpreter,
817 true, &arch_state);
818 if (retval)
819 goto out_free_dentry;
820 break;
821 }
719 } 822 }
720 823
824 /*
825 * Allow arch code to reject the ELF at this point, whilst it's
826 * still possible to return an error to the code that invoked
827 * the exec syscall.
828 */
829 retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state);
830 if (retval)
831 goto out_free_dentry;
832
721 /* Flush all traces of the currently running executable */ 833 /* Flush all traces of the currently running executable */
722 retval = flush_old_exec(bprm); 834 retval = flush_old_exec(bprm);
723 if (retval) 835 if (retval)
@@ -725,7 +837,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
725 837
726 /* Do this immediately, since STACK_TOP as used in setup_arg_pages 838 /* Do this immediately, since STACK_TOP as used in setup_arg_pages
727 may depend on the personality. */ 839 may depend on the personality. */
728 SET_PERSONALITY(loc->elf_ex); 840 SET_PERSONALITY2(loc->elf_ex, &arch_state);
729 if (elf_read_implies_exec(loc->elf_ex, executable_stack)) 841 if (elf_read_implies_exec(loc->elf_ex, executable_stack))
730 current->personality |= READ_IMPLIES_EXEC; 842 current->personality |= READ_IMPLIES_EXEC;
731 843
@@ -890,7 +1002,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
890 elf_entry = load_elf_interp(&loc->interp_elf_ex, 1002 elf_entry = load_elf_interp(&loc->interp_elf_ex,
891 interpreter, 1003 interpreter,
892 &interp_map_addr, 1004 &interp_map_addr,
893 load_bias); 1005 load_bias, interp_elf_phdata);
894 if (!IS_ERR((void *)elf_entry)) { 1006 if (!IS_ERR((void *)elf_entry)) {
895 /* 1007 /*
896 * load_elf_interp() returns relocation 1008 * load_elf_interp() returns relocation
@@ -917,6 +1029,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
917 } 1029 }
918 } 1030 }
919 1031
1032 kfree(interp_elf_phdata);
920 kfree(elf_phdata); 1033 kfree(elf_phdata);
921 1034
922 set_binfmt(&elf_format); 1035 set_binfmt(&elf_format);
@@ -981,6 +1094,7 @@ out_ret:
981 1094
982 /* error cleanup */ 1095 /* error cleanup */
983out_free_dentry: 1096out_free_dentry:
1097 kfree(interp_elf_phdata);
984 allow_write_access(interpreter); 1098 allow_write_access(interpreter);
985 if (interpreter) 1099 if (interpreter)
986 fput(interpreter); 1100 fput(interpreter);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index cb7f3fe9c9f6..d897ef803b3b 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -94,6 +94,7 @@
94#include <linux/mutex.h> 94#include <linux/mutex.h>
95#include <linux/genhd.h> 95#include <linux/genhd.h>
96#include <linux/blkdev.h> 96#include <linux/blkdev.h>
97#include <linux/vmalloc.h>
97#include "ctree.h" 98#include "ctree.h"
98#include "disk-io.h" 99#include "disk-io.h"
99#include "hash.h" 100#include "hash.h"
@@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state,
326static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, 327static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
327 struct btrfsic_block_data_ctx *block_ctx_out, 328 struct btrfsic_block_data_ctx *block_ctx_out,
328 int mirror_num); 329 int mirror_num);
329static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
330 u32 len, struct block_device *bdev,
331 struct btrfsic_block_data_ctx *block_ctx_out);
332static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); 330static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
333static int btrfsic_read_block(struct btrfsic_state *state, 331static int btrfsic_read_block(struct btrfsic_state *state,
334 struct btrfsic_block_data_ctx *block_ctx); 332 struct btrfsic_block_data_ctx *block_ctx);
@@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block(
1326 l = NULL; 1324 l = NULL;
1327 next_block->generation = BTRFSIC_GENERATION_UNKNOWN; 1325 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
1328 } else { 1326 } else {
1329 if (next_block->logical_bytenr != next_bytenr && 1327 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
1330 !(!next_block->is_metadata && 1328 if (next_block->logical_bytenr != next_bytenr &&
1331 0 == next_block->logical_bytenr)) { 1329 !(!next_block->is_metadata &&
1332 printk(KERN_INFO 1330 0 == next_block->logical_bytenr))
1333 "Referenced block @%llu (%s/%llu/%d)" 1331 printk(KERN_INFO
1334 " found in hash table, %c," 1332 "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
1335 " bytenr mismatch (!= stored %llu).\n", 1333 next_bytenr, next_block_ctx->dev->name,
1336 next_bytenr, next_block_ctx->dev->name, 1334 next_block_ctx->dev_bytenr, *mirror_nump,
1337 next_block_ctx->dev_bytenr, *mirror_nump, 1335 btrfsic_get_block_type(state,
1338 btrfsic_get_block_type(state, next_block), 1336 next_block),
1339 next_block->logical_bytenr); 1337 next_block->logical_bytenr);
1340 } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 1338 else
1341 printk(KERN_INFO 1339 printk(KERN_INFO
1342 "Referenced block @%llu (%s/%llu/%d)" 1340 "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
1343 " found in hash table, %c.\n", 1341 next_bytenr, next_block_ctx->dev->name,
1344 next_bytenr, next_block_ctx->dev->name, 1342 next_block_ctx->dev_bytenr, *mirror_nump,
1345 next_block_ctx->dev_bytenr, *mirror_nump, 1343 btrfsic_get_block_type(state,
1346 btrfsic_get_block_type(state, next_block)); 1344 next_block));
1345 }
1347 next_block->logical_bytenr = next_bytenr; 1346 next_block->logical_bytenr = next_bytenr;
1348 1347
1349 next_block->mirror_num = *mirror_nump; 1348 next_block->mirror_num = *mirror_nump;
@@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data(
1529 return -1; 1528 return -1;
1530 } 1529 }
1531 if (!block_was_created) { 1530 if (!block_was_created) {
1532 if (next_block->logical_bytenr != next_bytenr && 1531 if ((state->print_mask &
1532 BTRFSIC_PRINT_MASK_VERBOSE) &&
1533 next_block->logical_bytenr != next_bytenr &&
1533 !(!next_block->is_metadata && 1534 !(!next_block->is_metadata &&
1534 0 == next_block->logical_bytenr)) { 1535 0 == next_block->logical_bytenr)) {
1535 printk(KERN_INFO 1536 printk(KERN_INFO
@@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1607 return ret; 1608 return ret;
1608} 1609}
1609 1610
1610static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1611 u32 len, struct block_device *bdev,
1612 struct btrfsic_block_data_ctx *block_ctx_out)
1613{
1614 block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
1615 block_ctx_out->dev_bytenr = bytenr;
1616 block_ctx_out->start = bytenr;
1617 block_ctx_out->len = len;
1618 block_ctx_out->datav = NULL;
1619 block_ctx_out->pagev = NULL;
1620 block_ctx_out->mem_to_free = NULL;
1621 if (NULL != block_ctx_out->dev) {
1622 return 0;
1623 } else {
1624 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
1625 return -ENXIO;
1626 }
1627}
1628
1629static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) 1611static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1630{ 1612{
1631 if (block_ctx->mem_to_free) { 1613 if (block_ctx->mem_to_free) {
@@ -1901,25 +1883,26 @@ again:
1901 dev_state, 1883 dev_state,
1902 dev_bytenr); 1884 dev_bytenr);
1903 } 1885 }
1904 if (block->logical_bytenr != bytenr && 1886 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
1905 !(!block->is_metadata && 1887 if (block->logical_bytenr != bytenr &&
1906 block->logical_bytenr == 0)) 1888 !(!block->is_metadata &&
1907 printk(KERN_INFO 1889 block->logical_bytenr == 0))
1908 "Written block @%llu (%s/%llu/%d)" 1890 printk(KERN_INFO
1909 " found in hash table, %c," 1891 "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
1910 " bytenr mismatch" 1892 bytenr, dev_state->name,
1911 " (!= stored %llu).\n", 1893 dev_bytenr,
1912 bytenr, dev_state->name, dev_bytenr, 1894 block->mirror_num,
1913 block->mirror_num, 1895 btrfsic_get_block_type(state,
1914 btrfsic_get_block_type(state, block), 1896 block),
1915 block->logical_bytenr); 1897 block->logical_bytenr);
1916 else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 1898 else
1917 printk(KERN_INFO 1899 printk(KERN_INFO
1918 "Written block @%llu (%s/%llu/%d)" 1900 "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
1919 " found in hash table, %c.\n", 1901 bytenr, dev_state->name,
1920 bytenr, dev_state->name, dev_bytenr, 1902 dev_bytenr, block->mirror_num,
1921 block->mirror_num, 1903 btrfsic_get_block_type(state,
1922 btrfsic_get_block_type(state, block)); 1904 block));
1905 }
1923 block->logical_bytenr = bytenr; 1906 block->logical_bytenr = bytenr;
1924 } else { 1907 } else {
1925 if (num_pages * PAGE_CACHE_SIZE < 1908 if (num_pages * PAGE_CACHE_SIZE <
@@ -2002,24 +1985,13 @@ again:
2002 } 1985 }
2003 } 1986 }
2004 1987
2005 if (block->is_superblock)
2006 ret = btrfsic_map_superblock(state, bytenr,
2007 processed_len,
2008 bdev, &block_ctx);
2009 else
2010 ret = btrfsic_map_block(state, bytenr, processed_len,
2011 &block_ctx, 0);
2012 if (ret) {
2013 printk(KERN_INFO
2014 "btrfsic: btrfsic_map_block(root @%llu)"
2015 " failed!\n", bytenr);
2016 goto continue_loop;
2017 }
2018 block_ctx.datav = mapped_datav;
2019 /* the following is required in case of writes to mirrors,
2020 * use the same that was used for the lookup */
2021 block_ctx.dev = dev_state; 1988 block_ctx.dev = dev_state;
2022 block_ctx.dev_bytenr = dev_bytenr; 1989 block_ctx.dev_bytenr = dev_bytenr;
1990 block_ctx.start = bytenr;
1991 block_ctx.len = processed_len;
1992 block_ctx.pagev = NULL;
1993 block_ctx.mem_to_free = NULL;
1994 block_ctx.datav = mapped_datav;
2023 1995
2024 if (is_metadata || state->include_extent_data) { 1996 if (is_metadata || state->include_extent_data) {
2025 block->never_written = 0; 1997 block->never_written = 0;
@@ -2133,10 +2105,6 @@ again:
2133 /* this is getting ugly for the 2105 /* this is getting ugly for the
2134 * include_extent_data case... */ 2106 * include_extent_data case... */
2135 bytenr = 0; /* unknown */ 2107 bytenr = 0; /* unknown */
2136 block_ctx.start = bytenr;
2137 block_ctx.len = processed_len;
2138 block_ctx.mem_to_free = NULL;
2139 block_ctx.pagev = NULL;
2140 } else { 2108 } else {
2141 processed_len = state->metablock_size; 2109 processed_len = state->metablock_size;
2142 bytenr = btrfs_stack_header_bytenr( 2110 bytenr = btrfs_stack_header_bytenr(
@@ -2149,22 +2117,15 @@ again:
2149 "Written block @%llu (%s/%llu/?)" 2117 "Written block @%llu (%s/%llu/?)"
2150 " !found in hash table, M.\n", 2118 " !found in hash table, M.\n",
2151 bytenr, dev_state->name, dev_bytenr); 2119 bytenr, dev_state->name, dev_bytenr);
2152
2153 ret = btrfsic_map_block(state, bytenr, processed_len,
2154 &block_ctx, 0);
2155 if (ret) {
2156 printk(KERN_INFO
2157 "btrfsic: btrfsic_map_block(root @%llu)"
2158 " failed!\n",
2159 dev_bytenr);
2160 goto continue_loop;
2161 }
2162 } 2120 }
2163 block_ctx.datav = mapped_datav; 2121
2164 /* the following is required in case of writes to mirrors,
2165 * use the same that was used for the lookup */
2166 block_ctx.dev = dev_state; 2122 block_ctx.dev = dev_state;
2167 block_ctx.dev_bytenr = dev_bytenr; 2123 block_ctx.dev_bytenr = dev_bytenr;
2124 block_ctx.start = bytenr;
2125 block_ctx.len = processed_len;
2126 block_ctx.pagev = NULL;
2127 block_ctx.mem_to_free = NULL;
2128 block_ctx.datav = mapped_datav;
2168 2129
2169 block = btrfsic_block_alloc(); 2130 block = btrfsic_block_alloc();
2170 if (NULL == block) { 2131 if (NULL == block) {
@@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root,
3130 root->sectorsize, PAGE_CACHE_SIZE); 3091 root->sectorsize, PAGE_CACHE_SIZE);
3131 return -1; 3092 return -1;
3132 } 3093 }
3133 state = kzalloc(sizeof(*state), GFP_NOFS); 3094 state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
3134 if (NULL == state) { 3095 if (!state) {
3135 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); 3096 state = vzalloc(sizeof(*state));
3136 return -1; 3097 if (!state) {
3098 printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
3099 return -1;
3100 }
3137 } 3101 }
3138 3102
3139 if (!btrfsic_is_initialized) { 3103 if (!btrfsic_is_initialized) {
@@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root,
3277 3241
3278 mutex_unlock(&btrfsic_mutex); 3242 mutex_unlock(&btrfsic_mutex);
3279 3243
3280 kfree(state); 3244 if (is_vmalloc_addr(state))
3245 vfree(state);
3246 else
3247 kfree(state);
3281} 3248}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index dcd9be32ac57..e9df8862012c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -224,16 +224,19 @@ out:
224 * Clear the writeback bits on all of the file 224 * Clear the writeback bits on all of the file
225 * pages for a compressed write 225 * pages for a compressed write
226 */ 226 */
227static noinline void end_compressed_writeback(struct inode *inode, u64 start, 227static noinline void end_compressed_writeback(struct inode *inode,
228 unsigned long ram_size) 228 const struct compressed_bio *cb)
229{ 229{
230 unsigned long index = start >> PAGE_CACHE_SHIFT; 230 unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
231 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; 231 unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
232 struct page *pages[16]; 232 struct page *pages[16];
233 unsigned long nr_pages = end_index - index + 1; 233 unsigned long nr_pages = end_index - index + 1;
234 int i; 234 int i;
235 int ret; 235 int ret;
236 236
237 if (cb->errors)
238 mapping_set_error(inode->i_mapping, -EIO);
239
237 while (nr_pages > 0) { 240 while (nr_pages > 0) {
238 ret = find_get_pages_contig(inode->i_mapping, index, 241 ret = find_get_pages_contig(inode->i_mapping, index,
239 min_t(unsigned long, 242 min_t(unsigned long,
@@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
244 continue; 247 continue;
245 } 248 }
246 for (i = 0; i < ret; i++) { 249 for (i = 0; i < ret; i++) {
250 if (cb->errors)
251 SetPageError(pages[i]);
247 end_page_writeback(pages[i]); 252 end_page_writeback(pages[i]);
248 page_cache_release(pages[i]); 253 page_cache_release(pages[i]);
249 } 254 }
@@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err)
287 tree->ops->writepage_end_io_hook(cb->compressed_pages[0], 292 tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
288 cb->start, 293 cb->start,
289 cb->start + cb->len - 1, 294 cb->start + cb->len - 1,
290 NULL, 1); 295 NULL,
296 err ? 0 : 1);
291 cb->compressed_pages[0]->mapping = NULL; 297 cb->compressed_pages[0]->mapping = NULL;
292 298
293 end_compressed_writeback(inode, cb->start, cb->len); 299 end_compressed_writeback(inode, cb);
294 /* note, our inode could be gone now */ 300 /* note, our inode could be gone now */
295 301
296 /* 302 /*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 150822ee0a0b..14a72ed14ef7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2929,7 +2929,7 @@ done:
2929 */ 2929 */
2930 if (!p->leave_spinning) 2930 if (!p->leave_spinning)
2931 btrfs_set_path_blocking(p); 2931 btrfs_set_path_blocking(p);
2932 if (ret < 0) 2932 if (ret < 0 && !p->skip_release_on_error)
2933 btrfs_release_path(p); 2933 btrfs_release_path(p);
2934 return ret; 2934 return ret;
2935} 2935}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe69edda11fb..e6fbbd74b716 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -607,6 +607,7 @@ struct btrfs_path {
607 unsigned int leave_spinning:1; 607 unsigned int leave_spinning:1;
608 unsigned int search_commit_root:1; 608 unsigned int search_commit_root:1;
609 unsigned int need_commit_sem:1; 609 unsigned int need_commit_sem:1;
610 unsigned int skip_release_on_error:1;
610}; 611};
611 612
612/* 613/*
@@ -1170,6 +1171,7 @@ struct btrfs_space_info {
1170 struct percpu_counter total_bytes_pinned; 1171 struct percpu_counter total_bytes_pinned;
1171 1172
1172 struct list_head list; 1173 struct list_head list;
1174 struct list_head ro_bgs;
1173 1175
1174 struct rw_semaphore groups_sem; 1176 struct rw_semaphore groups_sem;
1175 /* for block groups in our same type */ 1177 /* for block groups in our same type */
@@ -1276,6 +1278,8 @@ struct btrfs_block_group_cache {
1276 unsigned int ro:1; 1278 unsigned int ro:1;
1277 unsigned int dirty:1; 1279 unsigned int dirty:1;
1278 unsigned int iref:1; 1280 unsigned int iref:1;
1281 unsigned int has_caching_ctl:1;
1282 unsigned int removed:1;
1279 1283
1280 int disk_cache_state; 1284 int disk_cache_state;
1281 1285
@@ -1305,6 +1309,11 @@ struct btrfs_block_group_cache {
1305 1309
1306 /* For delayed block group creation or deletion of empty block groups */ 1310 /* For delayed block group creation or deletion of empty block groups */
1307 struct list_head bg_list; 1311 struct list_head bg_list;
1312
1313 /* For read-only block groups */
1314 struct list_head ro_list;
1315
1316 atomic_t trimming;
1308}; 1317};
1309 1318
1310/* delayed seq elem */ 1319/* delayed seq elem */
@@ -1402,6 +1411,11 @@ struct btrfs_fs_info {
1402 */ 1411 */
1403 u64 last_trans_log_full_commit; 1412 u64 last_trans_log_full_commit;
1404 unsigned long mount_opt; 1413 unsigned long mount_opt;
1414 /*
1415 * Track requests for actions that need to be done during transaction
1416 * commit (like for some mount options).
1417 */
1418 unsigned long pending_changes;
1405 unsigned long compress_type:4; 1419 unsigned long compress_type:4;
1406 int commit_interval; 1420 int commit_interval;
1407 /* 1421 /*
@@ -1729,6 +1743,12 @@ struct btrfs_fs_info {
1729 1743
1730 /* For btrfs to record security options */ 1744 /* For btrfs to record security options */
1731 struct security_mnt_opts security_opts; 1745 struct security_mnt_opts security_opts;
1746
1747 /*
1748 * Chunks that can't be freed yet (under a trim/discard operation)
1749 * and will be latter freed. Protected by fs_info->chunk_mutex.
1750 */
1751 struct list_head pinned_chunks;
1732}; 1752};
1733 1753
1734struct btrfs_subvolume_writers { 1754struct btrfs_subvolume_writers {
@@ -2093,7 +2113,6 @@ struct btrfs_ioctl_defrag_range_args {
2093#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) 2113#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
2094#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) 2114#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
2095#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) 2115#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
2096#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
2097 2116
2098#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2117#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
2099#define BTRFS_DEFAULT_MAX_INLINE (8192) 2118#define BTRFS_DEFAULT_MAX_INLINE (8192)
@@ -2103,6 +2122,7 @@ struct btrfs_ioctl_defrag_range_args {
2103#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) 2122#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
2104#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ 2123#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
2105 BTRFS_MOUNT_##opt) 2124 BTRFS_MOUNT_##opt)
2125
2106#define btrfs_set_and_info(root, opt, fmt, args...) \ 2126#define btrfs_set_and_info(root, opt, fmt, args...) \
2107{ \ 2127{ \
2108 if (!btrfs_test_opt(root, opt)) \ 2128 if (!btrfs_test_opt(root, opt)) \
@@ -2118,6 +2138,49 @@ struct btrfs_ioctl_defrag_range_args {
2118} 2138}
2119 2139
2120/* 2140/*
2141 * Requests for changes that need to be done during transaction commit.
2142 *
2143 * Internal mount options that are used for special handling of the real
2144 * mount options (eg. cannot be set during remount and have to be set during
2145 * transaction commit)
2146 */
2147
2148#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0)
2149#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1)
2150#define BTRFS_PENDING_COMMIT (2)
2151
2152#define btrfs_test_pending(info, opt) \
2153 test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
2154#define btrfs_set_pending(info, opt) \
2155 set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
2156#define btrfs_clear_pending(info, opt) \
2157 clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
2158
2159/*
2160 * Helpers for setting pending mount option changes.
2161 *
2162 * Expects corresponding macros
2163 * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
2164 */
2165#define btrfs_set_pending_and_info(info, opt, fmt, args...) \
2166do { \
2167 if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \
2168 btrfs_info((info), fmt, ##args); \
2169 btrfs_set_pending((info), SET_##opt); \
2170 btrfs_clear_pending((info), CLEAR_##opt); \
2171 } \
2172} while(0)
2173
2174#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \
2175do { \
2176 if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \
2177 btrfs_info((info), fmt, ##args); \
2178 btrfs_set_pending((info), CLEAR_##opt); \
2179 btrfs_clear_pending((info), SET_##opt); \
2180 } \
2181} while(0)
2182
2183/*
2121 * Inode flags 2184 * Inode flags
2122 */ 2185 */
2123#define BTRFS_INODE_NODATASUM (1 << 0) 2186#define BTRFS_INODE_NODATASUM (1 << 0)
@@ -3351,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3351 u64 type, u64 chunk_objectid, u64 chunk_offset, 3414 u64 type, u64 chunk_objectid, u64 chunk_offset,
3352 u64 size); 3415 u64 size);
3353int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 3416int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
3354 struct btrfs_root *root, u64 group_start); 3417 struct btrfs_root *root, u64 group_start,
3418 struct extent_map *em);
3355void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); 3419void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
3356void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 3420void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
3357 struct btrfs_root *root); 3421 struct btrfs_root *root);
@@ -3427,8 +3491,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
3427int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3491int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
3428 struct btrfs_fs_info *fs_info); 3492 struct btrfs_fs_info *fs_info);
3429int __get_raid_index(u64 flags); 3493int __get_raid_index(u64 flags);
3430int btrfs_start_nocow_write(struct btrfs_root *root); 3494int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
3431void btrfs_end_nocow_write(struct btrfs_root *root); 3495void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
3432/* ctree.c */ 3496/* ctree.c */
3433int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3497int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
3434 int level, int *slot); 3498 int level, int *slot);
@@ -3686,6 +3750,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
3686int verify_dir_item(struct btrfs_root *root, 3750int verify_dir_item(struct btrfs_root *root,
3687 struct extent_buffer *leaf, 3751 struct extent_buffer *leaf,
3688 struct btrfs_dir_item *dir_item); 3752 struct btrfs_dir_item *dir_item);
3753struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
3754 struct btrfs_path *path,
3755 const char *name,
3756 int name_len);
3689 3757
3690/* orphan.c */ 3758/* orphan.c */
3691int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, 3759int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3857,6 +3925,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
3857 struct btrfs_trans_handle *trans, int mode, 3925 struct btrfs_trans_handle *trans, int mode,
3858 u64 start, u64 num_bytes, u64 min_size, 3926 u64 start, u64 num_bytes, u64 min_size,
3859 loff_t actual_len, u64 *alloc_hint); 3927 loff_t actual_len, u64 *alloc_hint);
3928int btrfs_inode_check_errors(struct inode *inode);
3860extern const struct dentry_operations btrfs_dentry_operations; 3929extern const struct dentry_operations btrfs_dentry_operations;
3861 3930
3862/* ioctl.c */ 3931/* ioctl.c */
@@ -3901,6 +3970,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
3901 struct page **pages, size_t num_pages, 3970 struct page **pages, size_t num_pages,
3902 loff_t pos, size_t write_bytes, 3971 loff_t pos, size_t write_bytes,
3903 struct extent_state **cached); 3972 struct extent_state **cached);
3973int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
3904 3974
3905/* tree-defrag.c */ 3975/* tree-defrag.c */
3906int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 3976int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4097,7 +4167,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
4097/* dev-replace.c */ 4167/* dev-replace.c */
4098void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); 4168void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
4099void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); 4169void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
4100void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); 4170void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
4171
4172static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
4173{
4174 btrfs_bio_counter_sub(fs_info, 1);
4175}
4101 4176
4102/* reada.c */ 4177/* reada.c */
4103struct reada_control { 4178struct reada_control {
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 6f662b34ba0e..ca6a3a3b6b6c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
316 struct btrfs_device *tgt_device = NULL; 316 struct btrfs_device *tgt_device = NULL;
317 struct btrfs_device *src_device = NULL; 317 struct btrfs_device *src_device = NULL;
318 318
319 if (btrfs_fs_incompat(fs_info, RAID56)) {
320 btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
321 return -EOPNOTSUPP;
322 }
323
324 switch (args->start.cont_reading_from_srcdev_mode) { 319 switch (args->start.cont_reading_from_srcdev_mode) {
325 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: 320 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
326 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: 321 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
@@ -422,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
422 &dev_replace->scrub_progress, 0, 1); 417 &dev_replace->scrub_progress, 0, 1);
423 418
424 ret = btrfs_dev_replace_finishing(root->fs_info, ret); 419 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
425 WARN_ON(ret); 420 /* don't warn if EINPROGRESS, someone else might be running scrub */
421 if (ret == -EINPROGRESS) {
422 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
423 ret = 0;
424 } else {
425 WARN_ON(ret);
426 }
426 427
427 return 0; 428 return ret;
428 429
429leave: 430leave:
430 dev_replace->srcdev = NULL; 431 dev_replace->srcdev = NULL;
@@ -542,7 +543,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
542 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 543 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
543 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 544 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
544 545
545 return 0; 546 return scrub_ret;
546 } 547 }
547 548
548 printk_in_rcu(KERN_INFO 549 printk_in_rcu(KERN_INFO
@@ -571,15 +572,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
571 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 572 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
572 fs_info->fs_devices->rw_devices++; 573 fs_info->fs_devices->rw_devices++;
573 574
574 /* replace the sysfs entry */
575 btrfs_kobj_rm_device(fs_info, src_device);
576 btrfs_kobj_add_device(fs_info, tgt_device);
577
578 btrfs_dev_replace_unlock(dev_replace); 575 btrfs_dev_replace_unlock(dev_replace);
579 576
580 btrfs_rm_dev_replace_blocked(fs_info); 577 btrfs_rm_dev_replace_blocked(fs_info);
581 578
582 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 579 btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
583 580
584 btrfs_rm_dev_replace_unblocked(fs_info); 581 btrfs_rm_dev_replace_unblocked(fs_info);
585 582
@@ -594,6 +591,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
594 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 591 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
595 mutex_unlock(&uuid_mutex); 592 mutex_unlock(&uuid_mutex);
596 593
594 /* replace the sysfs entry */
595 btrfs_kobj_rm_device(fs_info, src_device);
596 btrfs_kobj_add_device(fs_info, tgt_device);
597 btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
598
597 /* write back the superblocks */ 599 /* write back the superblocks */
598 trans = btrfs_start_transaction(root, 0); 600 trans = btrfs_start_transaction(root, 0);
599 if (!IS_ERR(trans)) 601 if (!IS_ERR(trans))
@@ -920,9 +922,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
920 percpu_counter_inc(&fs_info->bio_counter); 922 percpu_counter_inc(&fs_info->bio_counter);
921} 923}
922 924
923void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) 925void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
924{ 926{
925 percpu_counter_dec(&fs_info->bio_counter); 927 percpu_counter_sub(&fs_info->bio_counter, amount);
926 928
927 if (waitqueue_active(&fs_info->replace_wait)) 929 if (waitqueue_active(&fs_info->replace_wait))
928 wake_up(&fs_info->replace_wait); 930 wake_up(&fs_info->replace_wait);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df866e919..1752625fb4dd 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,10 +21,6 @@
21#include "hash.h" 21#include "hash.h"
22#include "transaction.h" 22#include "transaction.h"
23 23
24static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
25 struct btrfs_path *path,
26 const char *name, int name_len);
27
28/* 24/*
29 * insert a name into a directory, doing overflow properly if there is a hash 25 * insert a name into a directory, doing overflow properly if there is a hash
30 * collision. data_size indicates how big the item inserted should be. On 26 * collision. data_size indicates how big the item inserted should be. On
@@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
383 * this walks through all the entries in a dir item and finds one 379 * this walks through all the entries in a dir item and finds one
384 * for a specific name. 380 * for a specific name.
385 */ 381 */
386static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, 382struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
387 struct btrfs_path *path, 383 struct btrfs_path *path,
388 const char *name, int name_len) 384 const char *name, int name_len)
389{ 385{
390 struct btrfs_dir_item *dir_item; 386 struct btrfs_dir_item *dir_item;
391 unsigned long name_ptr; 387 unsigned long name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1bf9f897065d..30965120772b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb,
2384 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2384 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2385 init_waitqueue_head(&fs_info->async_submit_wait); 2385 init_waitqueue_head(&fs_info->async_submit_wait);
2386 2386
2387 INIT_LIST_HEAD(&fs_info->pinned_chunks);
2388
2387 ret = btrfs_alloc_stripe_hash_table(fs_info); 2389 ret = btrfs_alloc_stripe_hash_table(fs_info);
2388 if (ret) { 2390 if (ret) {
2389 err = ret; 2391 err = ret;
@@ -2830,9 +2832,11 @@ retry_root_backup:
2830 btrfs_set_opt(fs_info->mount_opt, SSD); 2832 btrfs_set_opt(fs_info->mount_opt, SSD);
2831 } 2833 }
2832 2834
2833 /* Set the real inode map cache flag */ 2835 /*
2834 if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) 2836 * Mount does not set all options immediatelly, we can do it now and do
2835 btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); 2837 * not have to wait for transaction commit
2838 */
2839 btrfs_apply_pending_changes(fs_info);
2836 2840
2837#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2841#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2838 if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { 2842 if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
@@ -3713,6 +3717,17 @@ void close_ctree(struct btrfs_root *root)
3713 3717
3714 btrfs_free_block_rsv(root, root->orphan_block_rsv); 3718 btrfs_free_block_rsv(root, root->orphan_block_rsv);
3715 root->orphan_block_rsv = NULL; 3719 root->orphan_block_rsv = NULL;
3720
3721 lock_chunks(root);
3722 while (!list_empty(&fs_info->pinned_chunks)) {
3723 struct extent_map *em;
3724
3725 em = list_first_entry(&fs_info->pinned_chunks,
3726 struct extent_map, list);
3727 list_del_init(&em->list);
3728 free_extent_map(em);
3729 }
3730 unlock_chunks(root);
3716} 3731}
3717 3732
3718int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 3733int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3839,12 +3854,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3839 */ 3854 */
3840 if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) 3855 if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
3841 printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", 3856 printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
3842 sb->root); 3857 btrfs_super_root(sb));
3843 if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) 3858 if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
3844 printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", 3859 printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
3845 sb->chunk_root); 3860 btrfs_super_chunk_root(sb));
3846 if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) 3861 if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
3847 printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", 3862 printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
3848 btrfs_super_log_root(sb)); 3863 btrfs_super_log_root(sb));
3849 3864
3850 if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { 3865 if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
@@ -4129,6 +4144,25 @@ again:
4129 return 0; 4144 return 0;
4130} 4145}
4131 4146
4147static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
4148 struct btrfs_fs_info *fs_info)
4149{
4150 struct btrfs_ordered_extent *ordered;
4151
4152 spin_lock(&fs_info->trans_lock);
4153 while (!list_empty(&cur_trans->pending_ordered)) {
4154 ordered = list_first_entry(&cur_trans->pending_ordered,
4155 struct btrfs_ordered_extent,
4156 trans_list);
4157 list_del_init(&ordered->trans_list);
4158 spin_unlock(&fs_info->trans_lock);
4159
4160 btrfs_put_ordered_extent(ordered);
4161 spin_lock(&fs_info->trans_lock);
4162 }
4163 spin_unlock(&fs_info->trans_lock);
4164}
4165
4132void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, 4166void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4133 struct btrfs_root *root) 4167 struct btrfs_root *root)
4134{ 4168{
@@ -4140,6 +4174,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4140 cur_trans->state = TRANS_STATE_UNBLOCKED; 4174 cur_trans->state = TRANS_STATE_UNBLOCKED;
4141 wake_up(&root->fs_info->transaction_wait); 4175 wake_up(&root->fs_info->transaction_wait);
4142 4176
4177 btrfs_free_pending_ordered(cur_trans, root->fs_info);
4143 btrfs_destroy_delayed_inodes(root); 4178 btrfs_destroy_delayed_inodes(root);
4144 btrfs_assert_delayed_root_empty(root); 4179 btrfs_assert_delayed_root_empty(root);
4145 4180
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47c1ba141082..222d6aea4a8a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache)
315 struct btrfs_caching_control *ctl; 315 struct btrfs_caching_control *ctl;
316 316
317 spin_lock(&cache->lock); 317 spin_lock(&cache->lock);
318 if (cache->cached != BTRFS_CACHE_STARTED) {
319 spin_unlock(&cache->lock);
320 return NULL;
321 }
322
323 /* We're loading it the fast way, so we don't have a caching_ctl. */
324 if (!cache->caching_ctl) { 318 if (!cache->caching_ctl) {
325 spin_unlock(&cache->lock); 319 spin_unlock(&cache->lock);
326 return NULL; 320 return NULL;
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
594 spin_unlock(&cache->lock); 588 spin_unlock(&cache->lock);
595 589
596 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 590 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
591 mutex_lock(&caching_ctl->mutex);
597 ret = load_free_space_cache(fs_info, cache); 592 ret = load_free_space_cache(fs_info, cache);
598 593
599 spin_lock(&cache->lock); 594 spin_lock(&cache->lock);
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
601 cache->caching_ctl = NULL; 596 cache->caching_ctl = NULL;
602 cache->cached = BTRFS_CACHE_FINISHED; 597 cache->cached = BTRFS_CACHE_FINISHED;
603 cache->last_byte_to_unpin = (u64)-1; 598 cache->last_byte_to_unpin = (u64)-1;
599 caching_ctl->progress = (u64)-1;
604 } else { 600 } else {
605 if (load_cache_only) { 601 if (load_cache_only) {
606 cache->caching_ctl = NULL; 602 cache->caching_ctl = NULL;
607 cache->cached = BTRFS_CACHE_NO; 603 cache->cached = BTRFS_CACHE_NO;
608 } else { 604 } else {
609 cache->cached = BTRFS_CACHE_STARTED; 605 cache->cached = BTRFS_CACHE_STARTED;
606 cache->has_caching_ctl = 1;
610 } 607 }
611 } 608 }
612 spin_unlock(&cache->lock); 609 spin_unlock(&cache->lock);
610 mutex_unlock(&caching_ctl->mutex);
611
613 wake_up(&caching_ctl->wait); 612 wake_up(&caching_ctl->wait);
614 if (ret == 1) { 613 if (ret == 1) {
615 put_caching_control(caching_ctl); 614 put_caching_control(caching_ctl);
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
627 cache->cached = BTRFS_CACHE_NO; 626 cache->cached = BTRFS_CACHE_NO;
628 } else { 627 } else {
629 cache->cached = BTRFS_CACHE_STARTED; 628 cache->cached = BTRFS_CACHE_STARTED;
629 cache->has_caching_ctl = 1;
630 } 630 }
631 spin_unlock(&cache->lock); 631 spin_unlock(&cache->lock);
632 wake_up(&caching_ctl->wait); 632 wake_up(&caching_ctl->wait);
@@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root,
3162 struct btrfs_block_group_cache *cache) 3162 struct btrfs_block_group_cache *cache)
3163{ 3163{
3164 struct rb_node *node; 3164 struct rb_node *node;
3165
3165 spin_lock(&root->fs_info->block_group_cache_lock); 3166 spin_lock(&root->fs_info->block_group_cache_lock);
3167
3168 /* If our block group was removed, we need a full search. */
3169 if (RB_EMPTY_NODE(&cache->cache_node)) {
3170 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3171
3172 spin_unlock(&root->fs_info->block_group_cache_lock);
3173 btrfs_put_block_group(cache);
3174 cache = btrfs_lookup_first_block_group(root->fs_info,
3175 next_bytenr);
3176 return cache;
3177 }
3166 node = rb_next(&cache->cache_node); 3178 node = rb_next(&cache->cache_node);
3167 btrfs_put_block_group(cache); 3179 btrfs_put_block_group(cache);
3168 if (node) { 3180 if (node) {
@@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3504 found->chunk_alloc = 0; 3516 found->chunk_alloc = 0;
3505 found->flush = 0; 3517 found->flush = 0;
3506 init_waitqueue_head(&found->wait); 3518 init_waitqueue_head(&found->wait);
3519 INIT_LIST_HEAD(&found->ro_bgs);
3507 3520
3508 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3521 ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3509 info->space_info_kobj, "%s", 3522 info->space_info_kobj, "%s",
@@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root,
5425 spin_unlock(&cache->space_info->lock); 5438 spin_unlock(&cache->space_info->lock);
5426 } else { 5439 } else {
5427 old_val -= num_bytes; 5440 old_val -= num_bytes;
5441 btrfs_set_block_group_used(&cache->item, old_val);
5442 cache->pinned += num_bytes;
5443 cache->space_info->bytes_pinned += num_bytes;
5444 cache->space_info->bytes_used -= num_bytes;
5445 cache->space_info->disk_used -= num_bytes * factor;
5446 spin_unlock(&cache->lock);
5447 spin_unlock(&cache->space_info->lock);
5428 5448
5449 set_extent_dirty(info->pinned_extents,
5450 bytenr, bytenr + num_bytes - 1,
5451 GFP_NOFS | __GFP_NOFAIL);
5429 /* 5452 /*
5430 * No longer have used bytes in this block group, queue 5453 * No longer have used bytes in this block group, queue
5431 * it for deletion. 5454 * it for deletion.
@@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root,
5439 } 5462 }
5440 spin_unlock(&info->unused_bgs_lock); 5463 spin_unlock(&info->unused_bgs_lock);
5441 } 5464 }
5442 btrfs_set_block_group_used(&cache->item, old_val);
5443 cache->pinned += num_bytes;
5444 cache->space_info->bytes_pinned += num_bytes;
5445 cache->space_info->bytes_used -= num_bytes;
5446 cache->space_info->disk_used -= num_bytes * factor;
5447 spin_unlock(&cache->lock);
5448 spin_unlock(&cache->space_info->lock);
5449
5450 set_extent_dirty(info->pinned_extents,
5451 bytenr, bytenr + num_bytes - 1,
5452 GFP_NOFS | __GFP_NOFAIL);
5453 } 5465 }
5454 btrfs_put_block_group(cache); 5466 btrfs_put_block_group(cache);
5455 total -= num_bytes; 5467 total -= num_bytes;
@@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
8511 min_allocable_bytes <= sinfo->total_bytes) { 8523 min_allocable_bytes <= sinfo->total_bytes) {
8512 sinfo->bytes_readonly += num_bytes; 8524 sinfo->bytes_readonly += num_bytes;
8513 cache->ro = 1; 8525 cache->ro = 1;
8526 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
8514 ret = 0; 8527 ret = 0;
8515 } 8528 }
8516out: 8529out:
@@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8565 8578
8566/* 8579/*
8567 * helper to account the unused space of all the readonly block group in the 8580 * helper to account the unused space of all the readonly block group in the
8568 * list. takes mirrors into account. 8581 * space_info. takes mirrors into account.
8569 */ 8582 */
8570static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) 8583u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8571{ 8584{
8572 struct btrfs_block_group_cache *block_group; 8585 struct btrfs_block_group_cache *block_group;
8573 u64 free_bytes = 0; 8586 u64 free_bytes = 0;
8574 int factor; 8587 int factor;
8575 8588
8576 list_for_each_entry(block_group, groups_list, list) { 8589 /* It's df, we don't care if it's racey */
8590 if (list_empty(&sinfo->ro_bgs))
8591 return 0;
8592
8593 spin_lock(&sinfo->lock);
8594 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
8577 spin_lock(&block_group->lock); 8595 spin_lock(&block_group->lock);
8578 8596
8579 if (!block_group->ro) { 8597 if (!block_group->ro) {
@@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8594 8612
8595 spin_unlock(&block_group->lock); 8613 spin_unlock(&block_group->lock);
8596 } 8614 }
8597
8598 return free_bytes;
8599}
8600
8601/*
8602 * helper to account the unused space of all the readonly block group in the
8603 * space_info. takes mirrors into account.
8604 */
8605u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8606{
8607 int i;
8608 u64 free_bytes = 0;
8609
8610 spin_lock(&sinfo->lock);
8611
8612 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8613 if (!list_empty(&sinfo->block_groups[i]))
8614 free_bytes += __btrfs_get_ro_block_group_free_space(
8615 &sinfo->block_groups[i]);
8616
8617 spin_unlock(&sinfo->lock); 8615 spin_unlock(&sinfo->lock);
8618 8616
8619 return free_bytes; 8617 return free_bytes;
@@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
8633 cache->bytes_super - btrfs_block_group_used(&cache->item); 8631 cache->bytes_super - btrfs_block_group_used(&cache->item);
8634 sinfo->bytes_readonly -= num_bytes; 8632 sinfo->bytes_readonly -= num_bytes;
8635 cache->ro = 0; 8633 cache->ro = 0;
8634 list_del_init(&cache->ro_list);
8636 spin_unlock(&cache->lock); 8635 spin_unlock(&cache->lock);
8637 spin_unlock(&sinfo->lock); 8636 spin_unlock(&sinfo->lock);
8638} 8637}
@@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
9002 INIT_LIST_HEAD(&cache->list); 9001 INIT_LIST_HEAD(&cache->list);
9003 INIT_LIST_HEAD(&cache->cluster_list); 9002 INIT_LIST_HEAD(&cache->cluster_list);
9004 INIT_LIST_HEAD(&cache->bg_list); 9003 INIT_LIST_HEAD(&cache->bg_list);
9004 INIT_LIST_HEAD(&cache->ro_list);
9005 btrfs_init_free_space_ctl(cache); 9005 btrfs_init_free_space_ctl(cache);
9006 atomic_set(&cache->trimming, 0);
9006 9007
9007 return cache; 9008 return cache;
9008} 9009}
@@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9195 int ret = 0; 9196 int ret = 0;
9196 9197
9197 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 9198 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
9198 list_del_init(&block_group->bg_list);
9199 if (ret) 9199 if (ret)
9200 continue; 9200 goto next;
9201 9201
9202 spin_lock(&block_group->lock); 9202 spin_lock(&block_group->lock);
9203 memcpy(&item, &block_group->item, sizeof(item)); 9203 memcpy(&item, &block_group->item, sizeof(item));
@@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9212 key.objectid, key.offset); 9212 key.objectid, key.offset);
9213 if (ret) 9213 if (ret)
9214 btrfs_abort_transaction(trans, extent_root, ret); 9214 btrfs_abort_transaction(trans, extent_root, ret);
9215next:
9216 list_del_init(&block_group->bg_list);
9215 } 9217 }
9216} 9218}
9217 9219
@@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
9304} 9306}
9305 9307
9306int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 9308int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9307 struct btrfs_root *root, u64 group_start) 9309 struct btrfs_root *root, u64 group_start,
9310 struct extent_map *em)
9308{ 9311{
9309 struct btrfs_path *path; 9312 struct btrfs_path *path;
9310 struct btrfs_block_group_cache *block_group; 9313 struct btrfs_block_group_cache *block_group;
@@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9316 int ret; 9319 int ret;
9317 int index; 9320 int index;
9318 int factor; 9321 int factor;
9322 struct btrfs_caching_control *caching_ctl = NULL;
9323 bool remove_em;
9319 9324
9320 root = root->fs_info->extent_root; 9325 root = root->fs_info->extent_root;
9321 9326
@@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9400 spin_lock(&root->fs_info->block_group_cache_lock); 9405 spin_lock(&root->fs_info->block_group_cache_lock);
9401 rb_erase(&block_group->cache_node, 9406 rb_erase(&block_group->cache_node,
9402 &root->fs_info->block_group_cache_tree); 9407 &root->fs_info->block_group_cache_tree);
9408 RB_CLEAR_NODE(&block_group->cache_node);
9403 9409
9404 if (root->fs_info->first_logical_byte == block_group->key.objectid) 9410 if (root->fs_info->first_logical_byte == block_group->key.objectid)
9405 root->fs_info->first_logical_byte = (u64)-1; 9411 root->fs_info->first_logical_byte = (u64)-1;
@@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9411 * are still on the list after taking the semaphore 9417 * are still on the list after taking the semaphore
9412 */ 9418 */
9413 list_del_init(&block_group->list); 9419 list_del_init(&block_group->list);
9420 list_del_init(&block_group->ro_list);
9414 if (list_empty(&block_group->space_info->block_groups[index])) { 9421 if (list_empty(&block_group->space_info->block_groups[index])) {
9415 kobj = block_group->space_info->block_group_kobjs[index]; 9422 kobj = block_group->space_info->block_group_kobjs[index];
9416 block_group->space_info->block_group_kobjs[index] = NULL; 9423 block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9422 kobject_put(kobj); 9429 kobject_put(kobj);
9423 } 9430 }
9424 9431
9432 if (block_group->has_caching_ctl)
9433 caching_ctl = get_caching_control(block_group);
9425 if (block_group->cached == BTRFS_CACHE_STARTED) 9434 if (block_group->cached == BTRFS_CACHE_STARTED)
9426 wait_block_group_cache_done(block_group); 9435 wait_block_group_cache_done(block_group);
9436 if (block_group->has_caching_ctl) {
9437 down_write(&root->fs_info->commit_root_sem);
9438 if (!caching_ctl) {
9439 struct btrfs_caching_control *ctl;
9440
9441 list_for_each_entry(ctl,
9442 &root->fs_info->caching_block_groups, list)
9443 if (ctl->block_group == block_group) {
9444 caching_ctl = ctl;
9445 atomic_inc(&caching_ctl->count);
9446 break;
9447 }
9448 }
9449 if (caching_ctl)
9450 list_del_init(&caching_ctl->list);
9451 up_write(&root->fs_info->commit_root_sem);
9452 if (caching_ctl) {
9453 /* Once for the caching bgs list and once for us. */
9454 put_caching_control(caching_ctl);
9455 put_caching_control(caching_ctl);
9456 }
9457 }
9427 9458
9428 btrfs_remove_free_space_cache(block_group); 9459 btrfs_remove_free_space_cache(block_group);
9429 9460
@@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9435 9466
9436 memcpy(&key, &block_group->key, sizeof(key)); 9467 memcpy(&key, &block_group->key, sizeof(key));
9437 9468
9469 lock_chunks(root);
9470 if (!list_empty(&em->list)) {
9471 /* We're in the transaction->pending_chunks list. */
9472 free_extent_map(em);
9473 }
9474 spin_lock(&block_group->lock);
9475 block_group->removed = 1;
9476 /*
9477 * At this point trimming can't start on this block group, because we
9478 * removed the block group from the tree fs_info->block_group_cache_tree
9479 * so no one can't find it anymore and even if someone already got this
9480 * block group before we removed it from the rbtree, they have already
9481 * incremented block_group->trimming - if they didn't, they won't find
9482 * any free space entries because we already removed them all when we
9483 * called btrfs_remove_free_space_cache().
9484 *
9485 * And we must not remove the extent map from the fs_info->mapping_tree
9486 * to prevent the same logical address range and physical device space
9487 * ranges from being reused for a new block group. This is because our
9488 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
9489 * completely transactionless, so while it is trimming a range the
9490 * currently running transaction might finish and a new one start,
9491 * allowing for new block groups to be created that can reuse the same
9492 * physical device locations unless we take this special care.
9493 */
9494 remove_em = (atomic_read(&block_group->trimming) == 0);
9495 /*
9496 * Make sure a trimmer task always sees the em in the pinned_chunks list
9497 * if it sees block_group->removed == 1 (needs to lock block_group->lock
9498 * before checking block_group->removed).
9499 */
9500 if (!remove_em) {
9501 /*
9502 * Our em might be in trans->transaction->pending_chunks which
9503 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
9504 * and so is the fs_info->pinned_chunks list.
9505 *
9506 * So at this point we must be holding the chunk_mutex to avoid
9507 * any races with chunk allocation (more specifically at
9508 * volumes.c:contains_pending_extent()), to ensure it always
9509 * sees the em, either in the pending_chunks list or in the
9510 * pinned_chunks list.
9511 */
9512 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
9513 }
9514 spin_unlock(&block_group->lock);
9515
9516 if (remove_em) {
9517 struct extent_map_tree *em_tree;
9518
9519 em_tree = &root->fs_info->mapping_tree.map_tree;
9520 write_lock(&em_tree->lock);
9521 /*
9522 * The em might be in the pending_chunks list, so make sure the
9523 * chunk mutex is locked, since remove_extent_mapping() will
9524 * delete us from that list.
9525 */
9526 remove_extent_mapping(em_tree, em);
9527 write_unlock(&em_tree->lock);
9528 /* once for the tree */
9529 free_extent_map(em);
9530 }
9531
9532 unlock_chunks(root);
9533
9438 btrfs_put_block_group(block_group); 9534 btrfs_put_block_group(block_group);
9439 btrfs_put_block_group(block_group); 9535 btrfs_put_block_group(block_group);
9440 9536
@@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9523 */ 9619 */
9524 start = block_group->key.objectid; 9620 start = block_group->key.objectid;
9525 end = start + block_group->key.offset - 1; 9621 end = start + block_group->key.offset - 1;
9526 clear_extent_bits(&fs_info->freed_extents[0], start, end, 9622 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
9527 EXTENT_DIRTY, GFP_NOFS); 9623 EXTENT_DIRTY, GFP_NOFS);
9528 clear_extent_bits(&fs_info->freed_extents[1], start, end, 9624 if (ret) {
9625 btrfs_set_block_group_rw(root, block_group);
9626 goto end_trans;
9627 }
9628 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
9529 EXTENT_DIRTY, GFP_NOFS); 9629 EXTENT_DIRTY, GFP_NOFS);
9630 if (ret) {
9631 btrfs_set_block_group_rw(root, block_group);
9632 goto end_trans;
9633 }
9530 9634
9531 /* Reset pinned so btrfs_put_block_group doesn't complain */ 9635 /* Reset pinned so btrfs_put_block_group doesn't complain */
9532 block_group->pinned = 0; 9636 block_group->pinned = 0;
@@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9537 */ 9641 */
9538 ret = btrfs_remove_chunk(trans, root, 9642 ret = btrfs_remove_chunk(trans, root,
9539 block_group->key.objectid); 9643 block_group->key.objectid);
9644end_trans:
9540 btrfs_end_transaction(trans, root); 9645 btrfs_end_transaction(trans, root);
9541next: 9646next:
9542 btrfs_put_block_group(block_group); 9647 btrfs_put_block_group(block_group);
@@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
9657} 9762}
9658 9763
9659/* 9764/*
9660 * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), 9765 * btrfs_{start,end}_write_no_snapshoting() are similar to
9661 * they are used to prevent the some tasks writing data into the page cache 9766 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
9662 * by nocow before the subvolume is snapshoted, but flush the data into 9767 * data into the page cache through nocow before the subvolume is snapshoted,
9663 * the disk after the snapshot creation. 9768 * but flush the data into disk after the snapshot creation, or to prevent
9769 * operations while snapshoting is ongoing and that cause the snapshot to be
9770 * inconsistent (writes followed by expanding truncates for example).
9664 */ 9771 */
9665void btrfs_end_nocow_write(struct btrfs_root *root) 9772void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
9666{ 9773{
9667 percpu_counter_dec(&root->subv_writers->counter); 9774 percpu_counter_dec(&root->subv_writers->counter);
9668 /* 9775 /*
@@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
9674 wake_up(&root->subv_writers->wait); 9781 wake_up(&root->subv_writers->wait);
9675} 9782}
9676 9783
9677int btrfs_start_nocow_write(struct btrfs_root *root) 9784int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
9678{ 9785{
9679 if (atomic_read(&root->will_be_snapshoted)) 9786 if (atomic_read(&root->will_be_snapshoted))
9680 return 0; 9787 return 0;
@@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
9685 */ 9792 */
9686 smp_mb(); 9793 smp_mb();
9687 if (atomic_read(&root->will_be_snapshoted)) { 9794 if (atomic_read(&root->will_be_snapshoted)) {
9688 btrfs_end_nocow_write(root); 9795 btrfs_end_write_no_snapshoting(root);
9689 return 0; 9796 return 0;
9690 } 9797 }
9691 return 1; 9798 return 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bf3f424e0013..4ebabd237153 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
595 clear = 1; 595 clear = 1;
596again: 596again:
597 if (!prealloc && (mask & __GFP_WAIT)) { 597 if (!prealloc && (mask & __GFP_WAIT)) {
598 /*
599 * Don't care for allocation failure here because we might end
600 * up not needing the pre-allocated extent state at all, which
601 * is the case if we only have in the tree extent states that
602 * cover our input range and don't cover too any other range.
603 * If we end up needing a new extent state we allocate it later.
604 */
598 prealloc = alloc_extent_state(mask); 605 prealloc = alloc_extent_state(mask);
599 if (!prealloc)
600 return -ENOMEM;
601 } 606 }
602 607
603 spin_lock(&tree->lock); 608 spin_lock(&tree->lock);
@@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree,
796 state->state |= bits_to_set; 801 state->state |= bits_to_set;
797} 802}
798 803
799static void cache_state(struct extent_state *state, 804static void cache_state_if_flags(struct extent_state *state,
800 struct extent_state **cached_ptr) 805 struct extent_state **cached_ptr,
806 const u64 flags)
801{ 807{
802 if (cached_ptr && !(*cached_ptr)) { 808 if (cached_ptr && !(*cached_ptr)) {
803 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 809 if (!flags || (state->state & flags)) {
804 *cached_ptr = state; 810 *cached_ptr = state;
805 atomic_inc(&state->refs); 811 atomic_inc(&state->refs);
806 } 812 }
807 } 813 }
808} 814}
809 815
816static void cache_state(struct extent_state *state,
817 struct extent_state **cached_ptr)
818{
819 return cache_state_if_flags(state, cached_ptr,
820 EXTENT_IOBITS | EXTENT_BOUNDARY);
821}
822
810/* 823/*
811 * set some bits on a range in the tree. This may require allocations or 824 * set some bits on a range in the tree. This may require allocations or
812 * sleeping, so the gfp mask is used to indicate what is allowed. 825 * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1058 int err = 0; 1071 int err = 0;
1059 u64 last_start; 1072 u64 last_start;
1060 u64 last_end; 1073 u64 last_end;
1074 bool first_iteration = true;
1061 1075
1062 btrfs_debug_check_extent_io_range(tree, start, end); 1076 btrfs_debug_check_extent_io_range(tree, start, end);
1063 1077
1064again: 1078again:
1065 if (!prealloc && (mask & __GFP_WAIT)) { 1079 if (!prealloc && (mask & __GFP_WAIT)) {
1080 /*
1081 * Best effort, don't worry if extent state allocation fails
1082 * here for the first iteration. We might have a cached state
1083 * that matches exactly the target range, in which case no
1084 * extent state allocations are needed. We'll only know this
1085 * after locking the tree.
1086 */
1066 prealloc = alloc_extent_state(mask); 1087 prealloc = alloc_extent_state(mask);
1067 if (!prealloc) 1088 if (!prealloc && !first_iteration)
1068 return -ENOMEM; 1089 return -ENOMEM;
1069 } 1090 }
1070 1091
@@ -1234,6 +1255,7 @@ search_again:
1234 spin_unlock(&tree->lock); 1255 spin_unlock(&tree->lock);
1235 if (mask & __GFP_WAIT) 1256 if (mask & __GFP_WAIT)
1236 cond_resched(); 1257 cond_resched();
1258 first_iteration = false;
1237 goto again; 1259 goto again;
1238} 1260}
1239 1261
@@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1482 state = find_first_extent_bit_state(tree, start, bits); 1504 state = find_first_extent_bit_state(tree, start, bits);
1483got_it: 1505got_it:
1484 if (state) { 1506 if (state) {
1485 cache_state(state, cached_state); 1507 cache_state_if_flags(state, cached_state, 0);
1486 *start_ret = state->start; 1508 *start_ret = state->start;
1487 *end_ret = state->end; 1509 *end_ret = state->end;
1488 ret = 0; 1510 ret = 0;
@@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1746 if (page_ops == 0) 1768 if (page_ops == 0)
1747 return 0; 1769 return 0;
1748 1770
1771 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1772 mapping_set_error(inode->i_mapping, -EIO);
1773
1749 while (nr_pages > 0) { 1774 while (nr_pages > 0) {
1750 ret = find_get_pages_contig(inode->i_mapping, index, 1775 ret = find_get_pages_contig(inode->i_mapping, index,
1751 min_t(unsigned long, 1776 min_t(unsigned long,
@@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1763 clear_page_dirty_for_io(pages[i]); 1788 clear_page_dirty_for_io(pages[i]);
1764 if (page_ops & PAGE_SET_WRITEBACK) 1789 if (page_ops & PAGE_SET_WRITEBACK)
1765 set_page_writeback(pages[i]); 1790 set_page_writeback(pages[i]);
1791 if (page_ops & PAGE_SET_ERROR)
1792 SetPageError(pages[i]);
1766 if (page_ops & PAGE_END_WRITEBACK) 1793 if (page_ops & PAGE_END_WRITEBACK)
1767 end_page_writeback(pages[i]); 1794 end_page_writeback(pages[i]);
1768 if (page_ops & PAGE_UNLOCK) 1795 if (page_ops & PAGE_UNLOCK)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6d4b938be986..ece9ce87edff 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -49,6 +49,7 @@
49#define PAGE_SET_WRITEBACK (1 << 2) 49#define PAGE_SET_WRITEBACK (1 << 2)
50#define PAGE_END_WRITEBACK (1 << 3) 50#define PAGE_END_WRITEBACK (1 << 3)
51#define PAGE_SET_PRIVATE2 (1 << 4) 51#define PAGE_SET_PRIVATE2 (1 << 4)
52#define PAGE_SET_ERROR (1 << 5)
52 53
53/* 54/*
54 * page->private values. Every page that is controlled by the extent 55 * page->private values. Every page that is controlled by the extent
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 225302b39afb..6a98bddd8f33 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
287 if (!em) 287 if (!em)
288 goto out; 288 goto out;
289 289
290 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
291 list_move(&em->list, &tree->modified_extents);
292 em->generation = gen; 290 em->generation = gen;
293 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 291 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
294 em->mod_start = em->start; 292 em->mod_start = em->start;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a18ceabd99a8..e4090259569b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1428 u64 num_bytes; 1428 u64 num_bytes;
1429 int ret; 1429 int ret;
1430 1430
1431 ret = btrfs_start_nocow_write(root); 1431 ret = btrfs_start_write_no_snapshoting(root);
1432 if (!ret) 1432 if (!ret)
1433 return -ENOSPC; 1433 return -ENOSPC;
1434 1434
@@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1451 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); 1451 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1452 if (ret <= 0) { 1452 if (ret <= 0) {
1453 ret = 0; 1453 ret = 0;
1454 btrfs_end_nocow_write(root); 1454 btrfs_end_write_no_snapshoting(root);
1455 } else { 1455 } else {
1456 *write_bytes = min_t(size_t, *write_bytes , 1456 *write_bytes = min_t(size_t, *write_bytes ,
1457 num_bytes - pos + lockstart); 1457 num_bytes - pos + lockstart);
@@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1543 btrfs_free_reserved_data_space(inode, 1543 btrfs_free_reserved_data_space(inode,
1544 reserve_bytes); 1544 reserve_bytes);
1545 else 1545 else
1546 btrfs_end_nocow_write(root); 1546 btrfs_end_write_no_snapshoting(root);
1547 break; 1547 break;
1548 } 1548 }
1549 1549
@@ -1632,7 +1632,7 @@ again:
1632 1632
1633 release_bytes = 0; 1633 release_bytes = 0;
1634 if (only_release_metadata) 1634 if (only_release_metadata)
1635 btrfs_end_nocow_write(root); 1635 btrfs_end_write_no_snapshoting(root);
1636 1636
1637 if (only_release_metadata && copied > 0) { 1637 if (only_release_metadata && copied > 0) {
1638 u64 lockstart = round_down(pos, root->sectorsize); 1638 u64 lockstart = round_down(pos, root->sectorsize);
@@ -1661,7 +1661,7 @@ again:
1661 1661
1662 if (release_bytes) { 1662 if (release_bytes) {
1663 if (only_release_metadata) { 1663 if (only_release_metadata) {
1664 btrfs_end_nocow_write(root); 1664 btrfs_end_write_no_snapshoting(root);
1665 btrfs_delalloc_release_metadata(inode, release_bytes); 1665 btrfs_delalloc_release_metadata(inode, release_bytes);
1666 } else { 1666 } else {
1667 btrfs_delalloc_release_space(inode, release_bytes); 1667 btrfs_delalloc_release_space(inode, release_bytes);
@@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1676 loff_t pos) 1676 loff_t pos)
1677{ 1677{
1678 struct file *file = iocb->ki_filp; 1678 struct file *file = iocb->ki_filp;
1679 struct inode *inode = file_inode(file);
1679 ssize_t written; 1680 ssize_t written;
1680 ssize_t written_buffered; 1681 ssize_t written_buffered;
1681 loff_t endbyte; 1682 loff_t endbyte;
@@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1692 err = written_buffered; 1693 err = written_buffered;
1693 goto out; 1694 goto out;
1694 } 1695 }
1696 /*
1697 * Ensure all data is persisted. We want the next direct IO read to be
1698 * able to read what was just written.
1699 */
1695 endbyte = pos + written_buffered - 1; 1700 endbyte = pos + written_buffered - 1;
1696 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 1701 err = btrfs_fdatawrite_range(inode, pos, endbyte);
1702 if (err)
1703 goto out;
1704 err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1697 if (err) 1705 if (err)
1698 goto out; 1706 goto out;
1699 written += written_buffered; 1707 written += written_buffered;
@@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1854 int ret; 1862 int ret;
1855 1863
1856 atomic_inc(&BTRFS_I(inode)->sync_writers); 1864 atomic_inc(&BTRFS_I(inode)->sync_writers);
1857 ret = filemap_fdatawrite_range(inode->i_mapping, start, end); 1865 ret = btrfs_fdatawrite_range(inode, start, end);
1858 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1859 &BTRFS_I(inode)->runtime_flags))
1860 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1861 atomic_dec(&BTRFS_I(inode)->sync_writers); 1866 atomic_dec(&BTRFS_I(inode)->sync_writers);
1862 1867
1863 return ret; 1868 return ret;
@@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void)
2810 2815
2811 return 0; 2816 return 0;
2812} 2817}
2818
2819int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
2820{
2821 int ret;
2822
2823 /*
2824 * So with compression we will find and lock a dirty page and clear the
2825 * first one as dirty, setup an async extent, and immediately return
2826 * with the entire range locked but with nobody actually marked with
2827 * writeback. So we can't just filemap_write_and_wait_range() and
2828 * expect it to work since it will just kick off a thread to do the
2829 * actual work. So we need to call filemap_fdatawrite_range _again_
2830 * since it will wait on the page lock, which won't be unlocked until
2831 * after the pages have been marked as writeback and so we're good to go
2832 * from there. We have to do this otherwise we'll miss the ordered
2833 * extents and that results in badness. Please Josef, do not think you
2834 * know better and pull this out at some point in the future, it is
2835 * right and you are wrong.
2836 */
2837 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2838 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
2839 &BTRFS_I(inode)->runtime_flags))
2840 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2841
2842 return ret;
2843}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 33848196550e..030847bf7cec 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -27,10 +27,17 @@
27#include "disk-io.h" 27#include "disk-io.h"
28#include "extent_io.h" 28#include "extent_io.h"
29#include "inode-map.h" 29#include "inode-map.h"
30#include "volumes.h"
30 31
31#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 32#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
32#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 33#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
33 34
35struct btrfs_trim_range {
36 u64 start;
37 u64 bytes;
38 struct list_head list;
39};
40
34static int link_free_space(struct btrfs_free_space_ctl *ctl, 41static int link_free_space(struct btrfs_free_space_ctl *ctl,
35 struct btrfs_free_space *info); 42 struct btrfs_free_space *info);
36static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 43static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -881,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
881 int ret; 888 int ret;
882 struct btrfs_free_cluster *cluster = NULL; 889 struct btrfs_free_cluster *cluster = NULL;
883 struct rb_node *node = rb_first(&ctl->free_space_offset); 890 struct rb_node *node = rb_first(&ctl->free_space_offset);
891 struct btrfs_trim_range *trim_entry;
884 892
885 /* Get the cluster for this block_group if it exists */ 893 /* Get the cluster for this block_group if it exists */
886 if (block_group && !list_empty(&block_group->cluster_list)) { 894 if (block_group && !list_empty(&block_group->cluster_list)) {
@@ -916,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
916 cluster = NULL; 924 cluster = NULL;
917 } 925 }
918 } 926 }
927
928 /*
929 * Make sure we don't miss any range that was removed from our rbtree
930 * because trimming is running. Otherwise after a umount+mount (or crash
931 * after committing the transaction) we would leak free space and get
932 * an inconsistent free space cache report from fsck.
933 */
934 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
935 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
936 trim_entry->bytes, NULL);
937 if (ret)
938 goto fail;
939 *entries += 1;
940 }
941
919 return 0; 942 return 0;
920fail: 943fail:
921 return -ENOSPC; 944 return -ENOSPC;
@@ -1135,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1135 1158
1136 io_ctl_set_generation(&io_ctl, trans->transid); 1159 io_ctl_set_generation(&io_ctl, trans->transid);
1137 1160
1161 mutex_lock(&ctl->cache_writeout_mutex);
1138 /* Write out the extent entries in the free space cache */ 1162 /* Write out the extent entries in the free space cache */
1139 ret = write_cache_extent_entries(&io_ctl, ctl, 1163 ret = write_cache_extent_entries(&io_ctl, ctl,
1140 block_group, &entries, &bitmaps, 1164 block_group, &entries, &bitmaps,
1141 &bitmap_list); 1165 &bitmap_list);
1142 if (ret) 1166 if (ret) {
1167 mutex_unlock(&ctl->cache_writeout_mutex);
1143 goto out_nospc; 1168 goto out_nospc;
1169 }
1144 1170
1145 /* 1171 /*
1146 * Some spaces that are freed in the current transaction are pinned, 1172 * Some spaces that are freed in the current transaction are pinned,
@@ -1148,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1148 * committed, we shouldn't lose them. 1174 * committed, we shouldn't lose them.
1149 */ 1175 */
1150 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); 1176 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
1151 if (ret) 1177 if (ret) {
1178 mutex_unlock(&ctl->cache_writeout_mutex);
1152 goto out_nospc; 1179 goto out_nospc;
1180 }
1153 1181
1154 /* At last, we write out all the bitmaps. */ 1182 /*
1183 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1184 * locked while doing it because a concurrent trim can be manipulating
1185 * or freeing the bitmap.
1186 */
1155 ret = write_bitmap_entries(&io_ctl, &bitmap_list); 1187 ret = write_bitmap_entries(&io_ctl, &bitmap_list);
1188 mutex_unlock(&ctl->cache_writeout_mutex);
1156 if (ret) 1189 if (ret)
1157 goto out_nospc; 1190 goto out_nospc;
1158 1191
@@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2295 ctl->start = block_group->key.objectid; 2328 ctl->start = block_group->key.objectid;
2296 ctl->private = block_group; 2329 ctl->private = block_group;
2297 ctl->op = &free_space_op; 2330 ctl->op = &free_space_op;
2331 INIT_LIST_HEAD(&ctl->trimming_ranges);
2332 mutex_init(&ctl->cache_writeout_mutex);
2298 2333
2299 /* 2334 /*
2300 * we only want to have 32k of ram per block group for keeping 2335 * we only want to have 32k of ram per block group for keeping
@@ -2911,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2911 2946
2912static int do_trimming(struct btrfs_block_group_cache *block_group, 2947static int do_trimming(struct btrfs_block_group_cache *block_group,
2913 u64 *total_trimmed, u64 start, u64 bytes, 2948 u64 *total_trimmed, u64 start, u64 bytes,
2914 u64 reserved_start, u64 reserved_bytes) 2949 u64 reserved_start, u64 reserved_bytes,
2950 struct btrfs_trim_range *trim_entry)
2915{ 2951{
2916 struct btrfs_space_info *space_info = block_group->space_info; 2952 struct btrfs_space_info *space_info = block_group->space_info;
2917 struct btrfs_fs_info *fs_info = block_group->fs_info; 2953 struct btrfs_fs_info *fs_info = block_group->fs_info;
2954 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2918 int ret; 2955 int ret;
2919 int update = 0; 2956 int update = 0;
2920 u64 trimmed = 0; 2957 u64 trimmed = 0;
@@ -2934,7 +2971,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
2934 if (!ret) 2971 if (!ret)
2935 *total_trimmed += trimmed; 2972 *total_trimmed += trimmed;
2936 2973
2974 mutex_lock(&ctl->cache_writeout_mutex);
2937 btrfs_add_free_space(block_group, reserved_start, reserved_bytes); 2975 btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2976 list_del(&trim_entry->list);
2977 mutex_unlock(&ctl->cache_writeout_mutex);
2938 2978
2939 if (update) { 2979 if (update) {
2940 spin_lock(&space_info->lock); 2980 spin_lock(&space_info->lock);
@@ -2962,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2962 u64 bytes; 3002 u64 bytes;
2963 3003
2964 while (start < end) { 3004 while (start < end) {
3005 struct btrfs_trim_range trim_entry;
3006
3007 mutex_lock(&ctl->cache_writeout_mutex);
2965 spin_lock(&ctl->tree_lock); 3008 spin_lock(&ctl->tree_lock);
2966 3009
2967 if (ctl->free_space < minlen) { 3010 if (ctl->free_space < minlen) {
2968 spin_unlock(&ctl->tree_lock); 3011 spin_unlock(&ctl->tree_lock);
3012 mutex_unlock(&ctl->cache_writeout_mutex);
2969 break; 3013 break;
2970 } 3014 }
2971 3015
2972 entry = tree_search_offset(ctl, start, 0, 1); 3016 entry = tree_search_offset(ctl, start, 0, 1);
2973 if (!entry) { 3017 if (!entry) {
2974 spin_unlock(&ctl->tree_lock); 3018 spin_unlock(&ctl->tree_lock);
3019 mutex_unlock(&ctl->cache_writeout_mutex);
2975 break; 3020 break;
2976 } 3021 }
2977 3022
@@ -2980,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2980 node = rb_next(&entry->offset_index); 3025 node = rb_next(&entry->offset_index);
2981 if (!node) { 3026 if (!node) {
2982 spin_unlock(&ctl->tree_lock); 3027 spin_unlock(&ctl->tree_lock);
3028 mutex_unlock(&ctl->cache_writeout_mutex);
2983 goto out; 3029 goto out;
2984 } 3030 }
2985 entry = rb_entry(node, struct btrfs_free_space, 3031 entry = rb_entry(node, struct btrfs_free_space,
@@ -2988,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2988 3034
2989 if (entry->offset >= end) { 3035 if (entry->offset >= end) {
2990 spin_unlock(&ctl->tree_lock); 3036 spin_unlock(&ctl->tree_lock);
3037 mutex_unlock(&ctl->cache_writeout_mutex);
2991 break; 3038 break;
2992 } 3039 }
2993 3040
@@ -2997,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2997 bytes = min(extent_start + extent_bytes, end) - start; 3044 bytes = min(extent_start + extent_bytes, end) - start;
2998 if (bytes < minlen) { 3045 if (bytes < minlen) {
2999 spin_unlock(&ctl->tree_lock); 3046 spin_unlock(&ctl->tree_lock);
3047 mutex_unlock(&ctl->cache_writeout_mutex);
3000 goto next; 3048 goto next;
3001 } 3049 }
3002 3050
@@ -3004,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
3004 kmem_cache_free(btrfs_free_space_cachep, entry); 3052 kmem_cache_free(btrfs_free_space_cachep, entry);
3005 3053
3006 spin_unlock(&ctl->tree_lock); 3054 spin_unlock(&ctl->tree_lock);
3055 trim_entry.start = extent_start;
3056 trim_entry.bytes = extent_bytes;
3057 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3058 mutex_unlock(&ctl->cache_writeout_mutex);
3007 3059
3008 ret = do_trimming(block_group, total_trimmed, start, bytes, 3060 ret = do_trimming(block_group, total_trimmed, start, bytes,
3009 extent_start, extent_bytes); 3061 extent_start, extent_bytes, &trim_entry);
3010 if (ret) 3062 if (ret)
3011 break; 3063 break;
3012next: 3064next:
@@ -3035,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3035 3087
3036 while (offset < end) { 3088 while (offset < end) {
3037 bool next_bitmap = false; 3089 bool next_bitmap = false;
3090 struct btrfs_trim_range trim_entry;
3038 3091
3092 mutex_lock(&ctl->cache_writeout_mutex);
3039 spin_lock(&ctl->tree_lock); 3093 spin_lock(&ctl->tree_lock);
3040 3094
3041 if (ctl->free_space < minlen) { 3095 if (ctl->free_space < minlen) {
3042 spin_unlock(&ctl->tree_lock); 3096 spin_unlock(&ctl->tree_lock);
3097 mutex_unlock(&ctl->cache_writeout_mutex);
3043 break; 3098 break;
3044 } 3099 }
3045 3100
3046 entry = tree_search_offset(ctl, offset, 1, 0); 3101 entry = tree_search_offset(ctl, offset, 1, 0);
3047 if (!entry) { 3102 if (!entry) {
3048 spin_unlock(&ctl->tree_lock); 3103 spin_unlock(&ctl->tree_lock);
3104 mutex_unlock(&ctl->cache_writeout_mutex);
3049 next_bitmap = true; 3105 next_bitmap = true;
3050 goto next; 3106 goto next;
3051 } 3107 }
@@ -3054,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3054 ret2 = search_bitmap(ctl, entry, &start, &bytes); 3110 ret2 = search_bitmap(ctl, entry, &start, &bytes);
3055 if (ret2 || start >= end) { 3111 if (ret2 || start >= end) {
3056 spin_unlock(&ctl->tree_lock); 3112 spin_unlock(&ctl->tree_lock);
3113 mutex_unlock(&ctl->cache_writeout_mutex);
3057 next_bitmap = true; 3114 next_bitmap = true;
3058 goto next; 3115 goto next;
3059 } 3116 }
@@ -3061,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3061 bytes = min(bytes, end - start); 3118 bytes = min(bytes, end - start);
3062 if (bytes < minlen) { 3119 if (bytes < minlen) {
3063 spin_unlock(&ctl->tree_lock); 3120 spin_unlock(&ctl->tree_lock);
3121 mutex_unlock(&ctl->cache_writeout_mutex);
3064 goto next; 3122 goto next;
3065 } 3123 }
3066 3124
@@ -3069,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3069 free_bitmap(ctl, entry); 3127 free_bitmap(ctl, entry);
3070 3128
3071 spin_unlock(&ctl->tree_lock); 3129 spin_unlock(&ctl->tree_lock);
3130 trim_entry.start = start;
3131 trim_entry.bytes = bytes;
3132 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3133 mutex_unlock(&ctl->cache_writeout_mutex);
3072 3134
3073 ret = do_trimming(block_group, total_trimmed, start, bytes, 3135 ret = do_trimming(block_group, total_trimmed, start, bytes,
3074 start, bytes); 3136 start, bytes, &trim_entry);
3075 if (ret) 3137 if (ret)
3076 break; 3138 break;
3077next: 3139next:
@@ -3101,11 +3163,52 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
3101 3163
3102 *trimmed = 0; 3164 *trimmed = 0;
3103 3165
3166 spin_lock(&block_group->lock);
3167 if (block_group->removed) {
3168 spin_unlock(&block_group->lock);
3169 return 0;
3170 }
3171 atomic_inc(&block_group->trimming);
3172 spin_unlock(&block_group->lock);
3173
3104 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); 3174 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3105 if (ret) 3175 if (ret)
3106 return ret; 3176 goto out;
3107 3177
3108 ret = trim_bitmaps(block_group, trimmed, start, end, minlen); 3178 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3179out:
3180 spin_lock(&block_group->lock);
3181 if (atomic_dec_and_test(&block_group->trimming) &&
3182 block_group->removed) {
3183 struct extent_map_tree *em_tree;
3184 struct extent_map *em;
3185
3186 spin_unlock(&block_group->lock);
3187
3188 em_tree = &block_group->fs_info->mapping_tree.map_tree;
3189 write_lock(&em_tree->lock);
3190 em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3191 1);
3192 BUG_ON(!em); /* logic error, can't happen */
3193 remove_extent_mapping(em_tree, em);
3194 write_unlock(&em_tree->lock);
3195
3196 lock_chunks(block_group->fs_info->chunk_root);
3197 list_del_init(&em->list);
3198 unlock_chunks(block_group->fs_info->chunk_root);
3199
3200 /* once for us and once for the tree */
3201 free_extent_map(em);
3202 free_extent_map(em);
3203
3204 /*
3205 * We've left one free space entry and other tasks trimming
3206 * this block group have left 1 entry each one. Free them.
3207 */
3208 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3209 } else {
3210 spin_unlock(&block_group->lock);
3211 }
3109 3212
3110 return ret; 3213 return ret;
3111} 3214}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 0cf4977ef70d..88b2238a0aed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -38,6 +38,8 @@ struct btrfs_free_space_ctl {
38 u64 start; 38 u64 start;
39 struct btrfs_free_space_op *op; 39 struct btrfs_free_space_op *op;
40 void *private; 40 void *private;
41 struct mutex cache_writeout_mutex;
42 struct list_head trimming_ranges;
41}; 43};
42 44
43struct btrfs_free_space_op { 45struct btrfs_free_space_op {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 83d646bd2e4b..74faea3a516e 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root)
178 root->root_key.objectid); 178 root->root_key.objectid);
179 if (IS_ERR(tsk)) { 179 if (IS_ERR(tsk)) {
180 btrfs_warn(root->fs_info, "failed to start inode caching task"); 180 btrfs_warn(root->fs_info, "failed to start inode caching task");
181 btrfs_clear_and_info(root, CHANGE_INODE_CACHE, 181 btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
182 "disabling inode map caching"); 182 "disabling inode map caching");
183 } 183 }
184} 184}
@@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root)
364 ctl->start = 0; 364 ctl->start = 0;
365 ctl->private = NULL; 365 ctl->private = NULL;
366 ctl->op = &free_ino_op; 366 ctl->op = &free_ino_op;
367 INIT_LIST_HEAD(&ctl->trimming_ranges);
368 mutex_init(&ctl->cache_writeout_mutex);
367 369
368 /* 370 /*
369 * Initially we allow to use 16K of ram to cache chunks of 371 * Initially we allow to use 16K of ram to cache chunks of
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ff0dcc016b71..e687bb0dc73a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode)
382 * are written in the same order that the flusher thread sent them 382 * are written in the same order that the flusher thread sent them
383 * down. 383 * down.
384 */ 384 */
385static noinline int compress_file_range(struct inode *inode, 385static noinline void compress_file_range(struct inode *inode,
386 struct page *locked_page, 386 struct page *locked_page,
387 u64 start, u64 end, 387 u64 start, u64 end,
388 struct async_cow *async_cow, 388 struct async_cow *async_cow,
@@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode,
411 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 411 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
412 btrfs_add_inode_defrag(NULL, inode); 412 btrfs_add_inode_defrag(NULL, inode);
413 413
414 /*
415 * skip compression for a small file range(<=blocksize) that
416 * isn't an inline extent, since it dosen't save disk space at all.
417 */
418 if ((end - start + 1) <= blocksize &&
419 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
420 goto cleanup_and_bail_uncompressed;
421
422 actual_end = min_t(u64, isize, end + 1); 414 actual_end = min_t(u64, isize, end + 1);
423again: 415again:
424 will_compress = 0; 416 will_compress = 0;
@@ -440,6 +432,14 @@ again:
440 432
441 total_compressed = actual_end - start; 433 total_compressed = actual_end - start;
442 434
435 /*
436 * skip compression for a small file range(<=blocksize) that
437 * isn't an inline extent, since it dosen't save disk space at all.
438 */
439 if (total_compressed <= blocksize &&
440 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
441 goto cleanup_and_bail_uncompressed;
442
443 /* we want to make sure that amount of ram required to uncompress 443 /* we want to make sure that amount of ram required to uncompress
444 * an extent is reasonable, so we limit the total size in ram 444 * an extent is reasonable, so we limit the total size in ram
445 * of a compressed extent to 128k. This is a crucial number 445 * of a compressed extent to 128k. This is a crucial number
@@ -527,7 +527,10 @@ cont:
527 if (ret <= 0) { 527 if (ret <= 0) {
528 unsigned long clear_flags = EXTENT_DELALLOC | 528 unsigned long clear_flags = EXTENT_DELALLOC |
529 EXTENT_DEFRAG; 529 EXTENT_DEFRAG;
530 unsigned long page_error_op;
531
530 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; 532 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
533 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
531 534
532 /* 535 /*
533 * inline extent creation worked or returned error, 536 * inline extent creation worked or returned error,
@@ -538,6 +541,7 @@ cont:
538 clear_flags, PAGE_UNLOCK | 541 clear_flags, PAGE_UNLOCK |
539 PAGE_CLEAR_DIRTY | 542 PAGE_CLEAR_DIRTY |
540 PAGE_SET_WRITEBACK | 543 PAGE_SET_WRITEBACK |
544 page_error_op |
541 PAGE_END_WRITEBACK); 545 PAGE_END_WRITEBACK);
542 goto free_pages_out; 546 goto free_pages_out;
543 } 547 }
@@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed:
620 *num_added += 1; 624 *num_added += 1;
621 } 625 }
622 626
623out: 627 return;
624 return ret;
625 628
626free_pages_out: 629free_pages_out:
627 for (i = 0; i < nr_pages_ret; i++) { 630 for (i = 0; i < nr_pages_ret; i++) {
@@ -629,8 +632,22 @@ free_pages_out:
629 page_cache_release(pages[i]); 632 page_cache_release(pages[i]);
630 } 633 }
631 kfree(pages); 634 kfree(pages);
635}
632 636
633 goto out; 637static void free_async_extent_pages(struct async_extent *async_extent)
638{
639 int i;
640
641 if (!async_extent->pages)
642 return;
643
644 for (i = 0; i < async_extent->nr_pages; i++) {
645 WARN_ON(async_extent->pages[i]->mapping);
646 page_cache_release(async_extent->pages[i]);
647 }
648 kfree(async_extent->pages);
649 async_extent->nr_pages = 0;
650 async_extent->pages = NULL;
634} 651}
635 652
636/* 653/*
@@ -639,7 +656,7 @@ free_pages_out:
639 * queued. We walk all the async extents created by compress_file_range 656 * queued. We walk all the async extents created by compress_file_range
640 * and send them down to the disk. 657 * and send them down to the disk.
641 */ 658 */
642static noinline int submit_compressed_extents(struct inode *inode, 659static noinline void submit_compressed_extents(struct inode *inode,
643 struct async_cow *async_cow) 660 struct async_cow *async_cow)
644{ 661{
645 struct async_extent *async_extent; 662 struct async_extent *async_extent;
@@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
651 struct extent_io_tree *io_tree; 668 struct extent_io_tree *io_tree;
652 int ret = 0; 669 int ret = 0;
653 670
654 if (list_empty(&async_cow->extents))
655 return 0;
656
657again: 671again:
658 while (!list_empty(&async_cow->extents)) { 672 while (!list_empty(&async_cow->extents)) {
659 async_extent = list_entry(async_cow->extents.next, 673 async_extent = list_entry(async_cow->extents.next,
@@ -709,15 +723,7 @@ retry:
709 async_extent->compressed_size, 723 async_extent->compressed_size,
710 0, alloc_hint, &ins, 1, 1); 724 0, alloc_hint, &ins, 1, 1);
711 if (ret) { 725 if (ret) {
712 int i; 726 free_async_extent_pages(async_extent);
713
714 for (i = 0; i < async_extent->nr_pages; i++) {
715 WARN_ON(async_extent->pages[i]->mapping);
716 page_cache_release(async_extent->pages[i]);
717 }
718 kfree(async_extent->pages);
719 async_extent->nr_pages = 0;
720 async_extent->pages = NULL;
721 727
722 if (ret == -ENOSPC) { 728 if (ret == -ENOSPC) {
723 unlock_extent(io_tree, async_extent->start, 729 unlock_extent(io_tree, async_extent->start,
@@ -814,15 +820,26 @@ retry:
814 ins.objectid, 820 ins.objectid,
815 ins.offset, async_extent->pages, 821 ins.offset, async_extent->pages,
816 async_extent->nr_pages); 822 async_extent->nr_pages);
823 if (ret) {
824 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
825 struct page *p = async_extent->pages[0];
826 const u64 start = async_extent->start;
827 const u64 end = start + async_extent->ram_size - 1;
828
829 p->mapping = inode->i_mapping;
830 tree->ops->writepage_end_io_hook(p, start, end,
831 NULL, 0);
832 p->mapping = NULL;
833 extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
834 PAGE_END_WRITEBACK |
835 PAGE_SET_ERROR);
836 free_async_extent_pages(async_extent);
837 }
817 alloc_hint = ins.objectid + ins.offset; 838 alloc_hint = ins.objectid + ins.offset;
818 kfree(async_extent); 839 kfree(async_extent);
819 if (ret)
820 goto out;
821 cond_resched(); 840 cond_resched();
822 } 841 }
823 ret = 0; 842 return;
824out:
825 return ret;
826out_free_reserve: 843out_free_reserve:
827 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 844 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
828out_free: 845out_free:
@@ -832,7 +849,9 @@ out_free:
832 NULL, EXTENT_LOCKED | EXTENT_DELALLOC | 849 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
833 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 850 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
834 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 851 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
835 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); 852 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
853 PAGE_SET_ERROR);
854 free_async_extent_pages(async_extent);
836 kfree(async_extent); 855 kfree(async_extent);
837 goto again; 856 goto again;
838} 857}
@@ -1318,7 +1337,7 @@ next_slot:
1318 * we fall into common COW way. 1337 * we fall into common COW way.
1319 */ 1338 */
1320 if (!nolock) { 1339 if (!nolock) {
1321 err = btrfs_start_nocow_write(root); 1340 err = btrfs_start_write_no_snapshoting(root);
1322 if (!err) 1341 if (!err)
1323 goto out_check; 1342 goto out_check;
1324 } 1343 }
@@ -1342,7 +1361,7 @@ out_check:
1342 if (extent_end <= start) { 1361 if (extent_end <= start) {
1343 path->slots[0]++; 1362 path->slots[0]++;
1344 if (!nolock && nocow) 1363 if (!nolock && nocow)
1345 btrfs_end_nocow_write(root); 1364 btrfs_end_write_no_snapshoting(root);
1346 goto next_slot; 1365 goto next_slot;
1347 } 1366 }
1348 if (!nocow) { 1367 if (!nocow) {
@@ -1362,7 +1381,7 @@ out_check:
1362 page_started, nr_written, 1); 1381 page_started, nr_written, 1);
1363 if (ret) { 1382 if (ret) {
1364 if (!nolock && nocow) 1383 if (!nolock && nocow)
1365 btrfs_end_nocow_write(root); 1384 btrfs_end_write_no_snapshoting(root);
1366 goto error; 1385 goto error;
1367 } 1386 }
1368 cow_start = (u64)-1; 1387 cow_start = (u64)-1;
@@ -1413,7 +1432,7 @@ out_check:
1413 num_bytes); 1432 num_bytes);
1414 if (ret) { 1433 if (ret) {
1415 if (!nolock && nocow) 1434 if (!nolock && nocow)
1416 btrfs_end_nocow_write(root); 1435 btrfs_end_write_no_snapshoting(root);
1417 goto error; 1436 goto error;
1418 } 1437 }
1419 } 1438 }
@@ -1424,7 +1443,7 @@ out_check:
1424 EXTENT_DELALLOC, PAGE_UNLOCK | 1443 EXTENT_DELALLOC, PAGE_UNLOCK |
1425 PAGE_SET_PRIVATE2); 1444 PAGE_SET_PRIVATE2);
1426 if (!nolock && nocow) 1445 if (!nolock && nocow)
1427 btrfs_end_nocow_write(root); 1446 btrfs_end_write_no_snapshoting(root);
1428 cur_offset = extent_end; 1447 cur_offset = extent_end;
1429 if (cur_offset > end) 1448 if (cur_offset > end)
1430 break; 1449 break;
@@ -4580,6 +4599,26 @@ next:
4580 return err; 4599 return err;
4581} 4600}
4582 4601
4602static int wait_snapshoting_atomic_t(atomic_t *a)
4603{
4604 schedule();
4605 return 0;
4606}
4607
4608static void wait_for_snapshot_creation(struct btrfs_root *root)
4609{
4610 while (true) {
4611 int ret;
4612
4613 ret = btrfs_start_write_no_snapshoting(root);
4614 if (ret)
4615 break;
4616 wait_on_atomic_t(&root->will_be_snapshoted,
4617 wait_snapshoting_atomic_t,
4618 TASK_UNINTERRUPTIBLE);
4619 }
4620}
4621
4583static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4622static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4584{ 4623{
4585 struct btrfs_root *root = BTRFS_I(inode)->root; 4624 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4604,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4604 4643
4605 if (newsize > oldsize) { 4644 if (newsize > oldsize) {
4606 truncate_pagecache(inode, newsize); 4645 truncate_pagecache(inode, newsize);
4646 /*
4647 * Don't do an expanding truncate while snapshoting is ongoing.
4648 * This is to ensure the snapshot captures a fully consistent
4649 * state of this file - if the snapshot captures this expanding
4650 * truncation, it must capture all writes that happened before
4651 * this truncation.
4652 */
4653 wait_for_snapshot_creation(root);
4607 ret = btrfs_cont_expand(inode, oldsize, newsize); 4654 ret = btrfs_cont_expand(inode, oldsize, newsize);
4608 if (ret) 4655 if (ret) {
4656 btrfs_end_write_no_snapshoting(root);
4609 return ret; 4657 return ret;
4658 }
4610 4659
4611 trans = btrfs_start_transaction(root, 1); 4660 trans = btrfs_start_transaction(root, 1);
4612 if (IS_ERR(trans)) 4661 if (IS_ERR(trans)) {
4662 btrfs_end_write_no_snapshoting(root);
4613 return PTR_ERR(trans); 4663 return PTR_ERR(trans);
4664 }
4614 4665
4615 i_size_write(inode, newsize); 4666 i_size_write(inode, newsize);
4616 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 4667 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4617 ret = btrfs_update_inode(trans, root, inode); 4668 ret = btrfs_update_inode(trans, root, inode);
4669 btrfs_end_write_no_snapshoting(root);
4618 btrfs_end_transaction(trans, root); 4670 btrfs_end_transaction(trans, root);
4619 } else { 4671 } else {
4620 4672
@@ -7000,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7000 btrfs_put_ordered_extent(ordered); 7052 btrfs_put_ordered_extent(ordered);
7001 } else { 7053 } else {
7002 /* Screw you mmap */ 7054 /* Screw you mmap */
7003 ret = filemap_write_and_wait_range(inode->i_mapping, 7055 ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
7004 lockstart, 7056 if (ret)
7005 lockend); 7057 break;
7058 ret = filemap_fdatawait_range(inode->i_mapping,
7059 lockstart,
7060 lockend);
7006 if (ret) 7061 if (ret)
7007 break; 7062 break;
7008 7063
@@ -9442,6 +9497,21 @@ out_inode:
9442 9497
9443} 9498}
9444 9499
9500/* Inspired by filemap_check_errors() */
9501int btrfs_inode_check_errors(struct inode *inode)
9502{
9503 int ret = 0;
9504
9505 if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
9506 test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
9507 ret = -ENOSPC;
9508 if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
9509 test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
9510 ret = -EIO;
9511
9512 return ret;
9513}
9514
9445static const struct inode_operations btrfs_dir_inode_operations = { 9515static const struct inode_operations btrfs_dir_inode_operations = {
9446 .getattr = btrfs_getattr, 9516 .getattr = btrfs_getattr,
9447 .lookup = btrfs_lookup, 9517 .lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 080fe66c0349..d49fe8a0f6b5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -617,7 +617,7 @@ fail:
617 return ret; 617 return ret;
618} 618}
619 619
620static void btrfs_wait_nocow_write(struct btrfs_root *root) 620static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
621{ 621{
622 s64 writers; 622 s64 writers;
623 DEFINE_WAIT(wait); 623 DEFINE_WAIT(wait);
@@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
649 649
650 atomic_inc(&root->will_be_snapshoted); 650 atomic_inc(&root->will_be_snapshoted);
651 smp_mb__after_atomic(); 651 smp_mb__after_atomic();
652 btrfs_wait_nocow_write(root); 652 btrfs_wait_for_no_snapshoting_writes(root);
653 653
654 ret = btrfs_start_delalloc_inodes(root, 0); 654 ret = btrfs_start_delalloc_inodes(root, 0);
655 if (ret) 655 if (ret)
@@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
717 if (ret) 717 if (ret)
718 goto fail; 718 goto fail;
719 719
720 /*
721 * If orphan cleanup did remove any orphans, it means the tree was
722 * modified and therefore the commit root is not the same as the
723 * current root anymore. This is a problem, because send uses the
724 * commit root and therefore can see inode items that don't exist
725 * in the current root anymore, and for example make calls to
726 * btrfs_iget, which will do tree lookups based on the current root
727 * and not on the commit root. Those lookups will fail, returning a
728 * -ESTALE error, and making send fail with that error. So make sure
729 * a send does not see any orphans we have just removed, and that it
730 * will see the same inodes regardless of whether a transaction
731 * commit happened before it started (meaning that the commit root
732 * will be the same as the current root) or not.
733 */
734 if (readonly && pending_snapshot->snap->node !=
735 pending_snapshot->snap->commit_root) {
736 trans = btrfs_join_transaction(pending_snapshot->snap);
737 if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
738 ret = PTR_ERR(trans);
739 goto fail;
740 }
741 if (!IS_ERR(trans)) {
742 ret = btrfs_commit_transaction(trans,
743 pending_snapshot->snap);
744 if (ret)
745 goto fail;
746 }
747 }
748
749 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 720 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
750 if (IS_ERR(inode)) { 721 if (IS_ERR(inode)) {
751 ret = PTR_ERR(inode); 722 ret = PTR_ERR(inode);
@@ -761,7 +732,8 @@ fail:
761free: 732free:
762 kfree(pending_snapshot); 733 kfree(pending_snapshot);
763out: 734out:
764 atomic_dec(&root->will_be_snapshoted); 735 if (atomic_dec_and_test(&root->will_be_snapshoted))
736 wake_up_atomic_t(&root->will_be_snapshoted);
765 return ret; 737 return ret;
766} 738}
767 739
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ac734ec4cc20..534544e08f76 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 INIT_LIST_HEAD(&entry->work_list); 220 INIT_LIST_HEAD(&entry->work_list);
221 init_completion(&entry->completion); 221 init_completion(&entry->completion);
222 INIT_LIST_HEAD(&entry->log_list); 222 INIT_LIST_HEAD(&entry->log_list);
223 INIT_LIST_HEAD(&entry->trans_list);
223 224
224 trace_btrfs_ordered_extent_add(inode, entry); 225 trace_btrfs_ordered_extent_add(inode, entry);
225 226
@@ -431,19 +432,31 @@ out:
431 432
432/* Needs to either be called under a log transaction or the log_mutex */ 433/* Needs to either be called under a log transaction or the log_mutex */
433void btrfs_get_logged_extents(struct inode *inode, 434void btrfs_get_logged_extents(struct inode *inode,
434 struct list_head *logged_list) 435 struct list_head *logged_list,
436 const loff_t start,
437 const loff_t end)
435{ 438{
436 struct btrfs_ordered_inode_tree *tree; 439 struct btrfs_ordered_inode_tree *tree;
437 struct btrfs_ordered_extent *ordered; 440 struct btrfs_ordered_extent *ordered;
438 struct rb_node *n; 441 struct rb_node *n;
442 struct rb_node *prev;
439 443
440 tree = &BTRFS_I(inode)->ordered_tree; 444 tree = &BTRFS_I(inode)->ordered_tree;
441 spin_lock_irq(&tree->lock); 445 spin_lock_irq(&tree->lock);
442 for (n = rb_first(&tree->tree); n; n = rb_next(n)) { 446 n = __tree_search(&tree->tree, end, &prev);
447 if (!n)
448 n = prev;
449 for (; n; n = rb_prev(n)) {
443 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 450 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
451 if (ordered->file_offset > end)
452 continue;
453 if (entry_end(ordered) <= start)
454 break;
444 if (!list_empty(&ordered->log_list)) 455 if (!list_empty(&ordered->log_list))
445 continue; 456 continue;
446 list_add_tail(&ordered->log_list, logged_list); 457 if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
458 continue;
459 list_add(&ordered->log_list, logged_list);
447 atomic_inc(&ordered->refs); 460 atomic_inc(&ordered->refs);
448 } 461 }
449 spin_unlock_irq(&tree->lock); 462 spin_unlock_irq(&tree->lock);
@@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
472 spin_unlock_irq(&log->log_extents_lock[index]); 485 spin_unlock_irq(&log->log_extents_lock[index]);
473} 486}
474 487
475void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) 488void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
489 struct btrfs_root *log, u64 transid)
476{ 490{
477 struct btrfs_ordered_extent *ordered; 491 struct btrfs_ordered_extent *ordered;
478 int index = transid % 2; 492 int index = transid % 2;
@@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
497 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, 511 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
498 &ordered->flags)); 512 &ordered->flags));
499 513
500 btrfs_put_ordered_extent(ordered); 514 if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
515 list_add_tail(&ordered->trans_list, &trans->ordered);
501 spin_lock_irq(&log->log_extents_lock[index]); 516 spin_lock_irq(&log->log_extents_lock[index]);
502 } 517 }
503 spin_unlock_irq(&log->log_extents_lock[index]); 518 spin_unlock_irq(&log->log_extents_lock[index]);
@@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
725 /* start IO across the range first to instantiate any delalloc 740 /* start IO across the range first to instantiate any delalloc
726 * extents 741 * extents
727 */ 742 */
728 ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 743 ret = btrfs_fdatawrite_range(inode, start, orig_end);
729 if (ret) 744 if (ret)
730 return ret; 745 return ret;
731 /* 746
732 * So with compression we will find and lock a dirty page and clear the
733 * first one as dirty, setup an async extent, and immediately return
734 * with the entire range locked but with nobody actually marked with
735 * writeback. So we can't just filemap_write_and_wait_range() and
736 * expect it to work since it will just kick off a thread to do the
737 * actual work. So we need to call filemap_fdatawrite_range _again_
738 * since it will wait on the page lock, which won't be unlocked until
739 * after the pages have been marked as writeback and so we're good to go
740 * from there. We have to do this otherwise we'll miss the ordered
741 * extents and that results in badness. Please Josef, do not think you
742 * know better and pull this out at some point in the future, it is
743 * right and you are wrong.
744 */
745 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
746 &BTRFS_I(inode)->runtime_flags)) {
747 ret = filemap_fdatawrite_range(inode->i_mapping, start,
748 orig_end);
749 if (ret)
750 return ret;
751 }
752 ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); 747 ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
753 if (ret) 748 if (ret)
754 return ret; 749 return ret;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d81a274d621e..e96cd4ccd805 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum {
71 ordered extent */ 71 ordered extent */
72#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ 72#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
73 73
74#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
75 * in the logging code. */
74struct btrfs_ordered_extent { 76struct btrfs_ordered_extent {
75 /* logical offset in the file */ 77 /* logical offset in the file */
76 u64 file_offset; 78 u64 file_offset;
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent {
121 /* If we need to wait on this to be done */ 123 /* If we need to wait on this to be done */
122 struct list_head log_list; 124 struct list_head log_list;
123 125
126 /* If the transaction needs to wait on this ordered extent */
127 struct list_head trans_list;
128
124 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ 129 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
125 wait_queue_head_t wait; 130 wait_queue_head_t wait;
126 131
@@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
193int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
194void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
195void btrfs_get_logged_extents(struct inode *inode, 200void btrfs_get_logged_extents(struct inode *inode,
196 struct list_head *logged_list); 201 struct list_head *logged_list,
202 const loff_t start,
203 const loff_t end);
197void btrfs_put_logged_extents(struct list_head *logged_list); 204void btrfs_put_logged_extents(struct list_head *logged_list);
198void btrfs_submit_logged_extents(struct list_head *logged_list, 205void btrfs_submit_logged_extents(struct list_head *logged_list,
199 struct btrfs_root *log); 206 struct btrfs_root *log);
200void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 207void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
208 struct btrfs_root *log, u64 transid);
201void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 209void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
202int __init ordered_data_init(void); 210int __init ordered_data_init(void);
203void ordered_data_exit(void); 211void ordered_data_exit(void);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 6a41631cb959..8ab2a17bbba8 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,9 +58,23 @@
58 */ 58 */
59#define RBIO_CACHE_READY_BIT 3 59#define RBIO_CACHE_READY_BIT 3
60 60
61/*
62 * bbio and raid_map is managed by the caller, so we shouldn't free
63 * them here. And besides that, all rbios with this flag should not
64 * be cached, because we need raid_map to check the rbios' stripe
65 * is the same or not, but it is very likely that the caller has
66 * free raid_map, so don't cache those rbios.
67 */
68#define RBIO_HOLD_BBIO_MAP_BIT 4
61 69
62#define RBIO_CACHE_SIZE 1024 70#define RBIO_CACHE_SIZE 1024
63 71
72enum btrfs_rbio_ops {
73 BTRFS_RBIO_WRITE = 0,
74 BTRFS_RBIO_READ_REBUILD = 1,
75 BTRFS_RBIO_PARITY_SCRUB = 2,
76};
77
64struct btrfs_raid_bio { 78struct btrfs_raid_bio {
65 struct btrfs_fs_info *fs_info; 79 struct btrfs_fs_info *fs_info;
66 struct btrfs_bio *bbio; 80 struct btrfs_bio *bbio;
@@ -117,13 +131,16 @@ struct btrfs_raid_bio {
117 /* number of data stripes (no p/q) */ 131 /* number of data stripes (no p/q) */
118 int nr_data; 132 int nr_data;
119 133
134 int real_stripes;
135
136 int stripe_npages;
120 /* 137 /*
121 * set if we're doing a parity rebuild 138 * set if we're doing a parity rebuild
122 * for a read from higher up, which is handled 139 * for a read from higher up, which is handled
123 * differently from a parity rebuild as part of 140 * differently from a parity rebuild as part of
124 * rmw 141 * rmw
125 */ 142 */
126 int read_rebuild; 143 enum btrfs_rbio_ops operation;
127 144
128 /* first bad stripe */ 145 /* first bad stripe */
129 int faila; 146 int faila;
@@ -131,6 +148,7 @@ struct btrfs_raid_bio {
131 /* second bad stripe (for raid6 use) */ 148 /* second bad stripe (for raid6 use) */
132 int failb; 149 int failb;
133 150
151 int scrubp;
134 /* 152 /*
135 * number of pages needed to represent the full 153 * number of pages needed to represent the full
136 * stripe 154 * stripe
@@ -144,8 +162,13 @@ struct btrfs_raid_bio {
144 */ 162 */
145 int bio_list_bytes; 163 int bio_list_bytes;
146 164
165 int generic_bio_cnt;
166
147 atomic_t refs; 167 atomic_t refs;
148 168
169 atomic_t stripes_pending;
170
171 atomic_t error;
149 /* 172 /*
150 * these are two arrays of pointers. We allocate the 173 * these are two arrays of pointers. We allocate the
151 * rbio big enough to hold them both and setup their 174 * rbio big enough to hold them both and setup their
@@ -162,6 +185,11 @@ struct btrfs_raid_bio {
162 * here for faster lookup 185 * here for faster lookup
163 */ 186 */
164 struct page **bio_pages; 187 struct page **bio_pages;
188
189 /*
190 * bitmap to record which horizontal stripe has data
191 */
192 unsigned long *dbitmap;
165}; 193};
166 194
167static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 195static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio);
176static void index_rbio_pages(struct btrfs_raid_bio *rbio); 204static void index_rbio_pages(struct btrfs_raid_bio *rbio);
177static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 205static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
178 206
207static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
208 int need_check);
209static void async_scrub_parity(struct btrfs_raid_bio *rbio);
210
179/* 211/*
180 * the stripe hash table is used for locking, and to collect 212 * the stripe hash table is used for locking, and to collect
181 * bios in hopes of making a full stripe 213 * bios in hopes of making a full stripe
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
324{ 356{
325 bio_list_merge(&dest->bio_list, &victim->bio_list); 357 bio_list_merge(&dest->bio_list, &victim->bio_list);
326 dest->bio_list_bytes += victim->bio_list_bytes; 358 dest->bio_list_bytes += victim->bio_list_bytes;
359 dest->generic_bio_cnt += victim->generic_bio_cnt;
327 bio_list_init(&victim->bio_list); 360 bio_list_init(&victim->bio_list);
328} 361}
329 362
@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
577 cur->raid_map[0]) 610 cur->raid_map[0])
578 return 0; 611 return 0;
579 612
580 /* reads can't merge with writes */ 613 /* we can't merge with different operations */
581 if (last->read_rebuild != 614 if (last->operation != cur->operation)
582 cur->read_rebuild) { 615 return 0;
616 /*
617 * We've need read the full stripe from the drive.
618 * check and repair the parity and write the new results.
619 *
620 * We're not allowed to add any new bios to the
621 * bio list here, anyone else that wants to
622 * change this stripe needs to do their own rmw.
623 */
624 if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
625 cur->operation == BTRFS_RBIO_PARITY_SCRUB)
583 return 0; 626 return 0;
584 }
585 627
586 return 1; 628 return 1;
587} 629}
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
601 */ 643 */
602static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 644static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
603{ 645{
604 if (rbio->nr_data + 1 == rbio->bbio->num_stripes) 646 if (rbio->nr_data + 1 == rbio->real_stripes)
605 return NULL; 647 return NULL;
606 648
607 index += ((rbio->nr_data + 1) * rbio->stripe_len) >> 649 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
772 spin_unlock(&rbio->bio_list_lock); 814 spin_unlock(&rbio->bio_list_lock);
773 spin_unlock_irqrestore(&h->lock, flags); 815 spin_unlock_irqrestore(&h->lock, flags);
774 816
775 if (next->read_rebuild) 817 if (next->operation == BTRFS_RBIO_READ_REBUILD)
776 async_read_rebuild(next); 818 async_read_rebuild(next);
777 else { 819 else if (next->operation == BTRFS_RBIO_WRITE) {
778 steal_rbio(rbio, next); 820 steal_rbio(rbio, next);
779 async_rmw_stripe(next); 821 async_rmw_stripe(next);
822 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
823 steal_rbio(rbio, next);
824 async_scrub_parity(next);
780 } 825 }
781 826
782 goto done_nolock; 827 goto done_nolock;
@@ -796,6 +841,21 @@ done_nolock:
796 remove_rbio_from_cache(rbio); 841 remove_rbio_from_cache(rbio);
797} 842}
798 843
844static inline void
845__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
846{
847 if (need) {
848 kfree(raid_map);
849 kfree(bbio);
850 }
851}
852
853static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
854{
855 __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
856 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
857}
858
799static void __free_raid_bio(struct btrfs_raid_bio *rbio) 859static void __free_raid_bio(struct btrfs_raid_bio *rbio)
800{ 860{
801 int i; 861 int i;
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
814 rbio->stripe_pages[i] = NULL; 874 rbio->stripe_pages[i] = NULL;
815 } 875 }
816 } 876 }
817 kfree(rbio->raid_map); 877
818 kfree(rbio->bbio); 878 free_bbio_and_raid_map(rbio);
879
819 kfree(rbio); 880 kfree(rbio);
820} 881}
821 882
@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
833{ 894{
834 struct bio *cur = bio_list_get(&rbio->bio_list); 895 struct bio *cur = bio_list_get(&rbio->bio_list);
835 struct bio *next; 896 struct bio *next;
897
898 if (rbio->generic_bio_cnt)
899 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
900
836 free_raid_bio(rbio); 901 free_raid_bio(rbio);
837 902
838 while (cur) { 903 while (cur) {
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err)
858 923
859 bio_put(bio); 924 bio_put(bio);
860 925
861 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 926 if (!atomic_dec_and_test(&rbio->stripes_pending))
862 return; 927 return;
863 928
864 err = 0; 929 err = 0;
865 930
866 /* OK, we have read all the stripes we need to. */ 931 /* OK, we have read all the stripes we need to. */
867 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 932 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
868 err = -EIO; 933 err = -EIO;
869 934
870 rbio_orig_end_io(rbio, err, 0); 935 rbio_orig_end_io(rbio, err, 0);
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
925{ 990{
926 struct btrfs_raid_bio *rbio; 991 struct btrfs_raid_bio *rbio;
927 int nr_data = 0; 992 int nr_data = 0;
928 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); 993 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
994 int num_pages = rbio_nr_pages(stripe_len, real_stripes);
995 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
929 void *p; 996 void *p;
930 997
931 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, 998 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
999 DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
932 GFP_NOFS); 1000 GFP_NOFS);
933 if (!rbio) { 1001 if (!rbio)
934 kfree(raid_map);
935 kfree(bbio);
936 return ERR_PTR(-ENOMEM); 1002 return ERR_PTR(-ENOMEM);
937 }
938 1003
939 bio_list_init(&rbio->bio_list); 1004 bio_list_init(&rbio->bio_list);
940 INIT_LIST_HEAD(&rbio->plug_list); 1005 INIT_LIST_HEAD(&rbio->plug_list);
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
946 rbio->fs_info = root->fs_info; 1011 rbio->fs_info = root->fs_info;
947 rbio->stripe_len = stripe_len; 1012 rbio->stripe_len = stripe_len;
948 rbio->nr_pages = num_pages; 1013 rbio->nr_pages = num_pages;
1014 rbio->real_stripes = real_stripes;
1015 rbio->stripe_npages = stripe_npages;
949 rbio->faila = -1; 1016 rbio->faila = -1;
950 rbio->failb = -1; 1017 rbio->failb = -1;
951 atomic_set(&rbio->refs, 1); 1018 atomic_set(&rbio->refs, 1);
1019 atomic_set(&rbio->error, 0);
1020 atomic_set(&rbio->stripes_pending, 0);
952 1021
953 /* 1022 /*
954 * the stripe_pages and bio_pages array point to the extra 1023 * the stripe_pages and bio_pages array point to the extra
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
957 p = rbio + 1; 1026 p = rbio + 1;
958 rbio->stripe_pages = p; 1027 rbio->stripe_pages = p;
959 rbio->bio_pages = p + sizeof(struct page *) * num_pages; 1028 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
1029 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
960 1030
961 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 1031 if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
962 nr_data = bbio->num_stripes - 2; 1032 nr_data = real_stripes - 2;
963 else 1033 else
964 nr_data = bbio->num_stripes - 1; 1034 nr_data = real_stripes - 1;
965 1035
966 rbio->nr_data = nr_data; 1036 rbio->nr_data = nr_data;
967 return rbio; 1037 return rbio;
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1073static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1143static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1074{ 1144{
1075 if (rbio->faila >= 0 || rbio->failb >= 0) { 1145 if (rbio->faila >= 0 || rbio->failb >= 0) {
1076 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); 1146 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1077 __raid56_parity_recover(rbio); 1147 __raid56_parity_recover(rbio);
1078 } else { 1148 } else {
1079 finish_rmw(rbio); 1149 finish_rmw(rbio);
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1134static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1204static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1135{ 1205{
1136 struct btrfs_bio *bbio = rbio->bbio; 1206 struct btrfs_bio *bbio = rbio->bbio;
1137 void *pointers[bbio->num_stripes]; 1207 void *pointers[rbio->real_stripes];
1138 int stripe_len = rbio->stripe_len; 1208 int stripe_len = rbio->stripe_len;
1139 int nr_data = rbio->nr_data; 1209 int nr_data = rbio->nr_data;
1140 int stripe; 1210 int stripe;
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1148 1218
1149 bio_list_init(&bio_list); 1219 bio_list_init(&bio_list);
1150 1220
1151 if (bbio->num_stripes - rbio->nr_data == 1) { 1221 if (rbio->real_stripes - rbio->nr_data == 1) {
1152 p_stripe = bbio->num_stripes - 1; 1222 p_stripe = rbio->real_stripes - 1;
1153 } else if (bbio->num_stripes - rbio->nr_data == 2) { 1223 } else if (rbio->real_stripes - rbio->nr_data == 2) {
1154 p_stripe = bbio->num_stripes - 2; 1224 p_stripe = rbio->real_stripes - 2;
1155 q_stripe = bbio->num_stripes - 1; 1225 q_stripe = rbio->real_stripes - 1;
1156 } else { 1226 } else {
1157 BUG(); 1227 BUG();
1158 } 1228 }
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1169 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1239 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1170 spin_unlock_irq(&rbio->bio_list_lock); 1240 spin_unlock_irq(&rbio->bio_list_lock);
1171 1241
1172 atomic_set(&rbio->bbio->error, 0); 1242 atomic_set(&rbio->error, 0);
1173 1243
1174 /* 1244 /*
1175 * now that we've set rmw_locked, run through the 1245 * now that we've set rmw_locked, run through the
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1209 SetPageUptodate(p); 1279 SetPageUptodate(p);
1210 pointers[stripe++] = kmap(p); 1280 pointers[stripe++] = kmap(p);
1211 1281
1212 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, 1282 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1213 pointers); 1283 pointers);
1214 } else { 1284 } else {
1215 /* raid5 */ 1285 /* raid5 */
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1218 } 1288 }
1219 1289
1220 1290
1221 for (stripe = 0; stripe < bbio->num_stripes; stripe++) 1291 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1222 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1292 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1223 } 1293 }
1224 1294
@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1227 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1297 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1228 * everything else. 1298 * everything else.
1229 */ 1299 */
1230 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 1300 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1231 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 1301 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1232 struct page *page; 1302 struct page *page;
1233 if (stripe < rbio->nr_data) { 1303 if (stripe < rbio->nr_data) {
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1245 } 1315 }
1246 } 1316 }
1247 1317
1248 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); 1318 if (likely(!bbio->num_tgtdevs))
1249 BUG_ON(atomic_read(&bbio->stripes_pending) == 0); 1319 goto write_data;
1320
1321 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1322 if (!bbio->tgtdev_map[stripe])
1323 continue;
1324
1325 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1326 struct page *page;
1327 if (stripe < rbio->nr_data) {
1328 page = page_in_rbio(rbio, stripe, pagenr, 1);
1329 if (!page)
1330 continue;
1331 } else {
1332 page = rbio_stripe_page(rbio, stripe, pagenr);
1333 }
1334
1335 ret = rbio_add_io_page(rbio, &bio_list, page,
1336 rbio->bbio->tgtdev_map[stripe],
1337 pagenr, rbio->stripe_len);
1338 if (ret)
1339 goto cleanup;
1340 }
1341 }
1342
1343write_data:
1344 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1345 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1250 1346
1251 while (1) { 1347 while (1) {
1252 bio = bio_list_pop(&bio_list); 1348 bio = bio_list_pop(&bio_list);
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1283 stripe = &rbio->bbio->stripes[i]; 1379 stripe = &rbio->bbio->stripes[i];
1284 stripe_start = stripe->physical; 1380 stripe_start = stripe->physical;
1285 if (physical >= stripe_start && 1381 if (physical >= stripe_start &&
1286 physical < stripe_start + rbio->stripe_len) { 1382 physical < stripe_start + rbio->stripe_len &&
1383 bio->bi_bdev == stripe->dev->bdev) {
1287 return i; 1384 return i;
1288 } 1385 }
1289 } 1386 }
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1331 if (rbio->faila == -1) { 1428 if (rbio->faila == -1) {
1332 /* first failure on this rbio */ 1429 /* first failure on this rbio */
1333 rbio->faila = failed; 1430 rbio->faila = failed;
1334 atomic_inc(&rbio->bbio->error); 1431 atomic_inc(&rbio->error);
1335 } else if (rbio->failb == -1) { 1432 } else if (rbio->failb == -1) {
1336 /* second failure on this rbio */ 1433 /* second failure on this rbio */
1337 rbio->failb = failed; 1434 rbio->failb = failed;
1338 atomic_inc(&rbio->bbio->error); 1435 atomic_inc(&rbio->error);
1339 } else { 1436 } else {
1340 ret = -EIO; 1437 ret = -EIO;
1341 } 1438 }
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err)
1394 1491
1395 bio_put(bio); 1492 bio_put(bio);
1396 1493
1397 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1494 if (!atomic_dec_and_test(&rbio->stripes_pending))
1398 return; 1495 return;
1399 1496
1400 err = 0; 1497 err = 0;
1401 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1498 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1402 goto cleanup; 1499 goto cleanup;
1403 1500
1404 /* 1501 /*
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1439static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1536static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440{ 1537{
1441 int bios_to_read = 0; 1538 int bios_to_read = 0;
1442 struct btrfs_bio *bbio = rbio->bbio;
1443 struct bio_list bio_list; 1539 struct bio_list bio_list;
1444 int ret; 1540 int ret;
1445 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 1541 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1455 1551
1456 index_rbio_pages(rbio); 1552 index_rbio_pages(rbio);
1457 1553
1458 atomic_set(&rbio->bbio->error, 0); 1554 atomic_set(&rbio->error, 0);
1459 /* 1555 /*
1460 * build a list of bios to read all the missing parts of this 1556 * build a list of bios to read all the missing parts of this
1461 * stripe 1557 * stripe
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1503 * the bbio may be freed once we submit the last bio. Make sure 1599 * the bbio may be freed once we submit the last bio. Make sure
1504 * not to touch it after that 1600 * not to touch it after that
1505 */ 1601 */
1506 atomic_set(&bbio->stripes_pending, bios_to_read); 1602 atomic_set(&rbio->stripes_pending, bios_to_read);
1507 while (1) { 1603 while (1) {
1508 bio = bio_list_pop(&bio_list); 1604 bio = bio_list_pop(&bio_list);
1509 if (!bio) 1605 if (!bio)
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1686 struct btrfs_raid_bio *rbio; 1782 struct btrfs_raid_bio *rbio;
1687 struct btrfs_plug_cb *plug = NULL; 1783 struct btrfs_plug_cb *plug = NULL;
1688 struct blk_plug_cb *cb; 1784 struct blk_plug_cb *cb;
1785 int ret;
1689 1786
1690 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1787 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1691 if (IS_ERR(rbio)) 1788 if (IS_ERR(rbio)) {
1789 __free_bbio_and_raid_map(bbio, raid_map, 1);
1692 return PTR_ERR(rbio); 1790 return PTR_ERR(rbio);
1791 }
1693 bio_list_add(&rbio->bio_list, bio); 1792 bio_list_add(&rbio->bio_list, bio);
1694 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1793 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1794 rbio->operation = BTRFS_RBIO_WRITE;
1795
1796 btrfs_bio_counter_inc_noblocked(root->fs_info);
1797 rbio->generic_bio_cnt = 1;
1695 1798
1696 /* 1799 /*
1697 * don't plug on full rbios, just get them out the door 1800 * don't plug on full rbios, just get them out the door
1698 * as quickly as we can 1801 * as quickly as we can
1699 */ 1802 */
1700 if (rbio_is_full(rbio)) 1803 if (rbio_is_full(rbio)) {
1701 return full_stripe_write(rbio); 1804 ret = full_stripe_write(rbio);
1805 if (ret)
1806 btrfs_bio_counter_dec(root->fs_info);
1807 return ret;
1808 }
1702 1809
1703 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, 1810 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1704 sizeof(*plug)); 1811 sizeof(*plug));
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1709 INIT_LIST_HEAD(&plug->rbio_list); 1816 INIT_LIST_HEAD(&plug->rbio_list);
1710 } 1817 }
1711 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1818 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1819 ret = 0;
1712 } else { 1820 } else {
1713 return __raid56_parity_write(rbio); 1821 ret = __raid56_parity_write(rbio);
1822 if (ret)
1823 btrfs_bio_counter_dec(root->fs_info);
1714 } 1824 }
1715 return 0; 1825 return ret;
1716} 1826}
1717 1827
1718/* 1828/*
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1730 int err; 1840 int err;
1731 int i; 1841 int i;
1732 1842
1733 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), 1843 pointers = kzalloc(rbio->real_stripes * sizeof(void *),
1734 GFP_NOFS); 1844 GFP_NOFS);
1735 if (!pointers) { 1845 if (!pointers) {
1736 err = -ENOMEM; 1846 err = -ENOMEM;
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1740 faila = rbio->faila; 1850 faila = rbio->faila;
1741 failb = rbio->failb; 1851 failb = rbio->failb;
1742 1852
1743 if (rbio->read_rebuild) { 1853 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1744 spin_lock_irq(&rbio->bio_list_lock); 1854 spin_lock_irq(&rbio->bio_list_lock);
1745 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1855 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1746 spin_unlock_irq(&rbio->bio_list_lock); 1856 spin_unlock_irq(&rbio->bio_list_lock);
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1749 index_rbio_pages(rbio); 1859 index_rbio_pages(rbio);
1750 1860
1751 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1861 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1862 /*
1863 * Now we just use bitmap to mark the horizontal stripes in
1864 * which we have data when doing parity scrub.
1865 */
1866 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1867 !test_bit(pagenr, rbio->dbitmap))
1868 continue;
1869
1752 /* setup our array of pointers with pages 1870 /* setup our array of pointers with pages
1753 * from each stripe 1871 * from each stripe
1754 */ 1872 */
1755 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1873 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1756 /* 1874 /*
1757 * if we're rebuilding a read, we have to use 1875 * if we're rebuilding a read, we have to use
1758 * pages from the bio list 1876 * pages from the bio list
1759 */ 1877 */
1760 if (rbio->read_rebuild && 1878 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1761 (stripe == faila || stripe == failb)) { 1879 (stripe == faila || stripe == failb)) {
1762 page = page_in_rbio(rbio, stripe, pagenr, 0); 1880 page = page_in_rbio(rbio, stripe, pagenr, 0);
1763 } else { 1881 } else {
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1767 } 1885 }
1768 1886
1769 /* all raid6 handling here */ 1887 /* all raid6 handling here */
1770 if (rbio->raid_map[rbio->bbio->num_stripes - 1] == 1888 if (rbio->raid_map[rbio->real_stripes - 1] ==
1771 RAID6_Q_STRIPE) { 1889 RAID6_Q_STRIPE) {
1772 1890
1773 /* 1891 /*
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1817 } 1935 }
1818 1936
1819 if (rbio->raid_map[failb] == RAID5_P_STRIPE) { 1937 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1820 raid6_datap_recov(rbio->bbio->num_stripes, 1938 raid6_datap_recov(rbio->real_stripes,
1821 PAGE_SIZE, faila, pointers); 1939 PAGE_SIZE, faila, pointers);
1822 } else { 1940 } else {
1823 raid6_2data_recov(rbio->bbio->num_stripes, 1941 raid6_2data_recov(rbio->real_stripes,
1824 PAGE_SIZE, faila, failb, 1942 PAGE_SIZE, faila, failb,
1825 pointers); 1943 pointers);
1826 } 1944 }
@@ -1850,7 +1968,7 @@ pstripe:
1850 * know they can be trusted. If this was a read reconstruction, 1968 * know they can be trusted. If this was a read reconstruction,
1851 * other endio functions will fiddle the uptodate bits 1969 * other endio functions will fiddle the uptodate bits
1852 */ 1970 */
1853 if (!rbio->read_rebuild) { 1971 if (rbio->operation == BTRFS_RBIO_WRITE) {
1854 for (i = 0; i < nr_pages; i++) { 1972 for (i = 0; i < nr_pages; i++) {
1855 if (faila != -1) { 1973 if (faila != -1) {
1856 page = rbio_stripe_page(rbio, faila, i); 1974 page = rbio_stripe_page(rbio, faila, i);
@@ -1862,12 +1980,12 @@ pstripe:
1862 } 1980 }
1863 } 1981 }
1864 } 1982 }
1865 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1983 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1866 /* 1984 /*
1867 * if we're rebuilding a read, we have to use 1985 * if we're rebuilding a read, we have to use
1868 * pages from the bio list 1986 * pages from the bio list
1869 */ 1987 */
1870 if (rbio->read_rebuild && 1988 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1871 (stripe == faila || stripe == failb)) { 1989 (stripe == faila || stripe == failb)) {
1872 page = page_in_rbio(rbio, stripe, pagenr, 0); 1990 page = page_in_rbio(rbio, stripe, pagenr, 0);
1873 } else { 1991 } else {
@@ -1882,9 +2000,9 @@ cleanup:
1882 kfree(pointers); 2000 kfree(pointers);
1883 2001
1884cleanup_io: 2002cleanup_io:
1885 2003 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1886 if (rbio->read_rebuild) { 2004 if (err == 0 &&
1887 if (err == 0) 2005 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
1888 cache_rbio_pages(rbio); 2006 cache_rbio_pages(rbio);
1889 else 2007 else
1890 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2008 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -1893,7 +2011,13 @@ cleanup_io:
1893 } else if (err == 0) { 2011 } else if (err == 0) {
1894 rbio->faila = -1; 2012 rbio->faila = -1;
1895 rbio->failb = -1; 2013 rbio->failb = -1;
1896 finish_rmw(rbio); 2014
2015 if (rbio->operation == BTRFS_RBIO_WRITE)
2016 finish_rmw(rbio);
2017 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2018 finish_parity_scrub(rbio, 0);
2019 else
2020 BUG();
1897 } else { 2021 } else {
1898 rbio_orig_end_io(rbio, err, 0); 2022 rbio_orig_end_io(rbio, err, 0);
1899 } 2023 }
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err)
1917 set_bio_pages_uptodate(bio); 2041 set_bio_pages_uptodate(bio);
1918 bio_put(bio); 2042 bio_put(bio);
1919 2043
1920 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 2044 if (!atomic_dec_and_test(&rbio->stripes_pending))
1921 return; 2045 return;
1922 2046
1923 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 2047 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1924 rbio_orig_end_io(rbio, -EIO, 0); 2048 rbio_orig_end_io(rbio, -EIO, 0);
1925 else 2049 else
1926 __raid_recover_end_io(rbio); 2050 __raid_recover_end_io(rbio);
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err)
1937static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2061static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1938{ 2062{
1939 int bios_to_read = 0; 2063 int bios_to_read = 0;
1940 struct btrfs_bio *bbio = rbio->bbio;
1941 struct bio_list bio_list; 2064 struct bio_list bio_list;
1942 int ret; 2065 int ret;
1943 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 2066 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1951 if (ret) 2074 if (ret)
1952 goto cleanup; 2075 goto cleanup;
1953 2076
1954 atomic_set(&rbio->bbio->error, 0); 2077 atomic_set(&rbio->error, 0);
1955 2078
1956 /* 2079 /*
1957 * read everything that hasn't failed. Thanks to the 2080 * read everything that hasn't failed. Thanks to the
1958 * stripe cache, it is possible that some or all of these 2081 * stripe cache, it is possible that some or all of these
1959 * pages are going to be uptodate. 2082 * pages are going to be uptodate.
1960 */ 2083 */
1961 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 2084 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1962 if (rbio->faila == stripe || rbio->failb == stripe) { 2085 if (rbio->faila == stripe || rbio->failb == stripe) {
1963 atomic_inc(&rbio->bbio->error); 2086 atomic_inc(&rbio->error);
1964 continue; 2087 continue;
1965 } 2088 }
1966 2089
@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1990 * were up to date, or we might have no bios to read because 2113 * were up to date, or we might have no bios to read because
1991 * the devices were gone. 2114 * the devices were gone.
1992 */ 2115 */
1993 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { 2116 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
1994 __raid_recover_end_io(rbio); 2117 __raid_recover_end_io(rbio);
1995 goto out; 2118 goto out;
1996 } else { 2119 } else {
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2002 * the bbio may be freed once we submit the last bio. Make sure 2125 * the bbio may be freed once we submit the last bio. Make sure
2003 * not to touch it after that 2126 * not to touch it after that
2004 */ 2127 */
2005 atomic_set(&bbio->stripes_pending, bios_to_read); 2128 atomic_set(&rbio->stripes_pending, bios_to_read);
2006 while (1) { 2129 while (1) {
2007 bio = bio_list_pop(&bio_list); 2130 bio = bio_list_pop(&bio_list);
2008 if (!bio) 2131 if (!bio)
@@ -2021,7 +2144,7 @@ out:
2021 return 0; 2144 return 0;
2022 2145
2023cleanup: 2146cleanup:
2024 if (rbio->read_rebuild) 2147 if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
2025 rbio_orig_end_io(rbio, -EIO, 0); 2148 rbio_orig_end_io(rbio, -EIO, 0);
2026 return -EIO; 2149 return -EIO;
2027} 2150}
@@ -2034,34 +2157,42 @@ cleanup:
2034 */ 2157 */
2035int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 2158int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2036 struct btrfs_bio *bbio, u64 *raid_map, 2159 struct btrfs_bio *bbio, u64 *raid_map,
2037 u64 stripe_len, int mirror_num) 2160 u64 stripe_len, int mirror_num, int generic_io)
2038{ 2161{
2039 struct btrfs_raid_bio *rbio; 2162 struct btrfs_raid_bio *rbio;
2040 int ret; 2163 int ret;
2041 2164
2042 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2165 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2043 if (IS_ERR(rbio)) 2166 if (IS_ERR(rbio)) {
2167 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2044 return PTR_ERR(rbio); 2168 return PTR_ERR(rbio);
2169 }
2045 2170
2046 rbio->read_rebuild = 1; 2171 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2047 bio_list_add(&rbio->bio_list, bio); 2172 bio_list_add(&rbio->bio_list, bio);
2048 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2173 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2049 2174
2050 rbio->faila = find_logical_bio_stripe(rbio, bio); 2175 rbio->faila = find_logical_bio_stripe(rbio, bio);
2051 if (rbio->faila == -1) { 2176 if (rbio->faila == -1) {
2052 BUG(); 2177 BUG();
2053 kfree(raid_map); 2178 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2054 kfree(bbio);
2055 kfree(rbio); 2179 kfree(rbio);
2056 return -EIO; 2180 return -EIO;
2057 } 2181 }
2058 2182
2183 if (generic_io) {
2184 btrfs_bio_counter_inc_noblocked(root->fs_info);
2185 rbio->generic_bio_cnt = 1;
2186 } else {
2187 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
2188 }
2189
2059 /* 2190 /*
2060 * reconstruct from the q stripe if they are 2191 * reconstruct from the q stripe if they are
2061 * asking for mirror 3 2192 * asking for mirror 3
2062 */ 2193 */
2063 if (mirror_num == 3) 2194 if (mirror_num == 3)
2064 rbio->failb = bbio->num_stripes - 2; 2195 rbio->failb = rbio->real_stripes - 2;
2065 2196
2066 ret = lock_stripe_add(rbio); 2197 ret = lock_stripe_add(rbio);
2067 2198
@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work)
2098 rbio = container_of(work, struct btrfs_raid_bio, work); 2229 rbio = container_of(work, struct btrfs_raid_bio, work);
2099 __raid56_parity_recover(rbio); 2230 __raid56_parity_recover(rbio);
2100} 2231}
2232
/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Note: we need to make sure that all the pages added to the scrub/replace
 * raid bio are correct and will not change during the scrub/replace; that
 * is, those pages hold only metadata or file data protected by a checksum.
 */

/*
 * Allocate a raid bio for scrubbing the parity stripe of @scrub_dev.
 *
 * @bio:             an empty carrier bio; it only holds the completion
 *                   handler so the scrub rbio looks like the other types
 * @dbitmap:         marks which sectors of the stripe carry data and
 *                   therefore need their parity checked
 * @stripe_nsectors: number of sectors per stripe (must match the rbio)
 *
 * Returns the new rbio, or NULL if allocation failed.
 */
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
			       struct btrfs_bio *bbio, u64 *raid_map,
			       u64 stripe_len, struct btrfs_device *scrub_dev,
			       unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio))
		/*
		 * NOTE(review): bbio/raid_map are not freed on this path --
		 * presumably the caller still owns them here; confirm at the
		 * call sites.
		 */
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/* remember which stripe index belongs to the device being scrubbed */
	for (i = 0; i < rbio->real_stripes; i++) {
		if (bbio->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}

	/* Now we just support the sectorsize equals to page size */
	ASSERT(root->sectorsize == PAGE_SIZE);
	ASSERT(rbio->stripe_npages == stripe_nsectors);
	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);

	return rbio;
}
2275
2276void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2277 struct page *page, u64 logical)
2278{
2279 int stripe_offset;
2280 int index;
2281
2282 ASSERT(logical >= rbio->raid_map[0]);
2283 ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
2284 rbio->stripe_len * rbio->nr_data);
2285 stripe_offset = (int)(logical - rbio->raid_map[0]);
2286 index = stripe_offset >> PAGE_CACHE_SHIFT;
2287 rbio->bio_pages[index] = page;
2288}
2289
2290/*
2291 * We just scrub the parity that we have correct data on the same horizontal,
2292 * so we needn't allocate all pages for all the stripes.
2293 */
2294static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2295{
2296 int i;
2297 int bit;
2298 int index;
2299 struct page *page;
2300
2301 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2302 for (i = 0; i < rbio->real_stripes; i++) {
2303 index = i * rbio->stripe_npages + bit;
2304 if (rbio->stripe_pages[index])
2305 continue;
2306
2307 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2308 if (!page)
2309 return -ENOMEM;
2310 rbio->stripe_pages[index] = page;
2311 ClearPageUptodate(page);
2312 }
2313 }
2314 return 0;
2315}
2316
/*
 * end_io handler for the parity write-back issued by finish_parity_scrub.
 * When the last outstanding write bio completes, the whole rbio is done.
 */
static void raid_write_parity_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/* record which stripe failed so max_errors accounting works */
	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	/* wait for the last outstanding write before completing the rbio */
	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = 0;

	/* any recorded stripe failure turns the whole operation into -EIO */
	if (atomic_read(&rbio->error))
		err = -EIO;

	rbio_orig_end_io(rbio, err, 0);
}
2340
/*
 * Verify (and, where wrong, rewrite) the parity of every stripe sector
 * marked in rbio->dbitmap.
 *
 * @need_check: 0 means the parity pages were just reconstructed by the
 *              recovery path and can be written back without re-comparing;
 *              1 means recompute P (and Q for RAID6) from the data pages
 *              and compare against what is on disk.
 *
 * Sectors whose on-disk parity already matches are cleared from dbitmap
 * so they are not written back.  For dev-replace, the repaired parity is
 * additionally written to the replacement target device.
 */
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void *pointers[rbio->real_stripes];
	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct page *p_page = NULL;
	struct page *q_page = NULL;
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	/* one trailing parity stripe is RAID5, two is RAID6 */
	if (rbio->real_stripes - rbio->nr_data == 1) {
		p_stripe = rbio->real_stripes - 1;
	} else if (rbio->real_stripes - rbio->nr_data == 2) {
		p_stripe = rbio->real_stripes - 2;
		q_stripe = rbio->real_stripes - 1;
	} else {
		BUG();
	}

	/*
	 * dev-replace: keep a copy of dbitmap, because the main loop below
	 * clears bits for parity that turned out to be correct, while the
	 * target device must receive every sector we checked.
	 */
	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
	}

	/*
	 * Because the higher layers(scrubber) are unlikely to
	 * use this area of the disk again soon, so don't cache
	 * it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	/* scratch pages to hold the recomputed P (and Q) for comparison */
	p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!p_page)
		goto cleanup;
	SetPageUptodate(p_page);

	if (q_stripe != -1) {
		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!q_page) {
			__free_page(p_page);
			goto cleanup;
		}
		SetPageUptodate(q_page);
	}

	atomic_set(&rbio->error, 0);

	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *p;
		void *parity;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		pointers[stripe++] = kmap(p_page);

		if (q_stripe != -1) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			pointers[stripe++] = kmap(q_page);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5: P is simply the xor of the data pages */
			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
		}

		/* Check the scrubbed parity and repair it if it differs */
		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		parity = kmap(p);
		if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
			memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
		else
			/* Parity is right, needn't writeback */
			bitmap_clear(rbio->dbitmap, pagenr, 1);
		kunmap(p);

		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	__free_page(p_page);
	if (q_page)
		__free_page(q_page);

writeback:
	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list,
			       page, rbio->scrubp, pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	/* dev-replace: also write the checked parity to the target device */
	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list, page,
				       bbio->tgtdev_map[rbio->scrubp],
				       pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* Every parity is right */
		rbio_orig_end_io(rbio, 0, 0);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_parity_end_io;
		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(WRITE, bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}
2502
2503static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2504{
2505 if (stripe >= 0 && stripe < rbio->nr_data)
2506 return 1;
2507 return 0;
2508}
2509
/*
 * While we're doing the parity check and repair, we could have errors
 * in reading pages off the disk.  This checks for errors and if we're
 * not able to read the page it'll trigger parity reconstruction.  The
 * parity scrub will be finished after we've reconstructed the failed
 * stripes.
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	/* more read failures than the redundancy can tolerate: give up */
	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		int dfail = 0, failp = -1;

		/* count failed data stripes; remember a failed parity stripe */
		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * Because we can not use the parity being scrubbed to repair
		 * the data, the repair capability is reduced by one.
		 * (In the case of RAID5 we can not repair any data stripe.)
		 */
		if (dfail > rbio->bbio->max_errors - 1)
			goto cleanup;

		/*
		 * If all the data is good and only a parity is bad, just
		 * rewrite the parity -- no reconstruction is needed.
		 */
		if (dfail == 0) {
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * Here we got one corrupted data stripe and one corrupted
		 * parity on RAID6.  If the corrupted parity is the one being
		 * scrubbed, luckily we can use the other parity to repair
		 * the data; otherwise we can not repair the data stripe.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		__raid_recover_end_io(rbio);
	} else {
		/* all reads succeeded: verify and repair the parity */
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}
2570
/*
 * end_io for the read phase of the parity scrub.  All the bios here are
 * physical stripe bios we've read from the disk so we can recalculate
 * the parity of the stripe.
 *
 * This will usually kick off validate_rbio_for_parity_scrub() once all
 * the bios are read in, and that may trigger parity reconstruction if
 * we had any errors along the way.
 */
static void raid56_parity_scrub_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/* record the failed stripe, or mark the pages as valid */
	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	/* wait until the last outstanding read has completed */
	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	/*
	 * this will normally start the parity check and write-back, but
	 * if there are any failed stripes we'll reconstruct from parity
	 * first
	 */
	validate_rbio_for_parity_scrub(rbio);
}
2600
/*
 * Read phase of the parity scrub: build and submit read bios for every
 * page of the full stripe that is marked in dbitmap and is not already
 * present/uptodate, then let the end_io handler kick off the parity
 * check (or reconstruction) once everything is in.
 */
static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	/* only allocate pages for the sectors we actually need to check */
	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	bio_list_init(&bio_list);

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid56_parity_scrub_end_io;

		/* punt the end_io to a helper thread, not irq context */
		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
	/* the actual write will happen once the reads are done */
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}
2689
2690static void scrub_parity_work(struct btrfs_work *work)
2691{
2692 struct btrfs_raid_bio *rbio;
2693
2694 rbio = container_of(work, struct btrfs_raid_bio, work);
2695 raid56_parity_scrub_stripe(rbio);
2696}
2697
2698static void async_scrub_parity(struct btrfs_raid_bio *rbio)
2699{
2700 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
2701 scrub_parity_work, NULL, NULL);
2702
2703 btrfs_queue_work(rbio->fs_info->rmw_workers,
2704 &rbio->work);
2705}
2706
/*
 * Submit a scrub rbio.  If somebody else already holds the stripe lock,
 * lock_stripe_add() queues us behind them and they will run us later;
 * otherwise kick off the scrub asynchronously ourselves.
 */
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	int queued_behind_holder = lock_stripe_add(rbio);

	if (!queued_behind_holder)
		async_scrub_parity(rbio);
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index ea5d73bfdfbe..31d4a157b5e3 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -39,13 +39,25 @@ static inline int nr_data_stripes(struct map_lookup *map)
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ 39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE)) 40 ((x) == RAID6_Q_STRIPE))
41 41
42struct btrfs_raid_bio;
43struct btrfs_device;
44
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 45int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map, 46 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num); 47 u64 stripe_len, int mirror_num, int generic_io);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 48int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map, 49 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len); 50 u64 stripe_len);
48 51
52struct btrfs_raid_bio *
53raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
54 struct btrfs_bio *bbio, u64 *raid_map,
55 u64 stripe_len, struct btrfs_device *scrub_dev,
56 unsigned long *dbitmap, int stripe_nsectors);
57void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
58 struct page *page, u64 logical);
59void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
60
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); 61int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); 62void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif 63#endif
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa083113827..f2bb13a23f86 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -63,10 +63,18 @@ struct scrub_ctx;
63 */ 63 */
64#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 64#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
65 65
/*
 * Refcounted block-mapping info shared by all the pages of one recheck
 * block; the last scrub_put_recover() frees it together with the bbio
 * and raid_map it owns.
 */
struct scrub_recover {
	atomic_t		refs;	/* dropped via scrub_put_recover() */
	struct btrfs_bio	*bbio;	/* mapping from btrfs_map_sblock() */
	u64			*raid_map;	/* non-NULL only for RAID5/6 */
	u64			map_length;	/* length covered by bbio */
};
72
66struct scrub_page { 73struct scrub_page {
67 struct scrub_block *sblock; 74 struct scrub_block *sblock;
68 struct page *page; 75 struct page *page;
69 struct btrfs_device *dev; 76 struct btrfs_device *dev;
77 struct list_head list;
70 u64 flags; /* extent flags */ 78 u64 flags; /* extent flags */
71 u64 generation; 79 u64 generation;
72 u64 logical; 80 u64 logical;
@@ -79,6 +87,8 @@ struct scrub_page {
79 unsigned int io_error:1; 87 unsigned int io_error:1;
80 }; 88 };
81 u8 csum[BTRFS_CSUM_SIZE]; 89 u8 csum[BTRFS_CSUM_SIZE];
90
91 struct scrub_recover *recover;
82}; 92};
83 93
84struct scrub_bio { 94struct scrub_bio {
@@ -105,14 +115,52 @@ struct scrub_block {
105 atomic_t outstanding_pages; 115 atomic_t outstanding_pages;
106 atomic_t ref_count; /* free mem on transition to zero */ 116 atomic_t ref_count; /* free mem on transition to zero */
107 struct scrub_ctx *sctx; 117 struct scrub_ctx *sctx;
118 struct scrub_parity *sparity;
108 struct { 119 struct {
109 unsigned int header_error:1; 120 unsigned int header_error:1;
110 unsigned int checksum_error:1; 121 unsigned int checksum_error:1;
111 unsigned int no_io_error_seen:1; 122 unsigned int no_io_error_seen:1;
112 unsigned int generation_error:1; /* also sets header_error */ 123 unsigned int generation_error:1; /* also sets header_error */
124
125 /* The following is for the data used to check parity */
126 /* It is for the data with checksum */
127 unsigned int data_corrected:1;
113 }; 128 };
114}; 129};
115 130
/* Used for the chunks with parity stripes such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	/* device whose parity stripe is being scrubbed */
	struct btrfs_device	*scrub_dev;

	/* logical range of the full stripe being checked */
	u64			logic_start;

	u64			logic_end;

	/* number of sectors in one stripe */
	int			nsectors;

	int			stripe_len;

	/* free the structure when this drops to zero */
	atomic_t		ref_count;

	/* scrub_pages belonging to this parity run */
	struct list_head	spages;

	/* Work of parity check and repair */
	struct btrfs_work	work;

	/* Mark the parity blocks which have data */
	unsigned long		*dbitmap;

	/*
	 * Mark the parity blocks which have data, but errors happen when
	 * read data or check data
	 */
	unsigned long		*ebitmap;

	/*
	 * trailing storage -- presumably dbitmap and ebitmap point into
	 * this flexible area; TODO confirm at the allocation site
	 */
	unsigned long		bitmap[0];
};
163
116struct scrub_wr_ctx { 164struct scrub_wr_ctx {
117 struct scrub_bio *wr_curr_bio; 165 struct scrub_bio *wr_curr_bio;
118 struct btrfs_device *tgtdev; 166 struct btrfs_device *tgtdev;
@@ -196,7 +244,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
196static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 244static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
197 struct scrub_block *sblock, int is_metadata, 245 struct scrub_block *sblock, int is_metadata,
198 int have_csum, u8 *csum, u64 generation, 246 int have_csum, u8 *csum, u64 generation,
199 u16 csum_size); 247 u16 csum_size, int retry_failed_mirror);
200static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 248static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
201 struct scrub_block *sblock, 249 struct scrub_block *sblock,
202 int is_metadata, int have_csum, 250 int is_metadata, int have_csum,
@@ -218,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock);
218static void scrub_block_put(struct scrub_block *sblock); 266static void scrub_block_put(struct scrub_block *sblock);
219static void scrub_page_get(struct scrub_page *spage); 267static void scrub_page_get(struct scrub_page *spage);
220static void scrub_page_put(struct scrub_page *spage); 268static void scrub_page_put(struct scrub_page *spage);
269static void scrub_parity_get(struct scrub_parity *sparity);
270static void scrub_parity_put(struct scrub_parity *sparity);
221static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 271static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
222 struct scrub_page *spage); 272 struct scrub_page *spage);
223static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 273static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -790,6 +840,20 @@ out:
790 scrub_pending_trans_workers_dec(sctx); 840 scrub_pending_trans_workers_dec(sctx);
791} 841}
792 842
/* Take an extra reference on the shared recover info. */
static inline void scrub_get_recover(struct scrub_recover *recover)
{
	atomic_inc(&recover->refs);
}
847
/*
 * Drop a reference on the recover info; the last put frees it together
 * with the bbio and raid_map it owns.
 */
static inline void scrub_put_recover(struct scrub_recover *recover)
{
	if (atomic_dec_and_test(&recover->refs)) {
		kfree(recover->bbio);
		kfree(recover->raid_map);
		kfree(recover);
	}
}
856
793/* 857/*
794 * scrub_handle_errored_block gets called when either verification of the 858 * scrub_handle_errored_block gets called when either verification of the
795 * pages failed or the bio failed to read, e.g. with EIO. In the latter 859 * pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +970,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
906 970
907 /* build and submit the bios for the failed mirror, check checksums */ 971 /* build and submit the bios for the failed mirror, check checksums */
908 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 972 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
909 csum, generation, sctx->csum_size); 973 csum, generation, sctx->csum_size, 1);
910 974
911 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 975 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
912 sblock_bad->no_io_error_seen) { 976 sblock_bad->no_io_error_seen) {
@@ -920,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
920 */ 984 */
921 spin_lock(&sctx->stat_lock); 985 spin_lock(&sctx->stat_lock);
922 sctx->stat.unverified_errors++; 986 sctx->stat.unverified_errors++;
987 sblock_to_check->data_corrected = 1;
923 spin_unlock(&sctx->stat_lock); 988 spin_unlock(&sctx->stat_lock);
924 989
925 if (sctx->is_dev_replace) 990 if (sctx->is_dev_replace)
@@ -1019,7 +1084,7 @@ nodatasum_case:
1019 /* build and submit the bios, check checksums */ 1084 /* build and submit the bios, check checksums */
1020 scrub_recheck_block(fs_info, sblock_other, is_metadata, 1085 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1021 have_csum, csum, generation, 1086 have_csum, csum, generation,
1022 sctx->csum_size); 1087 sctx->csum_size, 0);
1023 1088
1024 if (!sblock_other->header_error && 1089 if (!sblock_other->header_error &&
1025 !sblock_other->checksum_error && 1090 !sblock_other->checksum_error &&
@@ -1169,7 +1234,7 @@ nodatasum_case:
1169 */ 1234 */
1170 scrub_recheck_block(fs_info, sblock_bad, 1235 scrub_recheck_block(fs_info, sblock_bad,
1171 is_metadata, have_csum, csum, 1236 is_metadata, have_csum, csum,
1172 generation, sctx->csum_size); 1237 generation, sctx->csum_size, 1);
1173 if (!sblock_bad->header_error && 1238 if (!sblock_bad->header_error &&
1174 !sblock_bad->checksum_error && 1239 !sblock_bad->checksum_error &&
1175 sblock_bad->no_io_error_seen) 1240 sblock_bad->no_io_error_seen)
@@ -1180,6 +1245,7 @@ nodatasum_case:
1180corrected_error: 1245corrected_error:
1181 spin_lock(&sctx->stat_lock); 1246 spin_lock(&sctx->stat_lock);
1182 sctx->stat.corrected_errors++; 1247 sctx->stat.corrected_errors++;
1248 sblock_to_check->data_corrected = 1;
1183 spin_unlock(&sctx->stat_lock); 1249 spin_unlock(&sctx->stat_lock);
1184 printk_ratelimited_in_rcu(KERN_ERR 1250 printk_ratelimited_in_rcu(KERN_ERR
1185 "BTRFS: fixed up error at logical %llu on dev %s\n", 1251 "BTRFS: fixed up error at logical %llu on dev %s\n",
@@ -1201,11 +1267,18 @@ out:
1201 mirror_index++) { 1267 mirror_index++) {
1202 struct scrub_block *sblock = sblocks_for_recheck + 1268 struct scrub_block *sblock = sblocks_for_recheck +
1203 mirror_index; 1269 mirror_index;
1270 struct scrub_recover *recover;
1204 int page_index; 1271 int page_index;
1205 1272
1206 for (page_index = 0; page_index < sblock->page_count; 1273 for (page_index = 0; page_index < sblock->page_count;
1207 page_index++) { 1274 page_index++) {
1208 sblock->pagev[page_index]->sblock = NULL; 1275 sblock->pagev[page_index]->sblock = NULL;
1276 recover = sblock->pagev[page_index]->recover;
1277 if (recover) {
1278 scrub_put_recover(recover);
1279 sblock->pagev[page_index]->recover =
1280 NULL;
1281 }
1209 scrub_page_put(sblock->pagev[page_index]); 1282 scrub_page_put(sblock->pagev[page_index]);
1210 } 1283 }
1211 } 1284 }
@@ -1215,14 +1288,63 @@ out:
1215 return 0; 1288 return 0;
1216} 1289}
1217 1290
1291static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
1292{
1293 if (raid_map) {
1294 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
1295 return 3;
1296 else
1297 return 2;
1298 } else {
1299 return (int)bbio->num_stripes;
1300 }
1301}
1302
/*
 * Translate a logical address into the stripe index and the byte offset
 * inside that stripe.
 *
 * For RAID5/6 (@raid_map != NULL) the matching data stripe is found by
 * scanning raid_map; P/Q entries are skipped.  For all other profiles
 * the mirror number directly selects the stripe and the offset is 0.
 */
static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
						 u64 mapped_length,
						 int nstripes, int mirror,
						 int *stripe_index,
						 u64 *stripe_offset)
{
	int i;

	if (raid_map) {
		/* RAID5/6 */
		for (i = 0; i < nstripes; i++) {
			if (raid_map[i] == RAID6_Q_STRIPE ||
			    raid_map[i] == RAID5_P_STRIPE)
				continue;

			if (logical >= raid_map[i] &&
			    logical < raid_map[i] + mapped_length)
				break;
		}

		/*
		 * NOTE(review): assumes @logical always falls inside one of
		 * the data stripes; if nothing matched, i == nstripes and
		 * raid_map[i] below reads one past the array -- confirm the
		 * callers guarantee a match.
		 */
		*stripe_index = i;
		*stripe_offset = logical - raid_map[i];
	} else {
		/* The other RAID type */
		*stripe_index = mirror;
		*stripe_offset = 0;
	}
}
1331
1218static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 1332static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1219 struct btrfs_fs_info *fs_info, 1333 struct btrfs_fs_info *fs_info,
1220 struct scrub_block *original_sblock, 1334 struct scrub_block *original_sblock,
1221 u64 length, u64 logical, 1335 u64 length, u64 logical,
1222 struct scrub_block *sblocks_for_recheck) 1336 struct scrub_block *sblocks_for_recheck)
1223{ 1337{
1338 struct scrub_recover *recover;
1339 struct btrfs_bio *bbio;
1340 u64 *raid_map;
1341 u64 sublen;
1342 u64 mapped_length;
1343 u64 stripe_offset;
1344 int stripe_index;
1224 int page_index; 1345 int page_index;
1225 int mirror_index; 1346 int mirror_index;
1347 int nmirrors;
1226 int ret; 1348 int ret;
1227 1349
1228 /* 1350 /*
@@ -1233,23 +1355,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1233 1355
1234 page_index = 0; 1356 page_index = 0;
1235 while (length > 0) { 1357 while (length > 0) {
1236 u64 sublen = min_t(u64, length, PAGE_SIZE); 1358 sublen = min_t(u64, length, PAGE_SIZE);
1237 u64 mapped_length = sublen; 1359 mapped_length = sublen;
1238 struct btrfs_bio *bbio = NULL; 1360 bbio = NULL;
1361 raid_map = NULL;
1239 1362
1240 /* 1363 /*
1241 * with a length of PAGE_SIZE, each returned stripe 1364 * with a length of PAGE_SIZE, each returned stripe
1242 * represents one mirror 1365 * represents one mirror
1243 */ 1366 */
1244 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, 1367 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1245 &mapped_length, &bbio, 0); 1368 &mapped_length, &bbio, 0, &raid_map);
1246 if (ret || !bbio || mapped_length < sublen) { 1369 if (ret || !bbio || mapped_length < sublen) {
1247 kfree(bbio); 1370 kfree(bbio);
1371 kfree(raid_map);
1248 return -EIO; 1372 return -EIO;
1249 } 1373 }
1250 1374
1375 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1376 if (!recover) {
1377 kfree(bbio);
1378 kfree(raid_map);
1379 return -ENOMEM;
1380 }
1381
1382 atomic_set(&recover->refs, 1);
1383 recover->bbio = bbio;
1384 recover->raid_map = raid_map;
1385 recover->map_length = mapped_length;
1386
1251 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); 1387 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1252 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1388
1389 nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
1390 for (mirror_index = 0; mirror_index < nmirrors;
1253 mirror_index++) { 1391 mirror_index++) {
1254 struct scrub_block *sblock; 1392 struct scrub_block *sblock;
1255 struct scrub_page *page; 1393 struct scrub_page *page;
@@ -1265,26 +1403,38 @@ leave_nomem:
1265 spin_lock(&sctx->stat_lock); 1403 spin_lock(&sctx->stat_lock);
1266 sctx->stat.malloc_errors++; 1404 sctx->stat.malloc_errors++;
1267 spin_unlock(&sctx->stat_lock); 1405 spin_unlock(&sctx->stat_lock);
1268 kfree(bbio); 1406 scrub_put_recover(recover);
1269 return -ENOMEM; 1407 return -ENOMEM;
1270 } 1408 }
1271 scrub_page_get(page); 1409 scrub_page_get(page);
1272 sblock->pagev[page_index] = page; 1410 sblock->pagev[page_index] = page;
1273 page->logical = logical; 1411 page->logical = logical;
1274 page->physical = bbio->stripes[mirror_index].physical; 1412
1413 scrub_stripe_index_and_offset(logical, raid_map,
1414 mapped_length,
1415 bbio->num_stripes,
1416 mirror_index,
1417 &stripe_index,
1418 &stripe_offset);
1419 page->physical = bbio->stripes[stripe_index].physical +
1420 stripe_offset;
1421 page->dev = bbio->stripes[stripe_index].dev;
1422
1275 BUG_ON(page_index >= original_sblock->page_count); 1423 BUG_ON(page_index >= original_sblock->page_count);
1276 page->physical_for_dev_replace = 1424 page->physical_for_dev_replace =
1277 original_sblock->pagev[page_index]-> 1425 original_sblock->pagev[page_index]->
1278 physical_for_dev_replace; 1426 physical_for_dev_replace;
1279 /* for missing devices, dev->bdev is NULL */ 1427 /* for missing devices, dev->bdev is NULL */
1280 page->dev = bbio->stripes[mirror_index].dev;
1281 page->mirror_num = mirror_index + 1; 1428 page->mirror_num = mirror_index + 1;
1282 sblock->page_count++; 1429 sblock->page_count++;
1283 page->page = alloc_page(GFP_NOFS); 1430 page->page = alloc_page(GFP_NOFS);
1284 if (!page->page) 1431 if (!page->page)
1285 goto leave_nomem; 1432 goto leave_nomem;
1433
1434 scrub_get_recover(recover);
1435 page->recover = recover;
1286 } 1436 }
1287 kfree(bbio); 1437 scrub_put_recover(recover);
1288 length -= sublen; 1438 length -= sublen;
1289 logical += sublen; 1439 logical += sublen;
1290 page_index++; 1440 page_index++;
@@ -1293,6 +1443,51 @@ leave_nomem:
1293 return 0; 1443 return 0;
1294} 1444}
1295 1445
/* Completion cookie for synchronous raid56 reads (scrub_submit_raid56_bio_wait). */
struct scrub_bio_ret {
	struct completion event;	/* signalled by the bio end_io */
	int error;			/* bio result, 0 on success */
};
1450
/* bio end_io: record the result, then wake up the waiting scrubber. */
static void scrub_bio_wait_endio(struct bio *bio, int error)
{
	struct scrub_bio_ret *ret = bio->bi_private;

	/* the error must be stored before complete() releases the waiter */
	ret->error = error;
	complete(&ret->event);
}
1458
1459static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1460{
1461 return page->recover && page->recover->raid_map;
1462}
1463
/*
 * Read one page synchronously through the RAID5/6 recovery path, so a
 * bad copy is rebuilt from the remaining stripes instead of being read
 * directly.  Returns 0 on success, a negative errno if submission
 * failed, or -EIO if the rebuild/read itself failed.
 */
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
					struct bio *bio,
					struct scrub_page *page)
{
	struct scrub_bio_ret done;
	int ret;

	init_completion(&done.event);
	done.error = 0;
	/* the raid56 code works on logical addresses, not physical */
	bio->bi_iter.bi_sector = page->logical >> 9;
	bio->bi_private = &done;
	bio->bi_end_io = scrub_bio_wait_endio;

	/*
	 * generic_io == 0: the scrub_recover keeps bbio/raid_map alive,
	 * so the raid56 code must not free them on completion.
	 */
	ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
				    page->recover->raid_map,
				    page->recover->map_length,
				    page->mirror_num, 0);
	if (ret)
		return ret;

	/* block until the asynchronous rebuild + read has finished */
	wait_for_completion(&done.event);
	if (done.error)
		return -EIO;

	return 0;
}
1490
1296/* 1491/*
1297 * this function will check the on disk data for checksum errors, header 1492 * this function will check the on disk data for checksum errors, header
1298 * errors and read I/O errors. If any I/O errors happen, the exact pages 1493 * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1498,7 @@ leave_nomem:
1303static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 1498static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1304 struct scrub_block *sblock, int is_metadata, 1499 struct scrub_block *sblock, int is_metadata,
1305 int have_csum, u8 *csum, u64 generation, 1500 int have_csum, u8 *csum, u64 generation,
1306 u16 csum_size) 1501 u16 csum_size, int retry_failed_mirror)
1307{ 1502{
1308 int page_num; 1503 int page_num;
1309 1504
@@ -1329,11 +1524,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1329 continue; 1524 continue;
1330 } 1525 }
1331 bio->bi_bdev = page->dev->bdev; 1526 bio->bi_bdev = page->dev->bdev;
1332 bio->bi_iter.bi_sector = page->physical >> 9;
1333 1527
1334 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1528 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1335 if (btrfsic_submit_bio_wait(READ, bio)) 1529 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1336 sblock->no_io_error_seen = 0; 1530 if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1531 sblock->no_io_error_seen = 0;
1532 } else {
1533 bio->bi_iter.bi_sector = page->physical >> 9;
1534
1535 if (btrfsic_submit_bio_wait(READ, bio))
1536 sblock->no_io_error_seen = 0;
1537 }
1337 1538
1338 bio_put(bio); 1539 bio_put(bio);
1339 } 1540 }
@@ -1486,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1486{ 1687{
1487 int page_num; 1688 int page_num;
1488 1689
1690 /*
1691 * This block is used for the check of the parity on the source device,
1692 * so the data needn't be written into the destination device.
1693 */
1694 if (sblock->sparity)
1695 return;
1696
1489 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1697 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1490 int ret; 1698 int ret;
1491 1699
@@ -1867,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock)
1867 if (atomic_dec_and_test(&sblock->ref_count)) { 2075 if (atomic_dec_and_test(&sblock->ref_count)) {
1868 int i; 2076 int i;
1869 2077
2078 if (sblock->sparity)
2079 scrub_parity_put(sblock->sparity);
2080
1870 for (i = 0; i < sblock->page_count; i++) 2081 for (i = 0; i < sblock->page_count; i++)
1871 scrub_page_put(sblock->pagev[i]); 2082 scrub_page_put(sblock->pagev[i]);
1872 kfree(sblock); 2083 kfree(sblock);
@@ -2124,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
2124 scrub_pending_bio_dec(sctx); 2335 scrub_pending_bio_dec(sctx);
2125} 2336}
2126 2337
2338static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2339 unsigned long *bitmap,
2340 u64 start, u64 len)
2341{
2342 int offset;
2343 int nsectors;
2344 int sectorsize = sparity->sctx->dev_root->sectorsize;
2345
2346 if (len >= sparity->stripe_len) {
2347 bitmap_set(bitmap, 0, sparity->nsectors);
2348 return;
2349 }
2350
2351 start -= sparity->logic_start;
2352 offset = (int)do_div(start, sparity->stripe_len);
2353 offset /= sectorsize;
2354 nsectors = (int)len / sectorsize;
2355
2356 if (offset + nsectors <= sparity->nsectors) {
2357 bitmap_set(bitmap, offset, nsectors);
2358 return;
2359 }
2360
2361 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2362 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2363}
2364
2365static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2366 u64 start, u64 len)
2367{
2368 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2369}
2370
2371static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2372 u64 start, u64 len)
2373{
2374 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2375}
2376
2127static void scrub_block_complete(struct scrub_block *sblock) 2377static void scrub_block_complete(struct scrub_block *sblock)
2128{ 2378{
2379 int corrupted = 0;
2380
2129 if (!sblock->no_io_error_seen) { 2381 if (!sblock->no_io_error_seen) {
2382 corrupted = 1;
2130 scrub_handle_errored_block(sblock); 2383 scrub_handle_errored_block(sblock);
2131 } else { 2384 } else {
2132 /* 2385 /*
@@ -2134,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock)
2134 * dev replace case, otherwise write here in dev replace 2387 * dev replace case, otherwise write here in dev replace
2135 * case. 2388 * case.
2136 */ 2389 */
2137 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) 2390 corrupted = scrub_checksum(sblock);
2391 if (!corrupted && sblock->sctx->is_dev_replace)
2138 scrub_write_block_to_dev_replace(sblock); 2392 scrub_write_block_to_dev_replace(sblock);
2139 } 2393 }
2394
2395 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2396 u64 start = sblock->pagev[0]->logical;
2397 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2398 PAGE_SIZE;
2399
2400 scrub_parity_mark_sectors_error(sblock->sparity,
2401 start, end - start);
2402 }
2140} 2403}
2141 2404
2142static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, 2405static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -2228,6 +2491,132 @@ behind_scrub_pages:
2228 return 0; 2491 return 0;
2229} 2492}
2230 2493
2494static int scrub_pages_for_parity(struct scrub_parity *sparity,
2495 u64 logical, u64 len,
2496 u64 physical, struct btrfs_device *dev,
2497 u64 flags, u64 gen, int mirror_num, u8 *csum)
2498{
2499 struct scrub_ctx *sctx = sparity->sctx;
2500 struct scrub_block *sblock;
2501 int index;
2502
2503 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2504 if (!sblock) {
2505 spin_lock(&sctx->stat_lock);
2506 sctx->stat.malloc_errors++;
2507 spin_unlock(&sctx->stat_lock);
2508 return -ENOMEM;
2509 }
2510
2511 /* one ref inside this function, plus one for each page added to
2512 * a bio later on */
2513 atomic_set(&sblock->ref_count, 1);
2514 sblock->sctx = sctx;
2515 sblock->no_io_error_seen = 1;
2516 sblock->sparity = sparity;
2517 scrub_parity_get(sparity);
2518
2519 for (index = 0; len > 0; index++) {
2520 struct scrub_page *spage;
2521 u64 l = min_t(u64, len, PAGE_SIZE);
2522
2523 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2524 if (!spage) {
2525leave_nomem:
2526 spin_lock(&sctx->stat_lock);
2527 sctx->stat.malloc_errors++;
2528 spin_unlock(&sctx->stat_lock);
2529 scrub_block_put(sblock);
2530 return -ENOMEM;
2531 }
2532 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2533 /* For scrub block */
2534 scrub_page_get(spage);
2535 sblock->pagev[index] = spage;
2536 /* For scrub parity */
2537 scrub_page_get(spage);
2538 list_add_tail(&spage->list, &sparity->spages);
2539 spage->sblock = sblock;
2540 spage->dev = dev;
2541 spage->flags = flags;
2542 spage->generation = gen;
2543 spage->logical = logical;
2544 spage->physical = physical;
2545 spage->mirror_num = mirror_num;
2546 if (csum) {
2547 spage->have_csum = 1;
2548 memcpy(spage->csum, csum, sctx->csum_size);
2549 } else {
2550 spage->have_csum = 0;
2551 }
2552 sblock->page_count++;
2553 spage->page = alloc_page(GFP_NOFS);
2554 if (!spage->page)
2555 goto leave_nomem;
2556 len -= l;
2557 logical += l;
2558 physical += l;
2559 }
2560
2561 WARN_ON(sblock->page_count == 0);
2562 for (index = 0; index < sblock->page_count; index++) {
2563 struct scrub_page *spage = sblock->pagev[index];
2564 int ret;
2565
2566 ret = scrub_add_page_to_rd_bio(sctx, spage);
2567 if (ret) {
2568 scrub_block_put(sblock);
2569 return ret;
2570 }
2571 }
2572
2573 /* last one frees, either here or in bio completion for last page */
2574 scrub_block_put(sblock);
2575 return 0;
2576}
2577
2578static int scrub_extent_for_parity(struct scrub_parity *sparity,
2579 u64 logical, u64 len,
2580 u64 physical, struct btrfs_device *dev,
2581 u64 flags, u64 gen, int mirror_num)
2582{
2583 struct scrub_ctx *sctx = sparity->sctx;
2584 int ret;
2585 u8 csum[BTRFS_CSUM_SIZE];
2586 u32 blocksize;
2587
2588 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2589 blocksize = sctx->sectorsize;
2590 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2591 blocksize = sctx->nodesize;
2592 } else {
2593 blocksize = sctx->sectorsize;
2594 WARN_ON(1);
2595 }
2596
2597 while (len) {
2598 u64 l = min_t(u64, len, blocksize);
2599 int have_csum = 0;
2600
2601 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2602 /* push csums to sbio */
2603 have_csum = scrub_find_csum(sctx, logical, l, csum);
2604 if (have_csum == 0)
2605 goto skip;
2606 }
2607 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2608 flags, gen, mirror_num,
2609 have_csum ? csum : NULL);
2610skip:
2611 if (ret)
2612 return ret;
2613 len -= l;
2614 logical += l;
2615 physical += l;
2616 }
2617 return 0;
2618}
2619
2231/* 2620/*
2232 * Given a physical address, this will calculate it's 2621 * Given a physical address, this will calculate it's
2233 * logical offset. if this is a parity stripe, it will return 2622 * logical offset. if this is a parity stripe, it will return
@@ -2236,7 +2625,8 @@ behind_scrub_pages:
2236 * return 0 if it is a data stripe, 1 means parity stripe. 2625 * return 0 if it is a data stripe, 1 means parity stripe.
2237 */ 2626 */
2238static int get_raid56_logic_offset(u64 physical, int num, 2627static int get_raid56_logic_offset(u64 physical, int num,
2239 struct map_lookup *map, u64 *offset) 2628 struct map_lookup *map, u64 *offset,
2629 u64 *stripe_start)
2240{ 2630{
2241 int i; 2631 int i;
2242 int j = 0; 2632 int j = 0;
@@ -2247,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num,
2247 2637
2248 last_offset = (physical - map->stripes[num].physical) * 2638 last_offset = (physical - map->stripes[num].physical) *
2249 nr_data_stripes(map); 2639 nr_data_stripes(map);
2640 if (stripe_start)
2641 *stripe_start = last_offset;
2642
2250 *offset = last_offset; 2643 *offset = last_offset;
2251 for (i = 0; i < nr_data_stripes(map); i++) { 2644 for (i = 0; i < nr_data_stripes(map); i++) {
2252 *offset = last_offset + i * map->stripe_len; 2645 *offset = last_offset + i * map->stripe_len;
@@ -2269,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num,
2269 return 1; 2662 return 1;
2270} 2663}
2271 2664
2665static void scrub_free_parity(struct scrub_parity *sparity)
2666{
2667 struct scrub_ctx *sctx = sparity->sctx;
2668 struct scrub_page *curr, *next;
2669 int nbits;
2670
2671 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2672 if (nbits) {
2673 spin_lock(&sctx->stat_lock);
2674 sctx->stat.read_errors += nbits;
2675 sctx->stat.uncorrectable_errors += nbits;
2676 spin_unlock(&sctx->stat_lock);
2677 }
2678
2679 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2680 list_del_init(&curr->list);
2681 scrub_page_put(curr);
2682 }
2683
2684 kfree(sparity);
2685}
2686
2687static void scrub_parity_bio_endio(struct bio *bio, int error)
2688{
2689 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2690 struct scrub_ctx *sctx = sparity->sctx;
2691
2692 if (error)
2693 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2694 sparity->nsectors);
2695
2696 scrub_free_parity(sparity);
2697 scrub_pending_bio_dec(sctx);
2698 bio_put(bio);
2699}
2700
2701static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2702{
2703 struct scrub_ctx *sctx = sparity->sctx;
2704 struct bio *bio;
2705 struct btrfs_raid_bio *rbio;
2706 struct scrub_page *spage;
2707 struct btrfs_bio *bbio = NULL;
2708 u64 *raid_map = NULL;
2709 u64 length;
2710 int ret;
2711
2712 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2713 sparity->nsectors))
2714 goto out;
2715
2716 length = sparity->logic_end - sparity->logic_start + 1;
2717 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2718 sparity->logic_start,
2719 &length, &bbio, 0, &raid_map);
2720 if (ret || !bbio || !raid_map)
2721 goto bbio_out;
2722
2723 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2724 if (!bio)
2725 goto bbio_out;
2726
2727 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2728 bio->bi_private = sparity;
2729 bio->bi_end_io = scrub_parity_bio_endio;
2730
2731 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2732 raid_map, length,
2733 sparity->scrub_dev,
2734 sparity->dbitmap,
2735 sparity->nsectors);
2736 if (!rbio)
2737 goto rbio_out;
2738
2739 list_for_each_entry(spage, &sparity->spages, list)
2740 raid56_parity_add_scrub_pages(rbio, spage->page,
2741 spage->logical);
2742
2743 scrub_pending_bio_inc(sctx);
2744 raid56_parity_submit_scrub_rbio(rbio);
2745 return;
2746
2747rbio_out:
2748 bio_put(bio);
2749bbio_out:
2750 kfree(bbio);
2751 kfree(raid_map);
2752 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2753 sparity->nsectors);
2754 spin_lock(&sctx->stat_lock);
2755 sctx->stat.malloc_errors++;
2756 spin_unlock(&sctx->stat_lock);
2757out:
2758 scrub_free_parity(sparity);
2759}
2760
2761static inline int scrub_calc_parity_bitmap_len(int nsectors)
2762{
2763 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2764}
2765
2766static void scrub_parity_get(struct scrub_parity *sparity)
2767{
2768 atomic_inc(&sparity->ref_count);
2769}
2770
2771static void scrub_parity_put(struct scrub_parity *sparity)
2772{
2773 if (!atomic_dec_and_test(&sparity->ref_count))
2774 return;
2775
2776 scrub_parity_check_and_repair(sparity);
2777}
2778
2779static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2780 struct map_lookup *map,
2781 struct btrfs_device *sdev,
2782 struct btrfs_path *path,
2783 u64 logic_start,
2784 u64 logic_end)
2785{
2786 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2787 struct btrfs_root *root = fs_info->extent_root;
2788 struct btrfs_root *csum_root = fs_info->csum_root;
2789 struct btrfs_extent_item *extent;
2790 u64 flags;
2791 int ret;
2792 int slot;
2793 struct extent_buffer *l;
2794 struct btrfs_key key;
2795 u64 generation;
2796 u64 extent_logical;
2797 u64 extent_physical;
2798 u64 extent_len;
2799 struct btrfs_device *extent_dev;
2800 struct scrub_parity *sparity;
2801 int nsectors;
2802 int bitmap_len;
2803 int extent_mirror_num;
2804 int stop_loop = 0;
2805
2806 nsectors = map->stripe_len / root->sectorsize;
2807 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2808 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2809 GFP_NOFS);
2810 if (!sparity) {
2811 spin_lock(&sctx->stat_lock);
2812 sctx->stat.malloc_errors++;
2813 spin_unlock(&sctx->stat_lock);
2814 return -ENOMEM;
2815 }
2816
2817 sparity->stripe_len = map->stripe_len;
2818 sparity->nsectors = nsectors;
2819 sparity->sctx = sctx;
2820 sparity->scrub_dev = sdev;
2821 sparity->logic_start = logic_start;
2822 sparity->logic_end = logic_end;
2823 atomic_set(&sparity->ref_count, 1);
2824 INIT_LIST_HEAD(&sparity->spages);
2825 sparity->dbitmap = sparity->bitmap;
2826 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2827
2828 ret = 0;
2829 while (logic_start < logic_end) {
2830 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2831 key.type = BTRFS_METADATA_ITEM_KEY;
2832 else
2833 key.type = BTRFS_EXTENT_ITEM_KEY;
2834 key.objectid = logic_start;
2835 key.offset = (u64)-1;
2836
2837 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2838 if (ret < 0)
2839 goto out;
2840
2841 if (ret > 0) {
2842 ret = btrfs_previous_extent_item(root, path, 0);
2843 if (ret < 0)
2844 goto out;
2845 if (ret > 0) {
2846 btrfs_release_path(path);
2847 ret = btrfs_search_slot(NULL, root, &key,
2848 path, 0, 0);
2849 if (ret < 0)
2850 goto out;
2851 }
2852 }
2853
2854 stop_loop = 0;
2855 while (1) {
2856 u64 bytes;
2857
2858 l = path->nodes[0];
2859 slot = path->slots[0];
2860 if (slot >= btrfs_header_nritems(l)) {
2861 ret = btrfs_next_leaf(root, path);
2862 if (ret == 0)
2863 continue;
2864 if (ret < 0)
2865 goto out;
2866
2867 stop_loop = 1;
2868 break;
2869 }
2870 btrfs_item_key_to_cpu(l, &key, slot);
2871
2872 if (key.type == BTRFS_METADATA_ITEM_KEY)
2873 bytes = root->nodesize;
2874 else
2875 bytes = key.offset;
2876
2877 if (key.objectid + bytes <= logic_start)
2878 goto next;
2879
2880 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2881 key.type != BTRFS_METADATA_ITEM_KEY)
2882 goto next;
2883
2884 if (key.objectid > logic_end) {
2885 stop_loop = 1;
2886 break;
2887 }
2888
2889 while (key.objectid >= logic_start + map->stripe_len)
2890 logic_start += map->stripe_len;
2891
2892 extent = btrfs_item_ptr(l, slot,
2893 struct btrfs_extent_item);
2894 flags = btrfs_extent_flags(l, extent);
2895 generation = btrfs_extent_generation(l, extent);
2896
2897 if (key.objectid < logic_start &&
2898 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2899 btrfs_err(fs_info,
2900 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2901 key.objectid, logic_start);
2902 goto next;
2903 }
2904again:
2905 extent_logical = key.objectid;
2906 extent_len = bytes;
2907
2908 if (extent_logical < logic_start) {
2909 extent_len -= logic_start - extent_logical;
2910 extent_logical = logic_start;
2911 }
2912
2913 if (extent_logical + extent_len >
2914 logic_start + map->stripe_len)
2915 extent_len = logic_start + map->stripe_len -
2916 extent_logical;
2917
2918 scrub_parity_mark_sectors_data(sparity, extent_logical,
2919 extent_len);
2920
2921 scrub_remap_extent(fs_info, extent_logical,
2922 extent_len, &extent_physical,
2923 &extent_dev,
2924 &extent_mirror_num);
2925
2926 ret = btrfs_lookup_csums_range(csum_root,
2927 extent_logical,
2928 extent_logical + extent_len - 1,
2929 &sctx->csum_list, 1);
2930 if (ret)
2931 goto out;
2932
2933 ret = scrub_extent_for_parity(sparity, extent_logical,
2934 extent_len,
2935 extent_physical,
2936 extent_dev, flags,
2937 generation,
2938 extent_mirror_num);
2939 if (ret)
2940 goto out;
2941
2942 scrub_free_csums(sctx);
2943 if (extent_logical + extent_len <
2944 key.objectid + bytes) {
2945 logic_start += map->stripe_len;
2946
2947 if (logic_start >= logic_end) {
2948 stop_loop = 1;
2949 break;
2950 }
2951
2952 if (logic_start < key.objectid + bytes) {
2953 cond_resched();
2954 goto again;
2955 }
2956 }
2957next:
2958 path->slots[0]++;
2959 }
2960
2961 btrfs_release_path(path);
2962
2963 if (stop_loop)
2964 break;
2965
2966 logic_start += map->stripe_len;
2967 }
2968out:
2969 if (ret < 0)
2970 scrub_parity_mark_sectors_error(sparity, logic_start,
2971 logic_end - logic_start + 1);
2972 scrub_parity_put(sparity);
2973 scrub_submit(sctx);
2974 mutex_lock(&sctx->wr_ctx.wr_lock);
2975 scrub_wr_submit(sctx);
2976 mutex_unlock(&sctx->wr_ctx.wr_lock);
2977
2978 btrfs_release_path(path);
2979 return ret < 0 ? ret : 0;
2980}
2981
2272static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2982static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2273 struct map_lookup *map, 2983 struct map_lookup *map,
2274 struct btrfs_device *scrub_dev, 2984 struct btrfs_device *scrub_dev,
2275 int num, u64 base, u64 length, 2985 int num, u64 base, u64 length,
2276 int is_dev_replace) 2986 int is_dev_replace)
2277{ 2987{
2278 struct btrfs_path *path; 2988 struct btrfs_path *path, *ppath;
2279 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2989 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2280 struct btrfs_root *root = fs_info->extent_root; 2990 struct btrfs_root *root = fs_info->extent_root;
2281 struct btrfs_root *csum_root = fs_info->csum_root; 2991 struct btrfs_root *csum_root = fs_info->csum_root;
@@ -2302,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2302 u64 extent_logical; 3012 u64 extent_logical;
2303 u64 extent_physical; 3013 u64 extent_physical;
2304 u64 extent_len; 3014 u64 extent_len;
3015 u64 stripe_logical;
3016 u64 stripe_end;
2305 struct btrfs_device *extent_dev; 3017 struct btrfs_device *extent_dev;
2306 int extent_mirror_num; 3018 int extent_mirror_num;
2307 int stop_loop = 0; 3019 int stop_loop = 0;
@@ -2327,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2327 mirror_num = num % map->num_stripes + 1; 3039 mirror_num = num % map->num_stripes + 1;
2328 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3040 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2329 BTRFS_BLOCK_GROUP_RAID6)) { 3041 BTRFS_BLOCK_GROUP_RAID6)) {
2330 get_raid56_logic_offset(physical, num, map, &offset); 3042 get_raid56_logic_offset(physical, num, map, &offset, NULL);
2331 increment = map->stripe_len * nr_data_stripes(map); 3043 increment = map->stripe_len * nr_data_stripes(map);
2332 mirror_num = 1; 3044 mirror_num = 1;
2333 } else { 3045 } else {
@@ -2339,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2339 if (!path) 3051 if (!path)
2340 return -ENOMEM; 3052 return -ENOMEM;
2341 3053
3054 ppath = btrfs_alloc_path();
3055 if (!ppath) {
3056 btrfs_free_path(ppath);
3057 return -ENOMEM;
3058 }
3059
2342 /* 3060 /*
2343 * work on commit root. The related disk blocks are static as 3061 * work on commit root. The related disk blocks are static as
2344 * long as COW is applied. This means, it is save to rewrite 3062 * long as COW is applied. This means, it is save to rewrite
@@ -2357,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2357 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3075 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2358 BTRFS_BLOCK_GROUP_RAID6)) { 3076 BTRFS_BLOCK_GROUP_RAID6)) {
2359 get_raid56_logic_offset(physical_end, num, 3077 get_raid56_logic_offset(physical_end, num,
2360 map, &logic_end); 3078 map, &logic_end, NULL);
2361 logic_end += base; 3079 logic_end += base;
2362 } else { 3080 } else {
2363 logic_end = logical + increment * nstripes; 3081 logic_end = logical + increment * nstripes;
@@ -2404,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2404 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3122 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2405 BTRFS_BLOCK_GROUP_RAID6)) { 3123 BTRFS_BLOCK_GROUP_RAID6)) {
2406 ret = get_raid56_logic_offset(physical, num, 3124 ret = get_raid56_logic_offset(physical, num,
2407 map, &logical); 3125 map, &logical, &stripe_logical);
2408 logical += base; 3126 logical += base;
2409 if (ret) 3127 if (ret) {
3128 stripe_logical += base;
3129 stripe_end = stripe_logical + increment - 1;
3130 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3131 ppath, stripe_logical,
3132 stripe_end);
3133 if (ret)
3134 goto out;
2410 goto skip; 3135 goto skip;
3136 }
2411 } 3137 }
2412 /* 3138 /*
2413 * canceled? 3139 * canceled?
@@ -2558,13 +3284,25 @@ again:
2558 * loop until we find next data stripe 3284 * loop until we find next data stripe
2559 * or we have finished all stripes. 3285 * or we have finished all stripes.
2560 */ 3286 */
2561 do { 3287loop:
2562 physical += map->stripe_len; 3288 physical += map->stripe_len;
2563 ret = get_raid56_logic_offset( 3289 ret = get_raid56_logic_offset(physical,
2564 physical, num, 3290 num, map, &logical,
2565 map, &logical); 3291 &stripe_logical);
2566 logical += base; 3292 logical += base;
2567 } while (physical < physical_end && ret); 3293
3294 if (ret && physical < physical_end) {
3295 stripe_logical += base;
3296 stripe_end = stripe_logical +
3297 increment - 1;
3298 ret = scrub_raid56_parity(sctx,
3299 map, scrub_dev, ppath,
3300 stripe_logical,
3301 stripe_end);
3302 if (ret)
3303 goto out;
3304 goto loop;
3305 }
2568 } else { 3306 } else {
2569 physical += map->stripe_len; 3307 physical += map->stripe_len;
2570 logical += increment; 3308 logical += increment;
@@ -2605,6 +3343,7 @@ out:
2605 3343
2606 blk_finish_plug(&plug); 3344 blk_finish_plug(&plug);
2607 btrfs_free_path(path); 3345 btrfs_free_path(path);
3346 btrfs_free_path(ppath);
2608 return ret < 0 ? ret : 0; 3347 return ret < 0 ? ret : 0;
2609} 3348}
2610 3349
@@ -3310,6 +4049,50 @@ out:
3310 scrub_pending_trans_workers_dec(sctx); 4049 scrub_pending_trans_workers_dec(sctx);
3311} 4050}
3312 4051
4052static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
4053 u64 logical)
4054{
4055 struct extent_state *cached_state = NULL;
4056 struct btrfs_ordered_extent *ordered;
4057 struct extent_io_tree *io_tree;
4058 struct extent_map *em;
4059 u64 lockstart = start, lockend = start + len - 1;
4060 int ret = 0;
4061
4062 io_tree = &BTRFS_I(inode)->io_tree;
4063
4064 lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
4065 ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4066 if (ordered) {
4067 btrfs_put_ordered_extent(ordered);
4068 ret = 1;
4069 goto out_unlock;
4070 }
4071
4072 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4073 if (IS_ERR(em)) {
4074 ret = PTR_ERR(em);
4075 goto out_unlock;
4076 }
4077
4078 /*
4079 * This extent does not actually cover the logical extent anymore,
4080 * move on to the next inode.
4081 */
4082 if (em->block_start > logical ||
4083 em->block_start + em->block_len < logical + len) {
4084 free_extent_map(em);
4085 ret = 1;
4086 goto out_unlock;
4087 }
4088 free_extent_map(em);
4089
4090out_unlock:
4091 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4092 GFP_NOFS);
4093 return ret;
4094}
4095
3313static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, 4096static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3314 struct scrub_copy_nocow_ctx *nocow_ctx) 4097 struct scrub_copy_nocow_ctx *nocow_ctx)
3315{ 4098{
@@ -3318,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3318 struct inode *inode; 4101 struct inode *inode;
3319 struct page *page; 4102 struct page *page;
3320 struct btrfs_root *local_root; 4103 struct btrfs_root *local_root;
3321 struct btrfs_ordered_extent *ordered;
3322 struct extent_map *em;
3323 struct extent_state *cached_state = NULL;
3324 struct extent_io_tree *io_tree; 4104 struct extent_io_tree *io_tree;
3325 u64 physical_for_dev_replace; 4105 u64 physical_for_dev_replace;
4106 u64 nocow_ctx_logical;
3326 u64 len = nocow_ctx->len; 4107 u64 len = nocow_ctx->len;
3327 u64 lockstart = offset, lockend = offset + len - 1;
3328 unsigned long index; 4108 unsigned long index;
3329 int srcu_index; 4109 int srcu_index;
3330 int ret = 0; 4110 int ret = 0;
@@ -3356,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3356 4136
3357 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 4137 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3358 io_tree = &BTRFS_I(inode)->io_tree; 4138 io_tree = &BTRFS_I(inode)->io_tree;
4139 nocow_ctx_logical = nocow_ctx->logical;
3359 4140
3360 lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); 4141 ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
3361 ordered = btrfs_lookup_ordered_range(inode, lockstart, len); 4142 if (ret) {
3362 if (ordered) { 4143 ret = ret > 0 ? 0 : ret;
3363 btrfs_put_ordered_extent(ordered); 4144 goto out;
3364 goto out_unlock;
3365 }
3366
3367 em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
3368 if (IS_ERR(em)) {
3369 ret = PTR_ERR(em);
3370 goto out_unlock;
3371 }
3372
3373 /*
3374 * This extent does not actually cover the logical extent anymore,
3375 * move on to the next inode.
3376 */
3377 if (em->block_start > nocow_ctx->logical ||
3378 em->block_start + em->block_len < nocow_ctx->logical + len) {
3379 free_extent_map(em);
3380 goto out_unlock;
3381 } 4145 }
3382 free_extent_map(em);
3383 4146
3384 while (len >= PAGE_CACHE_SIZE) { 4147 while (len >= PAGE_CACHE_SIZE) {
3385 index = offset >> PAGE_CACHE_SHIFT; 4148 index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +4159,7 @@ again:
3396 goto next_page; 4159 goto next_page;
3397 } else { 4160 } else {
3398 ClearPageError(page); 4161 ClearPageError(page);
3399 err = extent_read_full_page_nolock(io_tree, page, 4162 err = extent_read_full_page(io_tree, page,
3400 btrfs_get_extent, 4163 btrfs_get_extent,
3401 nocow_ctx->mirror_num); 4164 nocow_ctx->mirror_num);
3402 if (err) { 4165 if (err) {
@@ -3421,6 +4184,14 @@ again:
3421 goto next_page; 4184 goto next_page;
3422 } 4185 }
3423 } 4186 }
4187
4188 ret = check_extent_to_block(inode, offset, len,
4189 nocow_ctx_logical);
4190 if (ret) {
4191 ret = ret > 0 ? 0 : ret;
4192 goto next_page;
4193 }
4194
3424 err = write_page_nocow(nocow_ctx->sctx, 4195 err = write_page_nocow(nocow_ctx->sctx,
3425 physical_for_dev_replace, page); 4196 physical_for_dev_replace, page);
3426 if (err) 4197 if (err)
@@ -3434,12 +4205,10 @@ next_page:
3434 4205
3435 offset += PAGE_CACHE_SIZE; 4206 offset += PAGE_CACHE_SIZE;
3436 physical_for_dev_replace += PAGE_CACHE_SIZE; 4207 physical_for_dev_replace += PAGE_CACHE_SIZE;
4208 nocow_ctx_logical += PAGE_CACHE_SIZE;
3437 len -= PAGE_CACHE_SIZE; 4209 len -= PAGE_CACHE_SIZE;
3438 } 4210 }
3439 ret = COPY_COMPLETE; 4211 ret = COPY_COMPLETE;
3440out_unlock:
3441 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
3442 GFP_NOFS);
3443out: 4212out:
3444 mutex_unlock(&inode->i_mutex); 4213 mutex_unlock(&inode->i_mutex);
3445 iput(inode); 4214 iput(inode);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 874828dd0a86..804432dbc351 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5507,6 +5507,51 @@ out:
5507 return ret; 5507 return ret;
5508} 5508}
5509 5509
5510/*
5511 * If orphan cleanup did remove any orphans from a root, it means the tree
5512 * was modified and therefore the commit root is not the same as the current
5513 * root anymore. This is a problem, because send uses the commit root and
5514 * therefore can see inode items that don't exist in the current root anymore,
5515 * and for example make calls to btrfs_iget, which will do tree lookups based
5516 * on the current root and not on the commit root. Those lookups will fail,
5517 * returning a -ESTALE error, and making send fail with that error. So make
5518 * sure a send does not see any orphans we have just removed, and that it will
5519 * see the same inodes regardless of whether a transaction commit happened
5520 * before it started (meaning that the commit root will be the same as the
5521 * current root) or not.
5522 */
5523static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
5524{
5525 int i;
5526 struct btrfs_trans_handle *trans = NULL;
5527
5528again:
5529 if (sctx->parent_root &&
5530 sctx->parent_root->node != sctx->parent_root->commit_root)
5531 goto commit_trans;
5532
5533 for (i = 0; i < sctx->clone_roots_cnt; i++)
5534 if (sctx->clone_roots[i].root->node !=
5535 sctx->clone_roots[i].root->commit_root)
5536 goto commit_trans;
5537
5538 if (trans)
5539 return btrfs_end_transaction(trans, sctx->send_root);
5540
5541 return 0;
5542
5543commit_trans:
5544 /* Use any root, all fs roots will get their commit roots updated. */
5545 if (!trans) {
5546 trans = btrfs_join_transaction(sctx->send_root);
5547 if (IS_ERR(trans))
5548 return PTR_ERR(trans);
5549 goto again;
5550 }
5551
5552 return btrfs_commit_transaction(trans, sctx->send_root);
5553}
5554
5510static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) 5555static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
5511{ 5556{
5512 spin_lock(&root->root_item_lock); 5557 spin_lock(&root->root_item_lock);
@@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5728 NULL); 5773 NULL);
5729 sort_clone_roots = 1; 5774 sort_clone_roots = 1;
5730 5775
5776 ret = ensure_commit_roots_uptodate(sctx);
5777 if (ret)
5778 goto out;
5779
5731 current->journal_info = BTRFS_SEND_TRANS_STUB; 5780 current->journal_info = BTRFS_SEND_TRANS_STUB;
5732 ret = send_subvol(sctx); 5781 ret = send_subvol(sctx);
5733 current->journal_info = NULL; 5782 current->journal_info = NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 54bd91ece35b..60f7cbe815e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
262 trans->aborted = errno; 262 trans->aborted = errno;
263 /* Nothing used. The other threads that have joined this 263 /* Nothing used. The other threads that have joined this
264 * transaction may be able to continue. */ 264 * transaction may be able to continue. */
265 if (!trans->blocks_used) { 265 if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
266 const char *errstr; 266 const char *errstr;
267 267
268 errstr = btrfs_decode_error(errno); 268 errstr = btrfs_decode_error(errno);
@@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
642 "disabling disk space caching"); 642 "disabling disk space caching");
643 break; 643 break;
644 case Opt_inode_cache: 644 case Opt_inode_cache:
645 btrfs_set_and_info(root, CHANGE_INODE_CACHE, 645 btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
646 "enabling inode map caching"); 646 "enabling inode map caching");
647 break; 647 break;
648 case Opt_noinode_cache: 648 case Opt_noinode_cache:
649 btrfs_clear_and_info(root, CHANGE_INODE_CACHE, 649 btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
650 "disabling inode map caching"); 650 "disabling inode map caching");
651 break; 651 break;
652 case Opt_clear_cache: 652 case Opt_clear_cache:
@@ -993,9 +993,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
993 trans = btrfs_attach_transaction_barrier(root); 993 trans = btrfs_attach_transaction_barrier(root);
994 if (IS_ERR(trans)) { 994 if (IS_ERR(trans)) {
995 /* no transaction, don't bother */ 995 /* no transaction, don't bother */
996 if (PTR_ERR(trans) == -ENOENT) 996 if (PTR_ERR(trans) == -ENOENT) {
997 return 0; 997 /*
998 return PTR_ERR(trans); 998 * Exit unless we have some pending changes
999 * that need to go through commit
1000 */
1001 if (fs_info->pending_changes == 0)
1002 return 0;
1003 trans = btrfs_start_transaction(root, 0);
1004 } else {
1005 return PTR_ERR(trans);
1006 }
999 } 1007 }
1000 return btrfs_commit_transaction(trans, root); 1008 return btrfs_commit_transaction(trans, root);
1001} 1009}
@@ -1644,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1644 int i = 0, nr_devices; 1652 int i = 0, nr_devices;
1645 int ret; 1653 int ret;
1646 1654
1655 /*
1656 * We aren't under the device list lock, so this is racey-ish, but good
1657 * enough for our purposes.
1658 */
1647 nr_devices = fs_info->fs_devices->open_devices; 1659 nr_devices = fs_info->fs_devices->open_devices;
1648 BUG_ON(!nr_devices); 1660 if (!nr_devices) {
1661 smp_mb();
1662 nr_devices = fs_info->fs_devices->open_devices;
1663 ASSERT(nr_devices);
1664 if (!nr_devices) {
1665 *free_bytes = 0;
1666 return 0;
1667 }
1668 }
1649 1669
1650 devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), 1670 devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
1651 GFP_NOFS); 1671 GFP_NOFS);
@@ -1670,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1670 else 1690 else
1671 min_stripe_size = BTRFS_STRIPE_LEN; 1691 min_stripe_size = BTRFS_STRIPE_LEN;
1672 1692
1673 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1693 if (fs_info->alloc_start)
1694 mutex_lock(&fs_devices->device_list_mutex);
1695 rcu_read_lock();
1696 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
1674 if (!device->in_fs_metadata || !device->bdev || 1697 if (!device->in_fs_metadata || !device->bdev ||
1675 device->is_tgtdev_for_dev_replace) 1698 device->is_tgtdev_for_dev_replace)
1676 continue; 1699 continue;
1677 1700
1701 if (i >= nr_devices)
1702 break;
1703
1678 avail_space = device->total_bytes - device->bytes_used; 1704 avail_space = device->total_bytes - device->bytes_used;
1679 1705
1680 /* align with stripe_len */ 1706 /* align with stripe_len */
@@ -1689,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1689 skip_space = 1024 * 1024; 1715 skip_space = 1024 * 1024;
1690 1716
1691 /* user can set the offset in fs_info->alloc_start. */ 1717 /* user can set the offset in fs_info->alloc_start. */
1692 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= 1718 if (fs_info->alloc_start &&
1693 device->total_bytes) 1719 fs_info->alloc_start + BTRFS_STRIPE_LEN <=
1720 device->total_bytes) {
1721 rcu_read_unlock();
1694 skip_space = max(fs_info->alloc_start, skip_space); 1722 skip_space = max(fs_info->alloc_start, skip_space);
1695 1723
1696 /* 1724 /*
1697 * btrfs can not use the free space in [0, skip_space - 1], 1725 * btrfs can not use the free space in
1698 * we must subtract it from the total. In order to implement 1726 * [0, skip_space - 1], we must subtract it from the
1699 * it, we account the used space in this range first. 1727 * total. In order to implement it, we account the used
1700 */ 1728 * space in this range first.
1701 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, 1729 */
1702 &used_space); 1730 ret = btrfs_account_dev_extents_size(device, 0,
1703 if (ret) { 1731 skip_space - 1,
1704 kfree(devices_info); 1732 &used_space);
1705 return ret; 1733 if (ret) {
1706 } 1734 kfree(devices_info);
1735 mutex_unlock(&fs_devices->device_list_mutex);
1736 return ret;
1737 }
1707 1738
1708 /* calc the free space in [0, skip_space - 1] */ 1739 rcu_read_lock();
1709 skip_space -= used_space; 1740
1741 /* calc the free space in [0, skip_space - 1] */
1742 skip_space -= used_space;
1743 }
1710 1744
1711 /* 1745 /*
1712 * we can use the free space in [0, skip_space - 1], subtract 1746 * we can use the free space in [0, skip_space - 1], subtract
@@ -1725,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1725 1759
1726 i++; 1760 i++;
1727 } 1761 }
1762 rcu_read_unlock();
1763 if (fs_info->alloc_start)
1764 mutex_unlock(&fs_devices->device_list_mutex);
1728 1765
1729 nr_devices = i; 1766 nr_devices = i;
1730 1767
@@ -1787,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1787 * holding chunk_muext to avoid allocating new chunks, holding 1824 * holding chunk_muext to avoid allocating new chunks, holding
1788 * device_list_mutex to avoid the device being removed 1825 * device_list_mutex to avoid the device being removed
1789 */ 1826 */
1790 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1791 mutex_lock(&fs_info->chunk_mutex);
1792 rcu_read_lock(); 1827 rcu_read_lock();
1793 list_for_each_entry_rcu(found, head, list) { 1828 list_for_each_entry_rcu(found, head, list) {
1794 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1829 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1824,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1824 buf->f_bfree -= block_rsv->size >> bits; 1859 buf->f_bfree -= block_rsv->size >> bits;
1825 spin_unlock(&block_rsv->lock); 1860 spin_unlock(&block_rsv->lock);
1826 1861
1827 buf->f_bavail = total_free_data; 1862 buf->f_bavail = div_u64(total_free_data, factor);
1828 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1863 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1829 if (ret) { 1864 if (ret)
1830 mutex_unlock(&fs_info->chunk_mutex);
1831 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1832 return ret; 1865 return ret;
1833 }
1834 buf->f_bavail += div_u64(total_free_data, factor); 1866 buf->f_bavail += div_u64(total_free_data, factor);
1835 buf->f_bavail = buf->f_bavail >> bits; 1867 buf->f_bavail = buf->f_bavail >> bits;
1836 mutex_unlock(&fs_info->chunk_mutex);
1837 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1838 1868
1839 buf->f_type = BTRFS_SUPER_MAGIC; 1869 buf->f_type = BTRFS_SUPER_MAGIC;
1840 buf->f_bsize = dentry->d_sb->s_blocksize; 1870 buf->f_bsize = dentry->d_sb->s_blocksize;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b2e7bb4393f6..92db3f648df4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
111{ 111{
112 struct btrfs_fs_info *fs_info; 112 struct btrfs_fs_info *fs_info;
113 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); 113 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
114 struct btrfs_trans_handle *trans;
115 u64 features, set, clear; 114 u64 features, set, clear;
116 unsigned long val; 115 unsigned long val;
117 int ret; 116 int ret;
@@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
153 btrfs_info(fs_info, "%s %s feature flag", 152 btrfs_info(fs_info, "%s %s feature flag",
154 val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); 153 val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
155 154
156 trans = btrfs_start_transaction(fs_info->fs_root, 0);
157 if (IS_ERR(trans))
158 return PTR_ERR(trans);
159
160 spin_lock(&fs_info->super_lock); 155 spin_lock(&fs_info->super_lock);
161 features = get_features(fs_info, fa->feature_set); 156 features = get_features(fs_info, fa->feature_set);
162 if (val) 157 if (val)
@@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
166 set_features(fs_info, fa->feature_set, features); 161 set_features(fs_info, fa->feature_set, features);
167 spin_unlock(&fs_info->super_lock); 162 spin_unlock(&fs_info->super_lock);
168 163
169 ret = btrfs_commit_transaction(trans, fs_info->fs_root); 164 /*
170 if (ret) 165 * We don't want to do full transaction commit from inside sysfs
171 return ret; 166 */
167 btrfs_set_pending(fs_info, COMMIT);
168 wake_up_process(fs_info->transaction_kthread);
172 169
173 return count; 170 return count;
174} 171}
@@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
372 const char *buf, size_t len) 369 const char *buf, size_t len)
373{ 370{
374 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 371 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
375 struct btrfs_trans_handle *trans;
376 struct btrfs_root *root = fs_info->fs_root;
377 int ret;
378 size_t p_len; 372 size_t p_len;
379 373
380 if (fs_info->sb->s_flags & MS_RDONLY) 374 if (fs_info->sb->s_flags & MS_RDONLY)
@@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
389 if (p_len >= BTRFS_LABEL_SIZE) 383 if (p_len >= BTRFS_LABEL_SIZE)
390 return -EINVAL; 384 return -EINVAL;
391 385
392 trans = btrfs_start_transaction(root, 0); 386 spin_lock(&fs_info->super_lock);
393 if (IS_ERR(trans))
394 return PTR_ERR(trans);
395
396 spin_lock(&root->fs_info->super_lock);
397 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); 387 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
398 memcpy(fs_info->super_copy->label, buf, p_len); 388 memcpy(fs_info->super_copy->label, buf, p_len);
399 spin_unlock(&root->fs_info->super_lock); 389 spin_unlock(&fs_info->super_lock);
400 ret = btrfs_commit_transaction(trans, root);
401 390
402 if (!ret) 391 /*
403 return len; 392 * We don't want to do full transaction commit from inside sysfs
393 */
394 btrfs_set_pending(fs_info, COMMIT);
395 wake_up_process(fs_info->transaction_kthread);
404 396
405 return ret; 397 return len;
406} 398}
407BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); 399BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
408 400
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dcaae3616728..a605d4e2f2bc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
76 } 76 }
77} 77}
78 78
79static void clear_btree_io_tree(struct extent_io_tree *tree)
80{
81 spin_lock(&tree->lock);
82 while (!RB_EMPTY_ROOT(&tree->state)) {
83 struct rb_node *node;
84 struct extent_state *state;
85
86 node = rb_first(&tree->state);
87 state = rb_entry(node, struct extent_state, rb_node);
88 rb_erase(&state->rb_node, &tree->state);
89 RB_CLEAR_NODE(&state->rb_node);
90 /*
91 * btree io trees aren't supposed to have tasks waiting for
92 * changes in the flags of extent states ever.
93 */
94 ASSERT(!waitqueue_active(&state->wq));
95 free_extent_state(state);
96 if (need_resched()) {
97 spin_unlock(&tree->lock);
98 cond_resched();
99 spin_lock(&tree->lock);
100 }
101 }
102 spin_unlock(&tree->lock);
103}
104
79static noinline void switch_commit_roots(struct btrfs_transaction *trans, 105static noinline void switch_commit_roots(struct btrfs_transaction *trans,
80 struct btrfs_fs_info *fs_info) 106 struct btrfs_fs_info *fs_info)
81{ 107{
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
89 root->commit_root = btrfs_root_node(root); 115 root->commit_root = btrfs_root_node(root);
90 if (is_fstree(root->objectid)) 116 if (is_fstree(root->objectid))
91 btrfs_unpin_free_ino(root); 117 btrfs_unpin_free_ino(root);
118 clear_btree_io_tree(&root->dirty_log_pages);
92 } 119 }
93 up_write(&fs_info->commit_root_sem); 120 up_write(&fs_info->commit_root_sem);
94} 121}
@@ -220,6 +247,7 @@ loop:
220 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 247 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
221 INIT_LIST_HEAD(&cur_trans->pending_chunks); 248 INIT_LIST_HEAD(&cur_trans->pending_chunks);
222 INIT_LIST_HEAD(&cur_trans->switch_commits); 249 INIT_LIST_HEAD(&cur_trans->switch_commits);
250 INIT_LIST_HEAD(&cur_trans->pending_ordered);
223 list_add_tail(&cur_trans->list, &fs_info->trans_list); 251 list_add_tail(&cur_trans->list, &fs_info->trans_list);
224 extent_io_tree_init(&cur_trans->dirty_pages, 252 extent_io_tree_init(&cur_trans->dirty_pages,
225 fs_info->btree_inode->i_mapping); 253 fs_info->btree_inode->i_mapping);
@@ -488,6 +516,7 @@ again:
488 h->sync = false; 516 h->sync = false;
489 INIT_LIST_HEAD(&h->qgroup_ref_list); 517 INIT_LIST_HEAD(&h->qgroup_ref_list);
490 INIT_LIST_HEAD(&h->new_bgs); 518 INIT_LIST_HEAD(&h->new_bgs);
519 INIT_LIST_HEAD(&h->ordered);
491 520
492 smp_mb(); 521 smp_mb();
493 if (cur_trans->state >= TRANS_STATE_BLOCKED && 522 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
719 if (!list_empty(&trans->new_bgs)) 748 if (!list_empty(&trans->new_bgs))
720 btrfs_create_pending_block_groups(trans, root); 749 btrfs_create_pending_block_groups(trans, root);
721 750
751 if (!list_empty(&trans->ordered)) {
752 spin_lock(&info->trans_lock);
753 list_splice(&trans->ordered, &cur_trans->pending_ordered);
754 spin_unlock(&info->trans_lock);
755 }
756
722 trans->delayed_ref_updates = 0; 757 trans->delayed_ref_updates = 0;
723 if (!trans->sync) { 758 if (!trans->sync) {
724 must_run_delayed_refs = 759 must_run_delayed_refs =
@@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
828 863
829 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 864 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
830 mark, &cached_state)) { 865 mark, &cached_state)) {
831 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 866 bool wait_writeback = false;
832 mark, &cached_state, GFP_NOFS); 867
833 cached_state = NULL; 868 err = convert_extent_bit(dirty_pages, start, end,
834 err = filemap_fdatawrite_range(mapping, start, end); 869 EXTENT_NEED_WAIT,
870 mark, &cached_state, GFP_NOFS);
871 /*
872 * convert_extent_bit can return -ENOMEM, which is most of the
873 * time a temporary error. So when it happens, ignore the error
874 * and wait for writeback of this range to finish - because we
875 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
876 * to btrfs_wait_marked_extents() would not know that writeback
877 * for this range started and therefore wouldn't wait for it to
878 * finish - we don't want to commit a superblock that points to
879 * btree nodes/leafs for which writeback hasn't finished yet
880 * (and without errors).
881 * We cleanup any entries left in the io tree when committing
882 * the transaction (through clear_btree_io_tree()).
883 */
884 if (err == -ENOMEM) {
885 err = 0;
886 wait_writeback = true;
887 }
888 if (!err)
889 err = filemap_fdatawrite_range(mapping, start, end);
835 if (err) 890 if (err)
836 werr = err; 891 werr = err;
892 else if (wait_writeback)
893 werr = filemap_fdatawait_range(mapping, start, end);
894 free_extent_state(cached_state);
895 cached_state = NULL;
837 cond_resched(); 896 cond_resched();
838 start = end + 1; 897 start = end + 1;
839 } 898 }
840 if (err)
841 werr = err;
842 return werr; 899 return werr;
843} 900}
844 901
@@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
862 919
863 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 920 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
864 EXTENT_NEED_WAIT, &cached_state)) { 921 EXTENT_NEED_WAIT, &cached_state)) {
865 clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 922 /*
866 0, 0, &cached_state, GFP_NOFS); 923 * Ignore -ENOMEM errors returned by clear_extent_bit().
867 err = filemap_fdatawait_range(mapping, start, end); 924 * When committing the transaction, we'll remove any entries
925 * left in the io tree. For a log commit, we don't remove them
926 * after committing the log because the tree can be accessed
927 * concurrently - we do it only at transaction commit time when
928 * it's safe to do it (through clear_btree_io_tree()).
929 */
930 err = clear_extent_bit(dirty_pages, start, end,
931 EXTENT_NEED_WAIT,
932 0, 0, &cached_state, GFP_NOFS);
933 if (err == -ENOMEM)
934 err = 0;
935 if (!err)
936 err = filemap_fdatawait_range(mapping, start, end);
868 if (err) 937 if (err)
869 werr = err; 938 werr = err;
939 free_extent_state(cached_state);
940 cached_state = NULL;
870 cond_resched(); 941 cond_resched();
871 start = end + 1; 942 start = end + 1;
872 } 943 }
@@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
919 return 0; 990 return 0;
920} 991}
921 992
922int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 993static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
923 struct btrfs_root *root) 994 struct btrfs_root *root)
924{ 995{
925 if (!trans || !trans->transaction) { 996 int ret;
926 struct inode *btree_inode; 997
927 btree_inode = root->fs_info->btree_inode; 998 ret = btrfs_write_and_wait_marked_extents(root,
928 return filemap_write_and_wait(btree_inode->i_mapping);
929 }
930 return btrfs_write_and_wait_marked_extents(root,
931 &trans->transaction->dirty_pages, 999 &trans->transaction->dirty_pages,
932 EXTENT_DIRTY); 1000 EXTENT_DIRTY);
1001 clear_btree_io_tree(&trans->transaction->dirty_pages);
1002
1003 return ret;
933} 1004}
934 1005
935/* 1006/*
@@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1652 btrfs_wait_ordered_roots(fs_info, -1); 1723 btrfs_wait_ordered_roots(fs_info, -1);
1653} 1724}
1654 1725
1726static inline void
1727btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
1728 struct btrfs_fs_info *fs_info)
1729{
1730 struct btrfs_ordered_extent *ordered;
1731
1732 spin_lock(&fs_info->trans_lock);
1733 while (!list_empty(&cur_trans->pending_ordered)) {
1734 ordered = list_first_entry(&cur_trans->pending_ordered,
1735 struct btrfs_ordered_extent,
1736 trans_list);
1737 list_del_init(&ordered->trans_list);
1738 spin_unlock(&fs_info->trans_lock);
1739
1740 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
1741 &ordered->flags));
1742 btrfs_put_ordered_extent(ordered);
1743 spin_lock(&fs_info->trans_lock);
1744 }
1745 spin_unlock(&fs_info->trans_lock);
1746}
1747
1655int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1748int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1656 struct btrfs_root *root) 1749 struct btrfs_root *root)
1657{ 1750{
@@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1702 } 1795 }
1703 1796
1704 spin_lock(&root->fs_info->trans_lock); 1797 spin_lock(&root->fs_info->trans_lock);
1798 list_splice(&trans->ordered, &cur_trans->pending_ordered);
1705 if (cur_trans->state >= TRANS_STATE_COMMIT_START) { 1799 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1706 spin_unlock(&root->fs_info->trans_lock); 1800 spin_unlock(&root->fs_info->trans_lock);
1707 atomic_inc(&cur_trans->use_count); 1801 atomic_inc(&cur_trans->use_count);
@@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1754 1848
1755 btrfs_wait_delalloc_flush(root->fs_info); 1849 btrfs_wait_delalloc_flush(root->fs_info);
1756 1850
1851 btrfs_wait_pending_ordered(cur_trans, root->fs_info);
1852
1757 btrfs_scrub_pause(root); 1853 btrfs_scrub_pause(root);
1758 /* 1854 /*
1759 * Ok now we need to make sure to block out any other joins while we 1855 * Ok now we need to make sure to block out any other joins while we
@@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1842 } 1938 }
1843 1939
1844 /* 1940 /*
1845 * Since the transaction is done, we should set the inode map cache flag 1941 * Since the transaction is done, we can apply the pending changes
1846 * before any other comming transaction. 1942 * before the next transaction.
1847 */ 1943 */
1848 if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) 1944 btrfs_apply_pending_changes(root->fs_info);
1849 btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1850 else
1851 btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1852 1945
1853 /* commit_fs_roots gets rid of all the tree log roots, it is now 1946 /* commit_fs_roots gets rid of all the tree log roots, it is now
1854 * safe to free the root of tree log roots 1947 * safe to free the root of tree log roots
@@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
2019 2112
2020 return (ret < 0) ? 0 : 1; 2113 return (ret < 0) ? 0 : 1;
2021} 2114}
2115
2116void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
2117{
2118 unsigned long prev;
2119 unsigned long bit;
2120
2121 prev = cmpxchg(&fs_info->pending_changes, 0, 0);
2122 if (!prev)
2123 return;
2124
2125 bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
2126 if (prev & bit)
2127 btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
2128 prev &= ~bit;
2129
2130 bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
2131 if (prev & bit)
2132 btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
2133 prev &= ~bit;
2134
2135 bit = 1 << BTRFS_PENDING_COMMIT;
2136 if (prev & bit)
2137 btrfs_debug(fs_info, "pending commit done");
2138 prev &= ~bit;
2139
2140 if (prev)
2141 btrfs_warn(fs_info,
2142 "unknown pending changes left 0x%lx, ignoring", prev);
2143}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8f40e1a5d2d..00ed29c4b3f9 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -56,6 +56,7 @@ struct btrfs_transaction {
56 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head pending_chunks; 58 struct list_head pending_chunks;
59 struct list_head pending_ordered;
59 struct list_head switch_commits; 60 struct list_head switch_commits;
60 struct btrfs_delayed_ref_root delayed_refs; 61 struct btrfs_delayed_ref_root delayed_refs;
61 int aborted; 62 int aborted;
@@ -105,6 +106,7 @@ struct btrfs_trans_handle {
105 */ 106 */
106 struct btrfs_root *root; 107 struct btrfs_root *root;
107 struct seq_list delayed_ref_elem; 108 struct seq_list delayed_ref_elem;
109 struct list_head ordered;
108 struct list_head qgroup_ref_list; 110 struct list_head qgroup_ref_list;
109 struct list_head new_bgs; 111 struct list_head new_bgs;
110}; 112};
@@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
145 struct btrfs_root *root); 147 struct btrfs_root *root);
146struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 148struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
147int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 149int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
148int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
149 struct btrfs_root *root);
150 150
151void btrfs_add_dead_root(struct btrfs_root *root); 151void btrfs_add_dead_root(struct btrfs_root *root);
152int btrfs_defrag_root(struct btrfs_root *root); 152int btrfs_defrag_root(struct btrfs_root *root);
@@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
170int btrfs_transaction_blocked(struct btrfs_fs_info *info); 170int btrfs_transaction_blocked(struct btrfs_fs_info *info);
171int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 171int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
172void btrfs_put_transaction(struct btrfs_transaction *transaction); 172void btrfs_put_transaction(struct btrfs_transaction *transaction);
173void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
174
173#endif 175#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 286213cec861..9a02da16f2be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2599 index2 = root_log_ctx.log_transid % 2; 2599 index2 = root_log_ctx.log_transid % 2;
2600 if (atomic_read(&log_root_tree->log_commit[index2])) { 2600 if (atomic_read(&log_root_tree->log_commit[index2])) {
2601 blk_finish_plug(&plug); 2601 blk_finish_plug(&plug);
2602 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2602 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
2603 mark);
2604 btrfs_wait_logged_extents(trans, log, log_transid);
2603 wait_log_commit(trans, log_root_tree, 2605 wait_log_commit(trans, log_root_tree,
2604 root_log_ctx.log_transid); 2606 root_log_ctx.log_transid);
2605 btrfs_free_logged_extents(log, log_transid);
2606 mutex_unlock(&log_root_tree->log_mutex); 2607 mutex_unlock(&log_root_tree->log_mutex);
2607 ret = root_log_ctx.log_ret; 2608 if (!ret)
2609 ret = root_log_ctx.log_ret;
2608 goto out; 2610 goto out;
2609 } 2611 }
2610 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2612 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
@@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2641 mutex_unlock(&log_root_tree->log_mutex); 2643 mutex_unlock(&log_root_tree->log_mutex);
2642 goto out_wake_log_root; 2644 goto out_wake_log_root;
2643 } 2645 }
2644 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2646 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2645 btrfs_wait_marked_extents(log_root_tree, 2647 if (!ret)
2646 &log_root_tree->dirty_log_pages, 2648 ret = btrfs_wait_marked_extents(log_root_tree,
2647 EXTENT_NEW | EXTENT_DIRTY); 2649 &log_root_tree->dirty_log_pages,
2648 btrfs_wait_logged_extents(log, log_transid); 2650 EXTENT_NEW | EXTENT_DIRTY);
2651 if (ret) {
2652 btrfs_set_log_full_commit(root->fs_info, trans);
2653 btrfs_free_logged_extents(log, log_transid);
2654 mutex_unlock(&log_root_tree->log_mutex);
2655 goto out_wake_log_root;
2656 }
2657 btrfs_wait_logged_extents(trans, log, log_transid);
2649 2658
2650 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2659 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2651 log_root_tree->node->start); 2660 log_root_tree->node->start);
@@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3626 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3635 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3627 3636
3628 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3637 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3638 /*
3639 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3640 * i_mapping flags, so that the next fsync won't get
3641 * an outdated io error too.
3642 */
3643 btrfs_inode_check_errors(inode);
3629 *ordered_io_error = true; 3644 *ordered_io_error = true;
3630 break; 3645 break;
3631 } 3646 }
@@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3766 fi = btrfs_item_ptr(leaf, path->slots[0], 3781 fi = btrfs_item_ptr(leaf, path->slots[0],
3767 struct btrfs_file_extent_item); 3782 struct btrfs_file_extent_item);
3768 3783
3769 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3784 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
3770 &token); 3785 &token);
3771 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3786 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3772 btrfs_set_token_file_extent_type(leaf, fi, 3787 btrfs_set_token_file_extent_type(leaf, fi,
@@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3963 3978
3964 mutex_lock(&BTRFS_I(inode)->log_mutex); 3979 mutex_lock(&BTRFS_I(inode)->log_mutex);
3965 3980
3966 btrfs_get_logged_extents(inode, &logged_list); 3981 btrfs_get_logged_extents(inode, &logged_list, start, end);
3967 3982
3968 /* 3983 /*
3969 * a brute force approach to making sure we get the most uptodate 3984 * a brute force approach to making sure we get the most uptodate
@@ -4089,6 +4104,21 @@ log_extents:
4089 btrfs_release_path(path); 4104 btrfs_release_path(path);
4090 btrfs_release_path(dst_path); 4105 btrfs_release_path(dst_path);
4091 if (fast_search) { 4106 if (fast_search) {
4107 /*
4108 * Some ordered extents started by fsync might have completed
4109 * before we collected the ordered extents in logged_list, which
4110 * means they're gone, not in our logged_list nor in the inode's
4111 * ordered tree. We want the application/user space to know an
4112 * error happened while attempting to persist file data so that
4113 * it can take proper action. If such error happened, we leave
4114 * without writing to the log tree and the fsync must report the
4115 * file data write error and not commit the current transaction.
4116 */
4117 err = btrfs_inode_check_errors(inode);
4118 if (err) {
4119 ctx->io_err = err;
4120 goto out_unlock;
4121 }
4092 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4122 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4093 &logged_list, ctx); 4123 &logged_list, ctx);
4094 if (ret) { 4124 if (ret) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d47289c715c8..0144790e296e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
53DEFINE_MUTEX(uuid_mutex); 53DEFINE_MUTEX(uuid_mutex);
54static LIST_HEAD(fs_uuids); 54static LIST_HEAD(fs_uuids);
55 55
56static void lock_chunks(struct btrfs_root *root)
57{
58 mutex_lock(&root->fs_info->chunk_mutex);
59}
60
61static void unlock_chunks(struct btrfs_root *root)
62{
63 mutex_unlock(&root->fs_info->chunk_mutex);
64}
65
66static struct btrfs_fs_devices *__alloc_fs_devices(void) 56static struct btrfs_fs_devices *__alloc_fs_devices(void)
67{ 57{
68 struct btrfs_fs_devices *fs_devs; 58 struct btrfs_fs_devices *fs_devs;
@@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
1068 u64 *start, u64 len) 1058 u64 *start, u64 len)
1069{ 1059{
1070 struct extent_map *em; 1060 struct extent_map *em;
1061 struct list_head *search_list = &trans->transaction->pending_chunks;
1071 int ret = 0; 1062 int ret = 0;
1072 1063
1073 list_for_each_entry(em, &trans->transaction->pending_chunks, list) { 1064again:
1065 list_for_each_entry(em, search_list, list) {
1074 struct map_lookup *map; 1066 struct map_lookup *map;
1075 int i; 1067 int i;
1076 1068
@@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
1087 ret = 1; 1079 ret = 1;
1088 } 1080 }
1089 } 1081 }
1082 if (search_list == &trans->transaction->pending_chunks) {
1083 search_list = &trans->root->fs_info->pinned_chunks;
1084 goto again;
1085 }
1090 1086
1091 return ret; 1087 return ret;
1092} 1088}
@@ -1800,8 +1796,8 @@ error_undo:
1800 goto error_brelse; 1796 goto error_brelse;
1801} 1797}
1802 1798
1803void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 1799void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1804 struct btrfs_device *srcdev) 1800 struct btrfs_device *srcdev)
1805{ 1801{
1806 struct btrfs_fs_devices *fs_devices; 1802 struct btrfs_fs_devices *fs_devices;
1807 1803
@@ -1829,6 +1825,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1829 1825
1830 if (srcdev->bdev) 1826 if (srcdev->bdev)
1831 fs_devices->open_devices--; 1827 fs_devices->open_devices--;
1828}
1829
1830void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
1831 struct btrfs_device *srcdev)
1832{
1833 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
1832 1834
1833 call_rcu(&srcdev->rcu, free_device); 1835 call_rcu(&srcdev->rcu, free_device);
1834 1836
@@ -2647,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2647 } 2649 }
2648 } 2650 }
2649 2651
2650 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2652 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
2651 if (ret) { 2653 if (ret) {
2652 btrfs_abort_transaction(trans, extent_root, ret); 2654 btrfs_abort_transaction(trans, extent_root, ret);
2653 goto out; 2655 goto out;
2654 } 2656 }
2655 2657
2656 write_lock(&em_tree->lock);
2657 remove_extent_mapping(em_tree, em);
2658 write_unlock(&em_tree->lock);
2659
2660 /* once for the tree */
2661 free_extent_map(em);
2662out: 2658out:
2663 /* once for us */ 2659 /* once for us */
2664 free_extent_map(em); 2660 free_extent_map(em);
@@ -4505,6 +4501,8 @@ error_del_extent:
4505 free_extent_map(em); 4501 free_extent_map(em);
4506 /* One for the tree reference */ 4502 /* One for the tree reference */
4507 free_extent_map(em); 4503 free_extent_map(em);
4504 /* One for the pending_chunks list reference */
4505 free_extent_map(em);
4508error: 4506error:
4509 kfree(devices_info); 4507 kfree(devices_info);
4510 return ret; 4508 return ret;
@@ -4881,13 +4879,15 @@ static inline int parity_smaller(u64 a, u64 b)
4881static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4879static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4882{ 4880{
4883 struct btrfs_bio_stripe s; 4881 struct btrfs_bio_stripe s;
4882 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
4884 int i; 4883 int i;
4885 u64 l; 4884 u64 l;
4886 int again = 1; 4885 int again = 1;
4886 int m;
4887 4887
4888 while (again) { 4888 while (again) {
4889 again = 0; 4889 again = 0;
4890 for (i = 0; i < bbio->num_stripes - 1; i++) { 4890 for (i = 0; i < real_stripes - 1; i++) {
4891 if (parity_smaller(raid_map[i], raid_map[i+1])) { 4891 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4892 s = bbio->stripes[i]; 4892 s = bbio->stripes[i];
4893 l = raid_map[i]; 4893 l = raid_map[i];
@@ -4895,6 +4895,14 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4895 raid_map[i] = raid_map[i+1]; 4895 raid_map[i] = raid_map[i+1];
4896 bbio->stripes[i+1] = s; 4896 bbio->stripes[i+1] = s;
4897 raid_map[i+1] = l; 4897 raid_map[i+1] = l;
4898
4899 if (bbio->tgtdev_map) {
4900 m = bbio->tgtdev_map[i];
4901 bbio->tgtdev_map[i] =
4902 bbio->tgtdev_map[i + 1];
4903 bbio->tgtdev_map[i + 1] = m;
4904 }
4905
4898 again = 1; 4906 again = 1;
4899 } 4907 }
4900 } 4908 }
@@ -4923,6 +4931,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4923 int ret = 0; 4931 int ret = 0;
4924 int num_stripes; 4932 int num_stripes;
4925 int max_errors = 0; 4933 int max_errors = 0;
4934 int tgtdev_indexes = 0;
4926 struct btrfs_bio *bbio = NULL; 4935 struct btrfs_bio *bbio = NULL;
4927 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 4936 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4928 int dev_replace_is_ongoing = 0; 4937 int dev_replace_is_ongoing = 0;
@@ -5161,15 +5170,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5161 BTRFS_BLOCK_GROUP_RAID6)) { 5170 BTRFS_BLOCK_GROUP_RAID6)) {
5162 u64 tmp; 5171 u64 tmp;
5163 5172
5164 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) 5173 if (raid_map_ret &&
5165 && raid_map_ret) { 5174 ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5175 mirror_num > 1)) {
5166 int i, rot; 5176 int i, rot;
5167 5177
5168 /* push stripe_nr back to the start of the full stripe */ 5178 /* push stripe_nr back to the start of the full stripe */
5169 stripe_nr = raid56_full_stripe_start; 5179 stripe_nr = raid56_full_stripe_start;
5170 do_div(stripe_nr, stripe_len); 5180 do_div(stripe_nr, stripe_len * nr_data_stripes(map));
5171
5172 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
5173 5181
5174 /* RAID[56] write or recovery. Return all stripes */ 5182 /* RAID[56] write or recovery. Return all stripes */
5175 num_stripes = map->num_stripes; 5183 num_stripes = map->num_stripes;
@@ -5235,14 +5243,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5235 num_alloc_stripes <<= 1; 5243 num_alloc_stripes <<= 1;
5236 if (rw & REQ_GET_READ_MIRRORS) 5244 if (rw & REQ_GET_READ_MIRRORS)
5237 num_alloc_stripes++; 5245 num_alloc_stripes++;
5246 tgtdev_indexes = num_stripes;
5238 } 5247 }
5239 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); 5248
5249 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
5250 GFP_NOFS);
5240 if (!bbio) { 5251 if (!bbio) {
5241 kfree(raid_map); 5252 kfree(raid_map);
5242 ret = -ENOMEM; 5253 ret = -ENOMEM;
5243 goto out; 5254 goto out;
5244 } 5255 }
5245 atomic_set(&bbio->error, 0); 5256 atomic_set(&bbio->error, 0);
5257 if (dev_replace_is_ongoing)
5258 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5246 5259
5247 if (rw & REQ_DISCARD) { 5260 if (rw & REQ_DISCARD) {
5248 int factor = 0; 5261 int factor = 0;
@@ -5327,6 +5340,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5327 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 5340 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5328 max_errors = btrfs_chunk_max_errors(map); 5341 max_errors = btrfs_chunk_max_errors(map);
5329 5342
5343 tgtdev_indexes = 0;
5330 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5344 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5331 dev_replace->tgtdev != NULL) { 5345 dev_replace->tgtdev != NULL) {
5332 int index_where_to_add; 5346 int index_where_to_add;
@@ -5355,8 +5369,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5355 new->physical = old->physical; 5369 new->physical = old->physical;
5356 new->length = old->length; 5370 new->length = old->length;
5357 new->dev = dev_replace->tgtdev; 5371 new->dev = dev_replace->tgtdev;
5372 bbio->tgtdev_map[i] = index_where_to_add;
5358 index_where_to_add++; 5373 index_where_to_add++;
5359 max_errors++; 5374 max_errors++;
5375 tgtdev_indexes++;
5360 } 5376 }
5361 } 5377 }
5362 num_stripes = index_where_to_add; 5378 num_stripes = index_where_to_add;
@@ -5402,7 +5418,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5402 tgtdev_stripe->length = 5418 tgtdev_stripe->length =
5403 bbio->stripes[index_srcdev].length; 5419 bbio->stripes[index_srcdev].length;
5404 tgtdev_stripe->dev = dev_replace->tgtdev; 5420 tgtdev_stripe->dev = dev_replace->tgtdev;
5421 bbio->tgtdev_map[index_srcdev] = num_stripes;
5405 5422
5423 tgtdev_indexes++;
5406 num_stripes++; 5424 num_stripes++;
5407 } 5425 }
5408 } 5426 }
@@ -5412,6 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5412 bbio->num_stripes = num_stripes; 5430 bbio->num_stripes = num_stripes;
5413 bbio->max_errors = max_errors; 5431 bbio->max_errors = max_errors;
5414 bbio->mirror_num = mirror_num; 5432 bbio->mirror_num = mirror_num;
5433 bbio->num_tgtdevs = tgtdev_indexes;
5415 5434
5416 /* 5435 /*
5417 * this is the case that REQ_READ && dev_replace_is_ongoing && 5436 * this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5443,6 +5462,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5443 mirror_num, NULL); 5462 mirror_num, NULL);
5444} 5463}
5445 5464
5465/* For Scrub/replace */
5466int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
5467 u64 logical, u64 *length,
5468 struct btrfs_bio **bbio_ret, int mirror_num,
5469 u64 **raid_map_ret)
5470{
5471 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5472 mirror_num, raid_map_ret);
5473}
5474
5446int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5475int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5447 u64 chunk_start, u64 physical, u64 devid, 5476 u64 chunk_start, u64 physical, u64 devid,
5448 u64 **logical, int *naddrs, int *stripe_len) 5477 u64 **logical, int *naddrs, int *stripe_len)
@@ -5812,12 +5841,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5812 } else { 5841 } else {
5813 ret = raid56_parity_recover(root, bio, bbio, 5842 ret = raid56_parity_recover(root, bio, bbio,
5814 raid_map, map_length, 5843 raid_map, map_length,
5815 mirror_num); 5844 mirror_num, 1);
5816 } 5845 }
5817 /* 5846
5818 * FIXME, replace dosen't support raid56 yet, please fix
5819 * it in the future.
5820 */
5821 btrfs_bio_counter_dec(root->fs_info); 5847 btrfs_bio_counter_dec(root->fs_info);
5822 return ret; 5848 return ret;
5823 } 5849 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 08980fa23039..d6fe73c0f4a2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -292,7 +292,7 @@ struct btrfs_bio_stripe {
292struct btrfs_bio; 292struct btrfs_bio;
293typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); 293typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
294 294
295#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 295#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
296 296
297struct btrfs_bio { 297struct btrfs_bio {
298 atomic_t stripes_pending; 298 atomic_t stripes_pending;
@@ -305,6 +305,8 @@ struct btrfs_bio {
305 int max_errors; 305 int max_errors;
306 int num_stripes; 306 int num_stripes;
307 int mirror_num; 307 int mirror_num;
308 int num_tgtdevs;
309 int *tgtdev_map;
308 struct btrfs_bio_stripe stripes[]; 310 struct btrfs_bio_stripe stripes[];
309}; 311};
310 312
@@ -387,12 +389,18 @@ struct btrfs_balance_control {
387int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 389int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
388 u64 end, u64 *length); 390 u64 end, u64 *length);
389 391
390#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ 392#define btrfs_bio_size(total_stripes, real_stripes) \
391 (sizeof(struct btrfs_bio_stripe) * (n))) 393 (sizeof(struct btrfs_bio) + \
394 (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \
395 (sizeof(int) * (real_stripes)))
392 396
393int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 397int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
394 u64 logical, u64 *length, 398 u64 logical, u64 *length,
395 struct btrfs_bio **bbio_ret, int mirror_num); 399 struct btrfs_bio **bbio_ret, int mirror_num);
400int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
401 u64 logical, u64 *length,
402 struct btrfs_bio **bbio_ret, int mirror_num,
403 u64 **raid_map_ret);
396int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 404int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
397 u64 chunk_start, u64 physical, u64 devid, 405 u64 chunk_start, u64 physical, u64 devid,
398 u64 **logical, int *naddrs, int *stripe_len); 406 u64 **logical, int *naddrs, int *stripe_len);
@@ -448,8 +456,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
448int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 456int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
449int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 457int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
450 struct btrfs_fs_info *fs_info); 458 struct btrfs_fs_info *fs_info);
451void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 459void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
452 struct btrfs_device *srcdev); 460 struct btrfs_device *srcdev);
461void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
462 struct btrfs_device *srcdev);
453void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 463void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
454 struct btrfs_device *tgtdev); 464 struct btrfs_device *tgtdev);
455void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 465void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
@@ -513,4 +523,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
513void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); 523void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
514void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, 524void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
515 struct btrfs_transaction *transaction); 525 struct btrfs_transaction *transaction);
526
527static inline void lock_chunks(struct btrfs_root *root)
528{
529 mutex_lock(&root->fs_info->chunk_mutex);
530}
531
532static inline void unlock_chunks(struct btrfs_root *root)
533{
534 mutex_unlock(&root->fs_info->chunk_mutex);
535}
536
537
516#endif 538#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index dcf20131fbe4..47b19465f0dc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,6 +29,7 @@
29#include "xattr.h" 29#include "xattr.h"
30#include "disk-io.h" 30#include "disk-io.h"
31#include "props.h" 31#include "props.h"
32#include "locking.h"
32 33
33 34
34ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 35ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
91 struct inode *inode, const char *name, 92 struct inode *inode, const char *name,
92 const void *value, size_t size, int flags) 93 const void *value, size_t size, int flags)
93{ 94{
94 struct btrfs_dir_item *di; 95 struct btrfs_dir_item *di = NULL;
95 struct btrfs_root *root = BTRFS_I(inode)->root; 96 struct btrfs_root *root = BTRFS_I(inode)->root;
96 struct btrfs_path *path; 97 struct btrfs_path *path;
97 size_t name_len = strlen(name); 98 size_t name_len = strlen(name);
@@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
103 path = btrfs_alloc_path(); 104 path = btrfs_alloc_path();
104 if (!path) 105 if (!path)
105 return -ENOMEM; 106 return -ENOMEM;
107 path->skip_release_on_error = 1;
108
109 if (!value) {
110 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
111 name, name_len, -1);
112 if (!di && (flags & XATTR_REPLACE))
113 ret = -ENODATA;
114 else if (di)
115 ret = btrfs_delete_one_dir_name(trans, root, path, di);
116 goto out;
117 }
106 118
119 /*
120 * For a replace we can't just do the insert blindly.
121 * Do a lookup first (read-only btrfs_search_slot), and return if xattr
122 * doesn't exist. If it exists, fall down below to the insert/replace
123 * path - we can't race with a concurrent xattr delete, because the VFS
124 * locks the inode's i_mutex before calling setxattr or removexattr.
125 */
107 if (flags & XATTR_REPLACE) { 126 if (flags & XATTR_REPLACE) {
108 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, 127 ASSERT(mutex_is_locked(&inode->i_mutex));
109 name_len, -1); 128 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
110 if (IS_ERR(di)) { 129 name, name_len, 0);
111 ret = PTR_ERR(di); 130 if (!di) {
112 goto out;
113 } else if (!di) {
114 ret = -ENODATA; 131 ret = -ENODATA;
115 goto out; 132 goto out;
116 } 133 }
117 ret = btrfs_delete_one_dir_name(trans, root, path, di);
118 if (ret)
119 goto out;
120 btrfs_release_path(path); 134 btrfs_release_path(path);
135 di = NULL;
136 }
121 137
138 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
139 name, name_len, value, size);
140 if (ret == -EOVERFLOW) {
122 /* 141 /*
123 * remove the attribute 142 * We have an existing item in a leaf, split_leaf couldn't
143 * expand it. That item might have or not a dir_item that
144 * matches our target xattr, so lets check.
124 */ 145 */
125 if (!value) 146 ret = 0;
126 goto out; 147 btrfs_assert_tree_locked(path->nodes[0]);
127 } else { 148 di = btrfs_match_dir_item_name(root, path, name, name_len);
128 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), 149 if (!di && !(flags & XATTR_REPLACE)) {
129 name, name_len, 0); 150 ret = -ENOSPC;
130 if (IS_ERR(di)) {
131 ret = PTR_ERR(di);
132 goto out; 151 goto out;
133 } 152 }
134 if (!di && !value) 153 } else if (ret == -EEXIST) {
135 goto out; 154 ret = 0;
136 btrfs_release_path(path); 155 di = btrfs_match_dir_item_name(root, path, name, name_len);
156 ASSERT(di); /* logic error */
157 } else if (ret) {
158 goto out;
137 } 159 }
138 160
139again: 161 if (di && (flags & XATTR_CREATE)) {
140 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
141 name, name_len, value, size);
142 /*
143 * If we're setting an xattr to a new value but the new value is say
144 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
145 * back from split_leaf. This is because it thinks we'll be extending
146 * the existing item size, but we're asking for enough space to add the
147 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
148 * the rest of the function figure it out.
149 */
150 if (ret == -EOVERFLOW)
151 ret = -EEXIST; 162 ret = -EEXIST;
163 goto out;
164 }
152 165
153 if (ret == -EEXIST) { 166 if (di) {
154 if (flags & XATTR_CREATE)
155 goto out;
156 /* 167 /*
157 * We can't use the path we already have since we won't have the 168 * We're doing a replace, and it must be atomic, that is, at
158 * proper locking for a delete, so release the path and 169 * any point in time we have either the old or the new xattr
159 * re-lookup to delete the thing. 170 * value in the tree. We don't want readers (getxattr and
171 * listxattrs) to miss a value, this is specially important
172 * for ACLs.
160 */ 173 */
161 btrfs_release_path(path); 174 const int slot = path->slots[0];
162 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), 175 struct extent_buffer *leaf = path->nodes[0];
163 name, name_len, -1); 176 const u16 old_data_len = btrfs_dir_data_len(leaf, di);
164 if (IS_ERR(di)) { 177 const u32 item_size = btrfs_item_size_nr(leaf, slot);
165 ret = PTR_ERR(di); 178 const u32 data_size = sizeof(*di) + name_len + size;
166 goto out; 179 struct btrfs_item *item;
167 } else if (!di) { 180 unsigned long data_ptr;
168 /* Shouldn't happen but just in case... */ 181 char *ptr;
169 btrfs_release_path(path); 182
170 goto again; 183 if (size > old_data_len) {
184 if (btrfs_leaf_free_space(root, leaf) <
185 (size - old_data_len)) {
186 ret = -ENOSPC;
187 goto out;
188 }
171 } 189 }
172 190
173 ret = btrfs_delete_one_dir_name(trans, root, path, di); 191 if (old_data_len + name_len + sizeof(*di) == item_size) {
174 if (ret) 192 /* No other xattrs packed in the same leaf item. */
175 goto out; 193 if (size > old_data_len)
194 btrfs_extend_item(root, path,
195 size - old_data_len);
196 else if (size < old_data_len)
197 btrfs_truncate_item(root, path, data_size, 1);
198 } else {
199 /* There are other xattrs packed in the same item. */
200 ret = btrfs_delete_one_dir_name(trans, root, path, di);
201 if (ret)
202 goto out;
203 btrfs_extend_item(root, path, data_size);
204 }
176 205
206 item = btrfs_item_nr(slot);
207 ptr = btrfs_item_ptr(leaf, slot, char);
208 ptr += btrfs_item_size(leaf, item) - data_size;
209 di = (struct btrfs_dir_item *)ptr;
210 btrfs_set_dir_data_len(leaf, di, size);
211 data_ptr = ((unsigned long)(di + 1)) + name_len;
212 write_extent_buffer(leaf, value, data_ptr, size);
213 btrfs_mark_buffer_dirty(leaf);
214 } else {
177 /* 215 /*
178 * We have a value to set, so go back and try to insert it now. 216 * Insert, and we had space for the xattr, so path->slots[0] is
217 * where our xattr dir_item is and btrfs_insert_xattr_item()
218 * filled it.
179 */ 219 */
180 if (value) {
181 btrfs_release_path(path);
182 goto again;
183 }
184 } 220 }
185out: 221out:
186 btrfs_free_path(path); 222 btrfs_free_path(path);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index db3f772e57ae..a75fba67bb1f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -158,17 +158,8 @@ struct ext4_allocation_request {
158#define EXT4_MAP_MAPPED (1 << BH_Mapped) 158#define EXT4_MAP_MAPPED (1 << BH_Mapped)
159#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) 159#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
160#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) 160#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
161/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
162 * ext4_map_blocks wants to know whether or not the underlying cluster has
163 * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
164 * the requested mapping was from previously mapped (or delayed allocated)
165 * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
166 * should never appear on buffer_head's state flags.
167 */
168#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
169#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ 161#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
170 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ 162 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
171 EXT4_MAP_FROM_CLUSTER)
172 163
173struct ext4_map_blocks { 164struct ext4_map_blocks {
174 ext4_fsblk_t m_pblk; 165 ext4_fsblk_t m_pblk;
@@ -565,10 +556,8 @@ enum {
565#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 556#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
566 /* Do not take i_data_sem locking in ext4_map_blocks */ 557 /* Do not take i_data_sem locking in ext4_map_blocks */
567#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 558#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
568 /* Do not put hole in extent cache */
569#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
570 /* Convert written extents to unwritten */ 559 /* Convert written extents to unwritten */
571#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 560#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200
572 561
573/* 562/*
574 * The bit position of these flags must not overlap with any of the 563 * The bit position of these flags must not overlap with any of the
@@ -889,10 +878,12 @@ struct ext4_inode_info {
889 /* extents status tree */ 878 /* extents status tree */
890 struct ext4_es_tree i_es_tree; 879 struct ext4_es_tree i_es_tree;
891 rwlock_t i_es_lock; 880 rwlock_t i_es_lock;
892 struct list_head i_es_lru; 881 struct list_head i_es_list;
893 unsigned int i_es_all_nr; /* protected by i_es_lock */ 882 unsigned int i_es_all_nr; /* protected by i_es_lock */
894 unsigned int i_es_lru_nr; /* protected by i_es_lock */ 883 unsigned int i_es_shk_nr; /* protected by i_es_lock */
895 unsigned long i_touch_when; /* jiffies of last accessing */ 884 ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
885 extents to shrink. Protected by
886 i_es_lock */
896 887
897 /* ialloc */ 888 /* ialloc */
898 ext4_group_t i_last_alloc_group; 889 ext4_group_t i_last_alloc_group;
@@ -1337,10 +1328,11 @@ struct ext4_sb_info {
1337 1328
1338 /* Reclaim extents from extent status tree */ 1329 /* Reclaim extents from extent status tree */
1339 struct shrinker s_es_shrinker; 1330 struct shrinker s_es_shrinker;
1340 struct list_head s_es_lru; 1331 struct list_head s_es_list; /* List of inodes with reclaimable extents */
1332 long s_es_nr_inode;
1341 struct ext4_es_stats s_es_stats; 1333 struct ext4_es_stats s_es_stats;
1342 struct mb_cache *s_mb_cache; 1334 struct mb_cache *s_mb_cache;
1343 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1335 spinlock_t s_es_lock ____cacheline_aligned_in_smp;
1344 1336
1345 /* Ratelimit ext4 messages. */ 1337 /* Ratelimit ext4 messages. */
1346 struct ratelimit_state s_err_ratelimit_state; 1338 struct ratelimit_state s_err_ratelimit_state;
@@ -2196,7 +2188,6 @@ extern int ext4_calculate_overhead(struct super_block *sb);
2196extern void ext4_superblock_csum_set(struct super_block *sb); 2188extern void ext4_superblock_csum_set(struct super_block *sb);
2197extern void *ext4_kvmalloc(size_t size, gfp_t flags); 2189extern void *ext4_kvmalloc(size_t size, gfp_t flags);
2198extern void *ext4_kvzalloc(size_t size, gfp_t flags); 2190extern void *ext4_kvzalloc(size_t size, gfp_t flags);
2199extern void ext4_kvfree(void *ptr);
2200extern int ext4_alloc_flex_bg_array(struct super_block *sb, 2191extern int ext4_alloc_flex_bg_array(struct super_block *sb,
2201 ext4_group_t ngroup); 2192 ext4_group_t ngroup);
2202extern const char *ext4_decode_error(struct super_block *sb, int errno, 2193extern const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -2647,7 +2638,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
2647 int *retval); 2638 int *retval);
2648extern int ext4_inline_data_fiemap(struct inode *inode, 2639extern int ext4_inline_data_fiemap(struct inode *inode,
2649 struct fiemap_extent_info *fieinfo, 2640 struct fiemap_extent_info *fieinfo,
2650 int *has_inline); 2641 int *has_inline, __u64 start, __u64 len);
2651extern int ext4_try_to_evict_inline_data(handle_t *handle, 2642extern int ext4_try_to_evict_inline_data(handle_t *handle,
2652 struct inode *inode, 2643 struct inode *inode,
2653 int needed); 2644 int needed);
@@ -2795,16 +2786,6 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
2795extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); 2786extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
2796 2787
2797/* 2788/*
2798 * Note that these flags will never ever appear in a buffer_head's state flag.
2799 * See EXT4_MAP_... to see where this is used.
2800 */
2801enum ext4_state_bits {
2802 BH_AllocFromCluster /* allocated blocks were part of already
2803 * allocated cluster. */
2804 = BH_JBDPrivateStart
2805};
2806
2807/*
2808 * Add new method to test whether block and inode bitmaps are properly 2789 * Add new method to test whether block and inode bitmaps are properly
2809 * initialized. With uninit_bg reading the block from disk is not enough 2790 * initialized. With uninit_bg reading the block from disk is not enough
2810 * to mark the bitmap uptodate. We need to also zero-out the bitmap 2791 * to mark the bitmap uptodate. We need to also zero-out the bitmap
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0b16fb4c06d3..e5d3eadf47b1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2306,16 +2306,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2306 ext4_lblk_t block) 2306 ext4_lblk_t block)
2307{ 2307{
2308 int depth = ext_depth(inode); 2308 int depth = ext_depth(inode);
2309 unsigned long len = 0; 2309 ext4_lblk_t len;
2310 ext4_lblk_t lblock = 0; 2310 ext4_lblk_t lblock;
2311 struct ext4_extent *ex; 2311 struct ext4_extent *ex;
2312 struct extent_status es;
2312 2313
2313 ex = path[depth].p_ext; 2314 ex = path[depth].p_ext;
2314 if (ex == NULL) { 2315 if (ex == NULL) {
2315 /* 2316 /* there is no extent yet, so gap is [0;-] */
2316 * there is no extent yet, so gap is [0;-] and we 2317 lblock = 0;
2317 * don't cache it 2318 len = EXT_MAX_BLOCKS;
2318 */
2319 ext_debug("cache gap(whole file):"); 2319 ext_debug("cache gap(whole file):");
2320 } else if (block < le32_to_cpu(ex->ee_block)) { 2320 } else if (block < le32_to_cpu(ex->ee_block)) {
2321 lblock = block; 2321 lblock = block;
@@ -2324,9 +2324,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2324 block, 2324 block,
2325 le32_to_cpu(ex->ee_block), 2325 le32_to_cpu(ex->ee_block),
2326 ext4_ext_get_actual_len(ex)); 2326 ext4_ext_get_actual_len(ex));
2327 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2328 ext4_es_insert_extent(inode, lblock, len, ~0,
2329 EXTENT_STATUS_HOLE);
2330 } else if (block >= le32_to_cpu(ex->ee_block) 2327 } else if (block >= le32_to_cpu(ex->ee_block)
2331 + ext4_ext_get_actual_len(ex)) { 2328 + ext4_ext_get_actual_len(ex)) {
2332 ext4_lblk_t next; 2329 ext4_lblk_t next;
@@ -2340,14 +2337,19 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2340 block); 2337 block);
2341 BUG_ON(next == lblock); 2338 BUG_ON(next == lblock);
2342 len = next - lblock; 2339 len = next - lblock;
2343 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2344 ext4_es_insert_extent(inode, lblock, len, ~0,
2345 EXTENT_STATUS_HOLE);
2346 } else { 2340 } else {
2347 BUG(); 2341 BUG();
2348 } 2342 }
2349 2343
2350 ext_debug(" -> %u:%lu\n", lblock, len); 2344 ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
2345 if (es.es_len) {
2346 /* There's delayed extent containing lblock? */
2347 if (es.es_lblk <= lblock)
2348 return;
2349 len = min(es.es_lblk - lblock, len);
2350 }
2351 ext_debug(" -> %u:%u\n", lblock, len);
2352 ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
2351} 2353}
2352 2354
2353/* 2355/*
@@ -2481,7 +2483,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2481 ext4_lblk_t from, ext4_lblk_t to) 2483 ext4_lblk_t from, ext4_lblk_t to)
2482{ 2484{
2483 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2485 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2484 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2486 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2485 ext4_fsblk_t pblk; 2487 ext4_fsblk_t pblk;
2486 int flags = get_default_free_blocks_flags(inode); 2488 int flags = get_default_free_blocks_flags(inode);
2487 2489
@@ -2490,7 +2492,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2490 * at the beginning of the extent. Instead, we make a note 2492 * at the beginning of the extent. Instead, we make a note
2491 * that we tried freeing the cluster, and check to see if we 2493 * that we tried freeing the cluster, and check to see if we
2492 * need to free it on a subsequent call to ext4_remove_blocks, 2494 * need to free it on a subsequent call to ext4_remove_blocks,
2493 * or at the end of the ext4_truncate() operation. 2495 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2494 */ 2496 */
2495 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; 2497 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2496 2498
@@ -2501,8 +2503,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2501 * partial cluster here. 2503 * partial cluster here.
2502 */ 2504 */
2503 pblk = ext4_ext_pblock(ex) + ee_len - 1; 2505 pblk = ext4_ext_pblock(ex) + ee_len - 1;
2504 if ((*partial_cluster > 0) && 2506 if (*partial_cluster > 0 &&
2505 (EXT4_B2C(sbi, pblk) != *partial_cluster)) { 2507 *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
2506 ext4_free_blocks(handle, inode, NULL, 2508 ext4_free_blocks(handle, inode, NULL,
2507 EXT4_C2B(sbi, *partial_cluster), 2509 EXT4_C2B(sbi, *partial_cluster),
2508 sbi->s_cluster_ratio, flags); 2510 sbi->s_cluster_ratio, flags);
@@ -2528,7 +2530,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2528 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2530 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2529 /* tail removal */ 2531 /* tail removal */
2530 ext4_lblk_t num; 2532 ext4_lblk_t num;
2531 unsigned int unaligned; 2533 long long first_cluster;
2532 2534
2533 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2535 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2534 pblk = ext4_ext_pblock(ex) + ee_len - num; 2536 pblk = ext4_ext_pblock(ex) + ee_len - num;
@@ -2538,7 +2540,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2538 * used by any other extent (partial_cluster is negative). 2540 * used by any other extent (partial_cluster is negative).
2539 */ 2541 */
2540 if (*partial_cluster < 0 && 2542 if (*partial_cluster < 0 &&
2541 -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) 2543 *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
2542 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; 2544 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2543 2545
2544 ext_debug("free last %u blocks starting %llu partial %lld\n", 2546 ext_debug("free last %u blocks starting %llu partial %lld\n",
@@ -2549,21 +2551,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2549 * beginning of a cluster, and we removed the entire 2551 * beginning of a cluster, and we removed the entire
2550 * extent and the cluster is not used by any other extent, 2552 * extent and the cluster is not used by any other extent,
2551 * save the partial cluster here, since we might need to 2553 * save the partial cluster here, since we might need to
2552 * delete if we determine that the truncate operation has 2554 * delete if we determine that the truncate or punch hole
2553 * removed all of the blocks in the cluster. 2555 * operation has removed all of the blocks in the cluster.
2556 * If that cluster is used by another extent, preserve its
2557 * negative value so it isn't freed later on.
2554 * 2558 *
2555 * On the other hand, if we did not manage to free the whole 2559 * If the whole extent wasn't freed, we've reached the
2556 * extent, we have to mark the cluster as used (store negative 2560 * start of the truncated/punched region and have finished
2557 * cluster number in partial_cluster). 2561 * removing blocks. If there's a partial cluster here it's
2562 * shared with the remainder of the extent and is no longer
2563 * a candidate for removal.
2558 */ 2564 */
2559 unaligned = EXT4_PBLK_COFF(sbi, pblk); 2565 if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
2560 if (unaligned && (ee_len == num) && 2566 first_cluster = (long long) EXT4_B2C(sbi, pblk);
2561 (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) 2567 if (first_cluster != -*partial_cluster)
2562 *partial_cluster = EXT4_B2C(sbi, pblk); 2568 *partial_cluster = first_cluster;
2563 else if (unaligned) 2569 } else {
2564 *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
2565 else if (*partial_cluster > 0)
2566 *partial_cluster = 0; 2570 *partial_cluster = 0;
2571 }
2567 } else 2572 } else
2568 ext4_error(sbi->s_sb, "strange request: removal(2) " 2573 ext4_error(sbi->s_sb, "strange request: removal(2) "
2569 "%u-%u from %u:%u\n", 2574 "%u-%u from %u:%u\n",
@@ -2574,15 +2579,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2574 2579
2575/* 2580/*
2576 * ext4_ext_rm_leaf() Removes the extents associated with the 2581 * ext4_ext_rm_leaf() Removes the extents associated with the
2577 * blocks appearing between "start" and "end", and splits the extents 2582 * blocks appearing between "start" and "end". Both "start"
2578 * if "start" and "end" appear in the same extent 2583 * and "end" must appear in the same extent or EIO is returned.
2579 * 2584 *
2580 * @handle: The journal handle 2585 * @handle: The journal handle
2581 * @inode: The files inode 2586 * @inode: The files inode
2582 * @path: The path to the leaf 2587 * @path: The path to the leaf
2583 * @partial_cluster: The cluster which we'll have to free if all extents 2588 * @partial_cluster: The cluster which we'll have to free if all extents
2584 * has been released from it. It gets negative in case 2589 * has been released from it. However, if this value is
2585 * that the cluster is still used. 2590 * negative, it's a cluster just to the right of the
2591 * punched region and it must not be freed.
2586 * @start: The first block to remove 2592 * @start: The first block to remove
2587 * @end: The last block to remove 2593 * @end: The last block to remove
2588 */ 2594 */
@@ -2621,27 +2627,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2621 ex_ee_block = le32_to_cpu(ex->ee_block); 2627 ex_ee_block = le32_to_cpu(ex->ee_block);
2622 ex_ee_len = ext4_ext_get_actual_len(ex); 2628 ex_ee_len = ext4_ext_get_actual_len(ex);
2623 2629
2624 /*
2625 * If we're starting with an extent other than the last one in the
2626 * node, we need to see if it shares a cluster with the extent to
2627 * the right (towards the end of the file). If its leftmost cluster
2628 * is this extent's rightmost cluster and it is not cluster aligned,
2629 * we'll mark it as a partial that is not to be deallocated.
2630 */
2631
2632 if (ex != EXT_LAST_EXTENT(eh)) {
2633 ext4_fsblk_t current_pblk, right_pblk;
2634 long long current_cluster, right_cluster;
2635
2636 current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2637 current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
2638 right_pblk = ext4_ext_pblock(ex + 1);
2639 right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
2640 if (current_cluster == right_cluster &&
2641 EXT4_PBLK_COFF(sbi, right_pblk))
2642 *partial_cluster = -right_cluster;
2643 }
2644
2645 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2630 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2646 2631
2647 while (ex >= EXT_FIRST_EXTENT(eh) && 2632 while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2666,14 +2651,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2666 if (end < ex_ee_block) { 2651 if (end < ex_ee_block) {
2667 /* 2652 /*
2668 * We're going to skip this extent and move to another, 2653 * We're going to skip this extent and move to another,
2669 * so if this extent is not cluster aligned we have 2654 * so note that its first cluster is in use to avoid
2670 * to mark the current cluster as used to avoid 2655 * freeing it when removing blocks. Eventually, the
2671 * accidentally freeing it later on 2656 * right edge of the truncated/punched region will
2657 * be just to the left.
2672 */ 2658 */
2673 pblk = ext4_ext_pblock(ex); 2659 if (sbi->s_cluster_ratio > 1) {
2674 if (EXT4_PBLK_COFF(sbi, pblk)) 2660 pblk = ext4_ext_pblock(ex);
2675 *partial_cluster = 2661 *partial_cluster =
2676 -((long long)EXT4_B2C(sbi, pblk)); 2662 -(long long) EXT4_B2C(sbi, pblk);
2663 }
2677 ex--; 2664 ex--;
2678 ex_ee_block = le32_to_cpu(ex->ee_block); 2665 ex_ee_block = le32_to_cpu(ex->ee_block);
2679 ex_ee_len = ext4_ext_get_actual_len(ex); 2666 ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2749,8 +2736,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2749 sizeof(struct ext4_extent)); 2736 sizeof(struct ext4_extent));
2750 } 2737 }
2751 le16_add_cpu(&eh->eh_entries, -1); 2738 le16_add_cpu(&eh->eh_entries, -1);
2752 } else if (*partial_cluster > 0) 2739 }
2753 *partial_cluster = 0;
2754 2740
2755 err = ext4_ext_dirty(handle, inode, path + depth); 2741 err = ext4_ext_dirty(handle, inode, path + depth);
2756 if (err) 2742 if (err)
@@ -2769,20 +2755,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2769 /* 2755 /*
2770 * If there's a partial cluster and at least one extent remains in 2756 * If there's a partial cluster and at least one extent remains in
2771 * the leaf, free the partial cluster if it isn't shared with the 2757 * the leaf, free the partial cluster if it isn't shared with the
2772 * current extent. If there's a partial cluster and no extents 2758 * current extent. If it is shared with the current extent
2773 * remain in the leaf, it can't be freed here. It can only be 2759 * we zero partial_cluster because we've reached the start of the
2774 * freed when it's possible to determine if it's not shared with 2760 * truncated/punched region and we're done removing blocks.
2775 * any other extent - when the next leaf is processed or when space
2776 * removal is complete.
2777 */ 2761 */
2778 if (*partial_cluster > 0 && eh->eh_entries && 2762 if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
2779 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2763 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2780 *partial_cluster)) { 2764 if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
2781 int flags = get_default_free_blocks_flags(inode); 2765 ext4_free_blocks(handle, inode, NULL,
2782 2766 EXT4_C2B(sbi, *partial_cluster),
2783 ext4_free_blocks(handle, inode, NULL, 2767 sbi->s_cluster_ratio,
2784 EXT4_C2B(sbi, *partial_cluster), 2768 get_default_free_blocks_flags(inode));
2785 sbi->s_cluster_ratio, flags); 2769 }
2786 *partial_cluster = 0; 2770 *partial_cluster = 0;
2787 } 2771 }
2788 2772
@@ -2819,7 +2803,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2819int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2803int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2820 ext4_lblk_t end) 2804 ext4_lblk_t end)
2821{ 2805{
2822 struct super_block *sb = inode->i_sb; 2806 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2823 int depth = ext_depth(inode); 2807 int depth = ext_depth(inode);
2824 struct ext4_ext_path *path = NULL; 2808 struct ext4_ext_path *path = NULL;
2825 long long partial_cluster = 0; 2809 long long partial_cluster = 0;
@@ -2845,9 +2829,10 @@ again:
2845 */ 2829 */
2846 if (end < EXT_MAX_BLOCKS - 1) { 2830 if (end < EXT_MAX_BLOCKS - 1) {
2847 struct ext4_extent *ex; 2831 struct ext4_extent *ex;
2848 ext4_lblk_t ee_block; 2832 ext4_lblk_t ee_block, ex_end, lblk;
2833 ext4_fsblk_t pblk;
2849 2834
2850 /* find extent for this block */ 2835 /* find extent for or closest extent to this block */
2851 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); 2836 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2852 if (IS_ERR(path)) { 2837 if (IS_ERR(path)) {
2853 ext4_journal_stop(handle); 2838 ext4_journal_stop(handle);
@@ -2867,6 +2852,7 @@ again:
2867 } 2852 }
2868 2853
2869 ee_block = le32_to_cpu(ex->ee_block); 2854 ee_block = le32_to_cpu(ex->ee_block);
2855 ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2870 2856
2871 /* 2857 /*
2872 * See if the last block is inside the extent, if so split 2858 * See if the last block is inside the extent, if so split
@@ -2874,8 +2860,19 @@ again:
2874 * tail of the first part of the split extent in 2860 * tail of the first part of the split extent in
2875 * ext4_ext_rm_leaf(). 2861 * ext4_ext_rm_leaf().
2876 */ 2862 */
2877 if (end >= ee_block && 2863 if (end >= ee_block && end < ex_end) {
2878 end < ee_block + ext4_ext_get_actual_len(ex) - 1) { 2864
2865 /*
2866 * If we're going to split the extent, note that
2867 * the cluster containing the block after 'end' is
2868 * in use to avoid freeing it when removing blocks.
2869 */
2870 if (sbi->s_cluster_ratio > 1) {
2871 pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
2872 partial_cluster =
2873 -(long long) EXT4_B2C(sbi, pblk);
2874 }
2875
2879 /* 2876 /*
2880 * Split the extent in two so that 'end' is the last 2877 * Split the extent in two so that 'end' is the last
2881 * block in the first new extent. Also we should not 2878 * block in the first new extent. Also we should not
@@ -2886,6 +2883,24 @@ again:
2886 end + 1, 1); 2883 end + 1, 1);
2887 if (err < 0) 2884 if (err < 0)
2888 goto out; 2885 goto out;
2886
2887 } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
2888 /*
2889 * If there's an extent to the right its first cluster
2890 * contains the immediate right boundary of the
2891 * truncated/punched region. Set partial_cluster to
2892 * its negative value so it won't be freed if shared
2893 * with the current extent. The end < ee_block case
2894 * is handled in ext4_ext_rm_leaf().
2895 */
2896 lblk = ex_end + 1;
2897 err = ext4_ext_search_right(inode, path, &lblk, &pblk,
2898 &ex);
2899 if (err)
2900 goto out;
2901 if (pblk)
2902 partial_cluster =
2903 -(long long) EXT4_B2C(sbi, pblk);
2889 } 2904 }
2890 } 2905 }
2891 /* 2906 /*
@@ -2996,16 +3011,18 @@ again:
2996 trace_ext4_ext_remove_space_done(inode, start, end, depth, 3011 trace_ext4_ext_remove_space_done(inode, start, end, depth,
2997 partial_cluster, path->p_hdr->eh_entries); 3012 partial_cluster, path->p_hdr->eh_entries);
2998 3013
2999 /* If we still have something in the partial cluster and we have removed 3014 /*
3015 * If we still have something in the partial cluster and we have removed
3000 * even the first extent, then we should free the blocks in the partial 3016 * even the first extent, then we should free the blocks in the partial
3001 * cluster as well. */ 3017 * cluster as well. (This code will only run when there are no leaves
3002 if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { 3018 * to the immediate left of the truncated/punched region.)
3003 int flags = get_default_free_blocks_flags(inode); 3019 */
3004 3020 if (partial_cluster > 0 && err == 0) {
3021 /* don't zero partial_cluster since it's not used afterwards */
3005 ext4_free_blocks(handle, inode, NULL, 3022 ext4_free_blocks(handle, inode, NULL,
3006 EXT4_C2B(EXT4_SB(sb), partial_cluster), 3023 EXT4_C2B(sbi, partial_cluster),
3007 EXT4_SB(sb)->s_cluster_ratio, flags); 3024 sbi->s_cluster_ratio,
3008 partial_cluster = 0; 3025 get_default_free_blocks_flags(inode));
3009 } 3026 }
3010 3027
3011 /* TODO: flexible tree reduction should be here */ 3028 /* TODO: flexible tree reduction should be here */
@@ -4267,6 +4284,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4267 ext4_io_end_t *io = ext4_inode_aio(inode); 4284 ext4_io_end_t *io = ext4_inode_aio(inode);
4268 ext4_lblk_t cluster_offset; 4285 ext4_lblk_t cluster_offset;
4269 int set_unwritten = 0; 4286 int set_unwritten = 0;
4287 bool map_from_cluster = false;
4270 4288
4271 ext_debug("blocks %u/%u requested for inode %lu\n", 4289 ext_debug("blocks %u/%u requested for inode %lu\n",
4272 map->m_lblk, map->m_len, inode->i_ino); 4290 map->m_lblk, map->m_len, inode->i_ino);
@@ -4343,10 +4361,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4343 } 4361 }
4344 } 4362 }
4345 4363
4346 if ((sbi->s_cluster_ratio > 1) &&
4347 ext4_find_delalloc_cluster(inode, map->m_lblk))
4348 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4349
4350 /* 4364 /*
4351 * requested block isn't allocated yet; 4365 * requested block isn't allocated yet;
4352 * we couldn't try to create block if create flag is zero 4366 * we couldn't try to create block if create flag is zero
@@ -4356,15 +4370,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4356 * put just found gap into cache to speed up 4370 * put just found gap into cache to speed up
4357 * subsequent requests 4371 * subsequent requests
4358 */ 4372 */
4359 if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) 4373 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4360 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4361 goto out2; 4374 goto out2;
4362 } 4375 }
4363 4376
4364 /* 4377 /*
4365 * Okay, we need to do block allocation. 4378 * Okay, we need to do block allocation.
4366 */ 4379 */
4367 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
4368 newex.ee_block = cpu_to_le32(map->m_lblk); 4380 newex.ee_block = cpu_to_le32(map->m_lblk);
4369 cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); 4381 cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4370 4382
@@ -4376,7 +4388,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4376 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { 4388 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4377 ar.len = allocated = map->m_len; 4389 ar.len = allocated = map->m_len;
4378 newblock = map->m_pblk; 4390 newblock = map->m_pblk;
4379 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4391 map_from_cluster = true;
4380 goto got_allocated_blocks; 4392 goto got_allocated_blocks;
4381 } 4393 }
4382 4394
@@ -4397,7 +4409,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4397 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { 4409 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4398 ar.len = allocated = map->m_len; 4410 ar.len = allocated = map->m_len;
4399 newblock = map->m_pblk; 4411 newblock = map->m_pblk;
4400 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4412 map_from_cluster = true;
4401 goto got_allocated_blocks; 4413 goto got_allocated_blocks;
4402 } 4414 }
4403 4415
@@ -4523,7 +4535,7 @@ got_allocated_blocks:
4523 */ 4535 */
4524 reserved_clusters = get_reserved_cluster_alloc(inode, 4536 reserved_clusters = get_reserved_cluster_alloc(inode,
4525 map->m_lblk, allocated); 4537 map->m_lblk, allocated);
4526 if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { 4538 if (map_from_cluster) {
4527 if (reserved_clusters) { 4539 if (reserved_clusters) {
4528 /* 4540 /*
4529 * We have clusters reserved for this range. 4541 * We have clusters reserved for this range.
@@ -4620,7 +4632,6 @@ out2:
4620 4632
4621 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4633 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4622 err ? err : allocated); 4634 err ? err : allocated);
4623 ext4_es_lru_add(inode);
4624 return err ? err : allocated; 4635 return err ? err : allocated;
4625} 4636}
4626 4637
@@ -5140,7 +5151,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5140 if (ext4_has_inline_data(inode)) { 5151 if (ext4_has_inline_data(inode)) {
5141 int has_inline = 1; 5152 int has_inline = 1;
5142 5153
5143 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); 5154 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
5155 start, len);
5144 5156
5145 if (has_inline) 5157 if (has_inline)
5146 return error; 5158 return error;
@@ -5154,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5154 5166
5155 /* fallback to generic here if not in extents fmt */ 5167 /* fallback to generic here if not in extents fmt */
5156 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5168 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5157 return generic_block_fiemap(inode, fieinfo, start, len, 5169 return __generic_block_fiemap(inode, fieinfo, start, len,
5158 ext4_get_block); 5170 ext4_get_block);
5159 5171
5160 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 5172 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5161 return -EBADR; 5173 return -EBADR;
@@ -5179,7 +5191,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5179 error = ext4_fill_fiemap_extents(inode, start_blk, 5191 error = ext4_fill_fiemap_extents(inode, start_blk,
5180 len_blks, fieinfo); 5192 len_blks, fieinfo);
5181 } 5193 }
5182 ext4_es_lru_add(inode);
5183 return error; 5194 return error;
5184} 5195}
5185 5196
@@ -5239,8 +5250,6 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5239 return -EIO; 5250 return -EIO;
5240 5251
5241 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); 5252 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5242 if (!ex_last)
5243 return -EIO;
5244 5253
5245 err = ext4_access_path(handle, inode, path + depth); 5254 err = ext4_access_path(handle, inode, path + depth);
5246 if (err) 5255 if (err)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 94e7855ae71b..e04d45733976 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -147,10 +147,9 @@ static struct kmem_cache *ext4_es_cachep;
147static int __es_insert_extent(struct inode *inode, struct extent_status *newes); 147static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
148static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 148static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
149 ext4_lblk_t end); 149 ext4_lblk_t end);
150static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, 150static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
151 int nr_to_scan); 151static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
152static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, 152 struct ext4_inode_info *locked_ei);
153 struct ext4_inode_info *locked_ei);
154 153
155int __init ext4_init_es(void) 154int __init ext4_init_es(void)
156{ 155{
@@ -298,6 +297,36 @@ out:
298 trace_ext4_es_find_delayed_extent_range_exit(inode, es); 297 trace_ext4_es_find_delayed_extent_range_exit(inode, es);
299} 298}
300 299
300static void ext4_es_list_add(struct inode *inode)
301{
302 struct ext4_inode_info *ei = EXT4_I(inode);
303 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
304
305 if (!list_empty(&ei->i_es_list))
306 return;
307
308 spin_lock(&sbi->s_es_lock);
309 if (list_empty(&ei->i_es_list)) {
310 list_add_tail(&ei->i_es_list, &sbi->s_es_list);
311 sbi->s_es_nr_inode++;
312 }
313 spin_unlock(&sbi->s_es_lock);
314}
315
316static void ext4_es_list_del(struct inode *inode)
317{
318 struct ext4_inode_info *ei = EXT4_I(inode);
319 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
320
321 spin_lock(&sbi->s_es_lock);
322 if (!list_empty(&ei->i_es_list)) {
323 list_del_init(&ei->i_es_list);
324 sbi->s_es_nr_inode--;
325 WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
326 }
327 spin_unlock(&sbi->s_es_lock);
328}
329
301static struct extent_status * 330static struct extent_status *
302ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, 331ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
303 ext4_fsblk_t pblk) 332 ext4_fsblk_t pblk)
@@ -314,9 +343,10 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
314 * We don't count delayed extent because we never try to reclaim them 343 * We don't count delayed extent because we never try to reclaim them
315 */ 344 */
316 if (!ext4_es_is_delayed(es)) { 345 if (!ext4_es_is_delayed(es)) {
317 EXT4_I(inode)->i_es_lru_nr++; 346 if (!EXT4_I(inode)->i_es_shk_nr++)
347 ext4_es_list_add(inode);
318 percpu_counter_inc(&EXT4_SB(inode->i_sb)-> 348 percpu_counter_inc(&EXT4_SB(inode->i_sb)->
319 s_es_stats.es_stats_lru_cnt); 349 s_es_stats.es_stats_shk_cnt);
320 } 350 }
321 351
322 EXT4_I(inode)->i_es_all_nr++; 352 EXT4_I(inode)->i_es_all_nr++;
@@ -330,12 +360,13 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
330 EXT4_I(inode)->i_es_all_nr--; 360 EXT4_I(inode)->i_es_all_nr--;
331 percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); 361 percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
332 362
333 /* Decrease the lru counter when this es is not delayed */ 363 /* Decrease the shrink counter when this es is not delayed */
334 if (!ext4_es_is_delayed(es)) { 364 if (!ext4_es_is_delayed(es)) {
335 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); 365 BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
336 EXT4_I(inode)->i_es_lru_nr--; 366 if (!--EXT4_I(inode)->i_es_shk_nr)
367 ext4_es_list_del(inode);
337 percpu_counter_dec(&EXT4_SB(inode->i_sb)-> 368 percpu_counter_dec(&EXT4_SB(inode->i_sb)->
338 s_es_stats.es_stats_lru_cnt); 369 s_es_stats.es_stats_shk_cnt);
339 } 370 }
340 371
341 kmem_cache_free(ext4_es_cachep, es); 372 kmem_cache_free(ext4_es_cachep, es);
@@ -351,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
351static int ext4_es_can_be_merged(struct extent_status *es1, 382static int ext4_es_can_be_merged(struct extent_status *es1,
352 struct extent_status *es2) 383 struct extent_status *es2)
353{ 384{
354 if (ext4_es_status(es1) != ext4_es_status(es2)) 385 if (ext4_es_type(es1) != ext4_es_type(es2))
355 return 0; 386 return 0;
356 387
357 if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { 388 if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
@@ -394,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
394 es1 = rb_entry(node, struct extent_status, rb_node); 425 es1 = rb_entry(node, struct extent_status, rb_node);
395 if (ext4_es_can_be_merged(es1, es)) { 426 if (ext4_es_can_be_merged(es1, es)) {
396 es1->es_len += es->es_len; 427 es1->es_len += es->es_len;
428 if (ext4_es_is_referenced(es))
429 ext4_es_set_referenced(es1);
397 rb_erase(&es->rb_node, &tree->root); 430 rb_erase(&es->rb_node, &tree->root);
398 ext4_es_free_extent(inode, es); 431 ext4_es_free_extent(inode, es);
399 es = es1; 432 es = es1;
@@ -416,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
416 es1 = rb_entry(node, struct extent_status, rb_node); 449 es1 = rb_entry(node, struct extent_status, rb_node);
417 if (ext4_es_can_be_merged(es, es1)) { 450 if (ext4_es_can_be_merged(es, es1)) {
418 es->es_len += es1->es_len; 451 es->es_len += es1->es_len;
452 if (ext4_es_is_referenced(es1))
453 ext4_es_set_referenced(es);
419 rb_erase(node, &tree->root); 454 rb_erase(node, &tree->root);
420 ext4_es_free_extent(inode, es1); 455 ext4_es_free_extent(inode, es1);
421 } 456 }
@@ -683,8 +718,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
683 goto error; 718 goto error;
684retry: 719retry:
685 err = __es_insert_extent(inode, &newes); 720 err = __es_insert_extent(inode, &newes);
686 if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, 721 if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
687 EXT4_I(inode))) 722 128, EXT4_I(inode)))
688 goto retry; 723 goto retry;
689 if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) 724 if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
690 err = 0; 725 err = 0;
@@ -782,6 +817,8 @@ out:
782 es->es_lblk = es1->es_lblk; 817 es->es_lblk = es1->es_lblk;
783 es->es_len = es1->es_len; 818 es->es_len = es1->es_len;
784 es->es_pblk = es1->es_pblk; 819 es->es_pblk = es1->es_pblk;
820 if (!ext4_es_is_referenced(es))
821 ext4_es_set_referenced(es);
785 stats->es_stats_cache_hits++; 822 stats->es_stats_cache_hits++;
786 } else { 823 } else {
787 stats->es_stats_cache_misses++; 824 stats->es_stats_cache_misses++;
@@ -841,8 +878,8 @@ retry:
841 es->es_lblk = orig_es.es_lblk; 878 es->es_lblk = orig_es.es_lblk;
842 es->es_len = orig_es.es_len; 879 es->es_len = orig_es.es_len;
843 if ((err == -ENOMEM) && 880 if ((err == -ENOMEM) &&
844 __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, 881 __es_shrink(EXT4_SB(inode->i_sb),
845 EXT4_I(inode))) 882 128, EXT4_I(inode)))
846 goto retry; 883 goto retry;
847 goto out; 884 goto out;
848 } 885 }
@@ -914,6 +951,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
914 end = lblk + len - 1; 951 end = lblk + len - 1;
915 BUG_ON(end < lblk); 952 BUG_ON(end < lblk);
916 953
954 /*
955 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
956 * so that we are sure __es_shrink() is done with the inode before it
957 * is reclaimed.
958 */
917 write_lock(&EXT4_I(inode)->i_es_lock); 959 write_lock(&EXT4_I(inode)->i_es_lock);
918 err = __es_remove_extent(inode, lblk, end); 960 err = __es_remove_extent(inode, lblk, end);
919 write_unlock(&EXT4_I(inode)->i_es_lock); 961 write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -921,114 +963,75 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
921 return err; 963 return err;
922} 964}
923 965
924static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, 966static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
925 struct list_head *b) 967 struct ext4_inode_info *locked_ei)
926{
927 struct ext4_inode_info *eia, *eib;
928 eia = list_entry(a, struct ext4_inode_info, i_es_lru);
929 eib = list_entry(b, struct ext4_inode_info, i_es_lru);
930
931 if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
932 !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
933 return 1;
934 if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
935 ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
936 return -1;
937 if (eia->i_touch_when == eib->i_touch_when)
938 return 0;
939 if (time_after(eia->i_touch_when, eib->i_touch_when))
940 return 1;
941 else
942 return -1;
943}
944
945static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
946 struct ext4_inode_info *locked_ei)
947{ 968{
948 struct ext4_inode_info *ei; 969 struct ext4_inode_info *ei;
949 struct ext4_es_stats *es_stats; 970 struct ext4_es_stats *es_stats;
950 struct list_head *cur, *tmp;
951 LIST_HEAD(skipped);
952 ktime_t start_time; 971 ktime_t start_time;
953 u64 scan_time; 972 u64 scan_time;
973 int nr_to_walk;
954 int nr_shrunk = 0; 974 int nr_shrunk = 0;
955 int retried = 0, skip_precached = 1, nr_skipped = 0; 975 int retried = 0, nr_skipped = 0;
956 976
957 es_stats = &sbi->s_es_stats; 977 es_stats = &sbi->s_es_stats;
958 start_time = ktime_get(); 978 start_time = ktime_get();
959 spin_lock(&sbi->s_es_lru_lock);
960 979
961retry: 980retry:
962 list_for_each_safe(cur, tmp, &sbi->s_es_lru) { 981 spin_lock(&sbi->s_es_lock);
963 int shrunk; 982 nr_to_walk = sbi->s_es_nr_inode;
964 983 while (nr_to_walk-- > 0) {
965 /* 984 if (list_empty(&sbi->s_es_list)) {
966 * If we have already reclaimed all extents from extent 985 spin_unlock(&sbi->s_es_lock);
967 * status tree, just stop the loop immediately. 986 goto out;
968 */ 987 }
969 if (percpu_counter_read_positive( 988 ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
970 &es_stats->es_stats_lru_cnt) == 0) 989 i_es_list);
971 break; 990 /* Move the inode to the tail */
972 991 list_move_tail(&ei->i_es_list, &sbi->s_es_list);
973 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
974 992
975 /* 993 /*
976 * Skip the inode that is newer than the last_sorted 994 * Normally we try hard to avoid shrinking precached inodes,
977 * time. Normally we try hard to avoid shrinking 995 * but we will as a last resort.
978 * precached inodes, but we will as a last resort.
979 */ 996 */
980 if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || 997 if (!retried && ext4_test_inode_state(&ei->vfs_inode,
981 (skip_precached && ext4_test_inode_state(&ei->vfs_inode, 998 EXT4_STATE_EXT_PRECACHED)) {
982 EXT4_STATE_EXT_PRECACHED))) {
983 nr_skipped++; 999 nr_skipped++;
984 list_move_tail(cur, &skipped);
985 continue; 1000 continue;
986 } 1001 }
987 1002
988 if (ei->i_es_lru_nr == 0 || ei == locked_ei || 1003 if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
989 !write_trylock(&ei->i_es_lock)) 1004 nr_skipped++;
990 continue; 1005 continue;
1006 }
1007 /*
1008 * Now we hold i_es_lock which protects us from inode reclaim
1009 * freeing inode under us
1010 */
1011 spin_unlock(&sbi->s_es_lock);
991 1012
992 shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); 1013 nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
993 if (ei->i_es_lru_nr == 0)
994 list_del_init(&ei->i_es_lru);
995 write_unlock(&ei->i_es_lock); 1014 write_unlock(&ei->i_es_lock);
996 1015
997 nr_shrunk += shrunk; 1016 if (nr_to_scan <= 0)
998 nr_to_scan -= shrunk; 1017 goto out;
999 if (nr_to_scan == 0) 1018 spin_lock(&sbi->s_es_lock);
1000 break;
1001 } 1019 }
1002 1020 spin_unlock(&sbi->s_es_lock);
1003 /* Move the newer inodes into the tail of the LRU list. */
1004 list_splice_tail(&skipped, &sbi->s_es_lru);
1005 INIT_LIST_HEAD(&skipped);
1006 1021
1007 /* 1022 /*
1008 * If we skipped any inodes, and we weren't able to make any 1023 * If we skipped any inodes, and we weren't able to make any
1009 * forward progress, sort the list and try again. 1024 * forward progress, try again to scan precached inodes.
1010 */ 1025 */
1011 if ((nr_shrunk == 0) && nr_skipped && !retried) { 1026 if ((nr_shrunk == 0) && nr_skipped && !retried) {
1012 retried++; 1027 retried++;
1013 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
1014 es_stats->es_stats_last_sorted = jiffies;
1015 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
1016 i_es_lru);
1017 /*
1018 * If there are no non-precached inodes left on the
1019 * list, start releasing precached extents.
1020 */
1021 if (ext4_test_inode_state(&ei->vfs_inode,
1022 EXT4_STATE_EXT_PRECACHED))
1023 skip_precached = 0;
1024 goto retry; 1028 goto retry;
1025 } 1029 }
1026 1030
1027 spin_unlock(&sbi->s_es_lru_lock);
1028
1029 if (locked_ei && nr_shrunk == 0) 1031 if (locked_ei && nr_shrunk == 0)
1030 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); 1032 nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
1031 1033
1034out:
1032 scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1035 scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1033 if (likely(es_stats->es_stats_scan_time)) 1036 if (likely(es_stats->es_stats_scan_time))
1034 es_stats->es_stats_scan_time = (scan_time + 1037 es_stats->es_stats_scan_time = (scan_time +
@@ -1043,7 +1046,7 @@ retry:
1043 else 1046 else
1044 es_stats->es_stats_shrunk = nr_shrunk; 1047 es_stats->es_stats_shrunk = nr_shrunk;
1045 1048
1046 trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, 1049 trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1047 nr_skipped, retried); 1050 nr_skipped, retried);
1048 return nr_shrunk; 1051 return nr_shrunk;
1049} 1052}
@@ -1055,7 +1058,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
1055 struct ext4_sb_info *sbi; 1058 struct ext4_sb_info *sbi;
1056 1059
1057 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); 1060 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1058 nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); 1061 nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1059 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); 1062 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1060 return nr; 1063 return nr;
1061} 1064}
@@ -1068,13 +1071,13 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
1068 int nr_to_scan = sc->nr_to_scan; 1071 int nr_to_scan = sc->nr_to_scan;
1069 int ret, nr_shrunk; 1072 int ret, nr_shrunk;
1070 1073
1071 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); 1074 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1072 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); 1075 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1073 1076
1074 if (!nr_to_scan) 1077 if (!nr_to_scan)
1075 return ret; 1078 return ret;
1076 1079
1077 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); 1080 nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
1078 1081
1079 trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); 1082 trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1080 return nr_shrunk; 1083 return nr_shrunk;
@@ -1102,28 +1105,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
1102 return 0; 1105 return 0;
1103 1106
1104 /* here we just find an inode that has the max nr. of objects */ 1107 /* here we just find an inode that has the max nr. of objects */
1105 spin_lock(&sbi->s_es_lru_lock); 1108 spin_lock(&sbi->s_es_lock);
1106 list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { 1109 list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
1107 inode_cnt++; 1110 inode_cnt++;
1108 if (max && max->i_es_all_nr < ei->i_es_all_nr) 1111 if (max && max->i_es_all_nr < ei->i_es_all_nr)
1109 max = ei; 1112 max = ei;
1110 else if (!max) 1113 else if (!max)
1111 max = ei; 1114 max = ei;
1112 } 1115 }
1113 spin_unlock(&sbi->s_es_lru_lock); 1116 spin_unlock(&sbi->s_es_lock);
1114 1117
1115 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", 1118 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
1116 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), 1119 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1117 percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); 1120 percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1118 seq_printf(seq, " %lu/%lu cache hits/misses\n", 1121 seq_printf(seq, " %lu/%lu cache hits/misses\n",
1119 es_stats->es_stats_cache_hits, 1122 es_stats->es_stats_cache_hits,
1120 es_stats->es_stats_cache_misses); 1123 es_stats->es_stats_cache_misses);
1121 if (es_stats->es_stats_last_sorted != 0)
1122 seq_printf(seq, " %u ms last sorted interval\n",
1123 jiffies_to_msecs(jiffies -
1124 es_stats->es_stats_last_sorted));
1125 if (inode_cnt) 1124 if (inode_cnt)
1126 seq_printf(seq, " %d inodes on lru list\n", inode_cnt); 1125 seq_printf(seq, " %d inodes on list\n", inode_cnt);
1127 1126
1128 seq_printf(seq, "average:\n %llu us scan time\n", 1127 seq_printf(seq, "average:\n %llu us scan time\n",
1129 div_u64(es_stats->es_stats_scan_time, 1000)); 1128 div_u64(es_stats->es_stats_scan_time, 1000));
@@ -1132,7 +1131,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
1132 seq_printf(seq, 1131 seq_printf(seq,
1133 "maximum:\n %lu inode (%u objects, %u reclaimable)\n" 1132 "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
1134 " %llu us max scan time\n", 1133 " %llu us max scan time\n",
1135 max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, 1134 max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
1136 div_u64(es_stats->es_stats_max_scan_time, 1000)); 1135 div_u64(es_stats->es_stats_max_scan_time, 1000));
1137 1136
1138 return 0; 1137 return 0;
@@ -1181,9 +1180,11 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1181{ 1180{
1182 int err; 1181 int err;
1183 1182
1184 INIT_LIST_HEAD(&sbi->s_es_lru); 1183 /* Make sure we have enough bits for physical block number */
1185 spin_lock_init(&sbi->s_es_lru_lock); 1184 BUILD_BUG_ON(ES_SHIFT < 48);
1186 sbi->s_es_stats.es_stats_last_sorted = 0; 1185 INIT_LIST_HEAD(&sbi->s_es_list);
1186 sbi->s_es_nr_inode = 0;
1187 spin_lock_init(&sbi->s_es_lock);
1187 sbi->s_es_stats.es_stats_shrunk = 0; 1188 sbi->s_es_stats.es_stats_shrunk = 0;
1188 sbi->s_es_stats.es_stats_cache_hits = 0; 1189 sbi->s_es_stats.es_stats_cache_hits = 0;
1189 sbi->s_es_stats.es_stats_cache_misses = 0; 1190 sbi->s_es_stats.es_stats_cache_misses = 0;
@@ -1192,7 +1193,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1192 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); 1193 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1193 if (err) 1194 if (err)
1194 return err; 1195 return err;
1195 err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL); 1196 err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1196 if (err) 1197 if (err)
1197 goto err1; 1198 goto err1;
1198 1199
@@ -1210,7 +1211,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1210 return 0; 1211 return 0;
1211 1212
1212err2: 1213err2:
1213 percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); 1214 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1214err1: 1215err1:
1215 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1216 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1216 return err; 1217 return err;
@@ -1221,71 +1222,83 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1221 if (sbi->s_proc) 1222 if (sbi->s_proc)
1222 remove_proc_entry("es_shrinker_info", sbi->s_proc); 1223 remove_proc_entry("es_shrinker_info", sbi->s_proc);
1223 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1224 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1224 percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); 1225 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1225 unregister_shrinker(&sbi->s_es_shrinker); 1226 unregister_shrinker(&sbi->s_es_shrinker);
1226} 1227}
1227 1228
1228void ext4_es_lru_add(struct inode *inode) 1229/*
1230 * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
1231 * most *nr_to_scan extents, update *nr_to_scan accordingly.
1232 *
1233 * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
1234 * Increment *nr_shrunk by the number of reclaimed extents. Also update
1235 * ei->i_es_shrink_lblk to where we should continue scanning.
1236 */
1237static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1238 int *nr_to_scan, int *nr_shrunk)
1229{ 1239{
1230 struct ext4_inode_info *ei = EXT4_I(inode); 1240 struct inode *inode = &ei->vfs_inode;
1231 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1241 struct ext4_es_tree *tree = &ei->i_es_tree;
1232 1242 struct extent_status *es;
1233 ei->i_touch_when = jiffies; 1243 struct rb_node *node;
1234
1235 if (!list_empty(&ei->i_es_lru))
1236 return;
1237 1244
1238 spin_lock(&sbi->s_es_lru_lock); 1245 es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1239 if (list_empty(&ei->i_es_lru)) 1246 if (!es)
1240 list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); 1247 goto out_wrap;
1241 spin_unlock(&sbi->s_es_lru_lock); 1248 node = &es->rb_node;
1242} 1249 while (*nr_to_scan > 0) {
1250 if (es->es_lblk > end) {
1251 ei->i_es_shrink_lblk = end + 1;
1252 return 0;
1253 }
1243 1254
1244void ext4_es_lru_del(struct inode *inode) 1255 (*nr_to_scan)--;
1245{ 1256 node = rb_next(&es->rb_node);
1246 struct ext4_inode_info *ei = EXT4_I(inode); 1257 /*
1247 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1258 * We can't reclaim delayed extent from status tree because
1259 * fiemap, bigallic, and seek_data/hole need to use it.
1260 */
1261 if (ext4_es_is_delayed(es))
1262 goto next;
1263 if (ext4_es_is_referenced(es)) {
1264 ext4_es_clear_referenced(es);
1265 goto next;
1266 }
1248 1267
1249 spin_lock(&sbi->s_es_lru_lock); 1268 rb_erase(&es->rb_node, &tree->root);
1250 if (!list_empty(&ei->i_es_lru)) 1269 ext4_es_free_extent(inode, es);
1251 list_del_init(&ei->i_es_lru); 1270 (*nr_shrunk)++;
1252 spin_unlock(&sbi->s_es_lru_lock); 1271next:
1272 if (!node)
1273 goto out_wrap;
1274 es = rb_entry(node, struct extent_status, rb_node);
1275 }
1276 ei->i_es_shrink_lblk = es->es_lblk;
1277 return 1;
1278out_wrap:
1279 ei->i_es_shrink_lblk = 0;
1280 return 0;
1253} 1281}
1254 1282
1255static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, 1283static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1256 int nr_to_scan)
1257{ 1284{
1258 struct inode *inode = &ei->vfs_inode; 1285 struct inode *inode = &ei->vfs_inode;
1259 struct ext4_es_tree *tree = &ei->i_es_tree; 1286 int nr_shrunk = 0;
1260 struct rb_node *node; 1287 ext4_lblk_t start = ei->i_es_shrink_lblk;
1261 struct extent_status *es;
1262 unsigned long nr_shrunk = 0;
1263 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, 1288 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1264 DEFAULT_RATELIMIT_BURST); 1289 DEFAULT_RATELIMIT_BURST);
1265 1290
1266 if (ei->i_es_lru_nr == 0) 1291 if (ei->i_es_shk_nr == 0)
1267 return 0; 1292 return 0;
1268 1293
1269 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && 1294 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1270 __ratelimit(&_rs)) 1295 __ratelimit(&_rs))
1271 ext4_warning(inode->i_sb, "forced shrink of precached extents"); 1296 ext4_warning(inode->i_sb, "forced shrink of precached extents");
1272 1297
1273 node = rb_first(&tree->root); 1298 if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
1274 while (node != NULL) { 1299 start != 0)
1275 es = rb_entry(node, struct extent_status, rb_node); 1300 es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
1276 node = rb_next(&es->rb_node); 1301
1277 /* 1302 ei->i_es_tree.cache_es = NULL;
1278 * We can't reclaim delayed extent from status tree because
1279 * fiemap, bigallic, and seek_data/hole need to use it.
1280 */
1281 if (!ext4_es_is_delayed(es)) {
1282 rb_erase(&es->rb_node, &tree->root);
1283 ext4_es_free_extent(inode, es);
1284 nr_shrunk++;
1285 if (--nr_to_scan == 0)
1286 break;
1287 }
1288 }
1289 tree->cache_es = NULL;
1290 return nr_shrunk; 1303 return nr_shrunk;
1291} 1304}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index efd5f970b501..691b52613ce4 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -29,25 +29,28 @@
29/* 29/*
30 * These flags live in the high bits of extent_status.es_pblk 30 * These flags live in the high bits of extent_status.es_pblk
31 */ 31 */
32#define ES_SHIFT 60 32enum {
33 33 ES_WRITTEN_B,
34#define EXTENT_STATUS_WRITTEN (1 << 3) 34 ES_UNWRITTEN_B,
35#define EXTENT_STATUS_UNWRITTEN (1 << 2) 35 ES_DELAYED_B,
36#define EXTENT_STATUS_DELAYED (1 << 1) 36 ES_HOLE_B,
37#define EXTENT_STATUS_HOLE (1 << 0) 37 ES_REFERENCED_B,
38 ES_FLAGS
39};
38 40
39#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ 41#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
40 EXTENT_STATUS_UNWRITTEN | \ 42#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
41 EXTENT_STATUS_DELAYED | \
42 EXTENT_STATUS_HOLE)
43 43
44#define ES_WRITTEN (1ULL << 63) 44#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
45#define ES_UNWRITTEN (1ULL << 62) 45#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
46#define ES_DELAYED (1ULL << 61) 46#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
47#define ES_HOLE (1ULL << 60) 47#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B)
48#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B)
48 49
49#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ 50#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
50 ES_DELAYED | ES_HOLE) 51 EXTENT_STATUS_UNWRITTEN | \
52 EXTENT_STATUS_DELAYED | \
53 EXTENT_STATUS_HOLE) << ES_SHIFT)
51 54
52struct ext4_sb_info; 55struct ext4_sb_info;
53struct ext4_extent; 56struct ext4_extent;
@@ -65,14 +68,13 @@ struct ext4_es_tree {
65}; 68};
66 69
67struct ext4_es_stats { 70struct ext4_es_stats {
68 unsigned long es_stats_last_sorted;
69 unsigned long es_stats_shrunk; 71 unsigned long es_stats_shrunk;
70 unsigned long es_stats_cache_hits; 72 unsigned long es_stats_cache_hits;
71 unsigned long es_stats_cache_misses; 73 unsigned long es_stats_cache_misses;
72 u64 es_stats_scan_time; 74 u64 es_stats_scan_time;
73 u64 es_stats_max_scan_time; 75 u64 es_stats_max_scan_time;
74 struct percpu_counter es_stats_all_cnt; 76 struct percpu_counter es_stats_all_cnt;
75 struct percpu_counter es_stats_lru_cnt; 77 struct percpu_counter es_stats_shk_cnt;
76}; 78};
77 79
78extern int __init ext4_init_es(void); 80extern int __init ext4_init_es(void);
@@ -93,29 +95,49 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode,
93extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 95extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
94 struct extent_status *es); 96 struct extent_status *es);
95 97
98static inline unsigned int ext4_es_status(struct extent_status *es)
99{
100 return es->es_pblk >> ES_SHIFT;
101}
102
103static inline unsigned int ext4_es_type(struct extent_status *es)
104{
105 return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
106}
107
96static inline int ext4_es_is_written(struct extent_status *es) 108static inline int ext4_es_is_written(struct extent_status *es)
97{ 109{
98 return (es->es_pblk & ES_WRITTEN) != 0; 110 return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0;
99} 111}
100 112
101static inline int ext4_es_is_unwritten(struct extent_status *es) 113static inline int ext4_es_is_unwritten(struct extent_status *es)
102{ 114{
103 return (es->es_pblk & ES_UNWRITTEN) != 0; 115 return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0;
104} 116}
105 117
106static inline int ext4_es_is_delayed(struct extent_status *es) 118static inline int ext4_es_is_delayed(struct extent_status *es)
107{ 119{
108 return (es->es_pblk & ES_DELAYED) != 0; 120 return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
109} 121}
110 122
111static inline int ext4_es_is_hole(struct extent_status *es) 123static inline int ext4_es_is_hole(struct extent_status *es)
112{ 124{
113 return (es->es_pblk & ES_HOLE) != 0; 125 return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
114} 126}
115 127
116static inline unsigned int ext4_es_status(struct extent_status *es) 128static inline void ext4_es_set_referenced(struct extent_status *es)
117{ 129{
118 return es->es_pblk >> ES_SHIFT; 130 es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
131}
132
133static inline void ext4_es_clear_referenced(struct extent_status *es)
134{
135 es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
136}
137
138static inline int ext4_es_is_referenced(struct extent_status *es)
139{
140 return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0;
119} 141}
120 142
121static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) 143static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
@@ -135,23 +157,19 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
135static inline void ext4_es_store_status(struct extent_status *es, 157static inline void ext4_es_store_status(struct extent_status *es,
136 unsigned int status) 158 unsigned int status)
137{ 159{
138 es->es_pblk = (((ext4_fsblk_t) 160 es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
139 (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | 161 (es->es_pblk & ~ES_MASK);
140 (es->es_pblk & ~ES_MASK));
141} 162}
142 163
143static inline void ext4_es_store_pblock_status(struct extent_status *es, 164static inline void ext4_es_store_pblock_status(struct extent_status *es,
144 ext4_fsblk_t pb, 165 ext4_fsblk_t pb,
145 unsigned int status) 166 unsigned int status)
146{ 167{
147 es->es_pblk = (((ext4_fsblk_t) 168 es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
148 (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | 169 (pb & ~ES_MASK);
149 (pb & ~ES_MASK));
150} 170}
151 171
152extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); 172extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
153extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); 173extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
154extern void ext4_es_lru_add(struct inode *inode);
155extern void ext4_es_lru_del(struct inode *inode);
156 174
157#endif /* _EXT4_EXTENTS_STATUS_H */ 175#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8131be8c0af3..513c12cf444c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -273,24 +273,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
273 * we determine this extent as a data or a hole according to whether the 273 * we determine this extent as a data or a hole according to whether the
274 * page cache has data or not. 274 * page cache has data or not.
275 */ 275 */
276static int ext4_find_unwritten_pgoff(struct inode *inode, 276static int ext4_find_unwritten_pgoff(struct inode *inode, int whence,
277 int whence, 277 loff_t endoff, loff_t *offset)
278 struct ext4_map_blocks *map,
279 loff_t *offset)
280{ 278{
281 struct pagevec pvec; 279 struct pagevec pvec;
282 unsigned int blkbits;
283 pgoff_t index; 280 pgoff_t index;
284 pgoff_t end; 281 pgoff_t end;
285 loff_t endoff;
286 loff_t startoff; 282 loff_t startoff;
287 loff_t lastoff; 283 loff_t lastoff;
288 int found = 0; 284 int found = 0;
289 285
290 blkbits = inode->i_sb->s_blocksize_bits;
291 startoff = *offset; 286 startoff = *offset;
292 lastoff = startoff; 287 lastoff = startoff;
293 endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; 288
294 289
295 index = startoff >> PAGE_CACHE_SHIFT; 290 index = startoff >> PAGE_CACHE_SHIFT;
296 end = endoff >> PAGE_CACHE_SHIFT; 291 end = endoff >> PAGE_CACHE_SHIFT;
@@ -408,147 +403,144 @@ out:
408static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 403static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
409{ 404{
410 struct inode *inode = file->f_mapping->host; 405 struct inode *inode = file->f_mapping->host;
411 struct ext4_map_blocks map; 406 struct fiemap_extent_info fie;
412 struct extent_status es; 407 struct fiemap_extent ext[2];
413 ext4_lblk_t start, last, end; 408 loff_t next;
414 loff_t dataoff, isize; 409 int i, ret = 0;
415 int blkbits;
416 int ret = 0;
417 410
418 mutex_lock(&inode->i_mutex); 411 mutex_lock(&inode->i_mutex);
419 412 if (offset >= inode->i_size) {
420 isize = i_size_read(inode);
421 if (offset >= isize) {
422 mutex_unlock(&inode->i_mutex); 413 mutex_unlock(&inode->i_mutex);
423 return -ENXIO; 414 return -ENXIO;
424 } 415 }
425 416 fie.fi_flags = 0;
426 blkbits = inode->i_sb->s_blocksize_bits; 417 fie.fi_extents_max = 2;
427 start = offset >> blkbits; 418 fie.fi_extents_start = (struct fiemap_extent __user *) &ext;
428 last = start; 419 while (1) {
429 end = isize >> blkbits; 420 mm_segment_t old_fs = get_fs();
430 dataoff = offset; 421
431 422 fie.fi_extents_mapped = 0;
432 do { 423 memset(ext, 0, sizeof(*ext) * fie.fi_extents_max);
433 map.m_lblk = last; 424
434 map.m_len = end - last + 1; 425 set_fs(get_ds());
435 ret = ext4_map_blocks(NULL, inode, &map, 0); 426 ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
436 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 427 set_fs(old_fs);
437 if (last != start) 428 if (ret)
438 dataoff = (loff_t)last << blkbits;
439 break; 429 break;
440 }
441 430
442 /* 431 /* No extents found, EOF */
443 * If there is a delay extent at this offset, 432 if (!fie.fi_extents_mapped) {
444 * it will be as a data. 433 ret = -ENXIO;
445 */
446 ext4_es_find_delayed_extent_range(inode, last, last, &es);
447 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
448 if (last != start)
449 dataoff = (loff_t)last << blkbits;
450 break; 434 break;
451 } 435 }
436 for (i = 0; i < fie.fi_extents_mapped; i++) {
437 next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
452 438
453 /* 439 if (offset < (loff_t)ext[i].fe_logical)
454 * If there is a unwritten extent at this offset, 440 offset = (loff_t)ext[i].fe_logical;
455 * it will be as a data or a hole according to page 441 /*
456 * cache that has data or not. 442 * If extent is not unwritten, then it contains valid
457 */ 443 * data, mapped or delayed.
458 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 444 */
459 int unwritten; 445 if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN))
460 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, 446 goto out;
461 &map, &dataoff);
462 if (unwritten)
463 break;
464 }
465 447
466 last++; 448 /*
467 dataoff = (loff_t)last << blkbits; 449 * If there is a unwritten extent at this offset,
468 } while (last <= end); 450 * it will be as a data or a hole according to page
451 * cache that has data or not.
452 */
453 if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
454 next, &offset))
455 goto out;
469 456
457 if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
458 ret = -ENXIO;
459 goto out;
460 }
461 offset = next;
462 }
463 }
464 if (offset > inode->i_size)
465 offset = inode->i_size;
466out:
470 mutex_unlock(&inode->i_mutex); 467 mutex_unlock(&inode->i_mutex);
468 if (ret)
469 return ret;
471 470
472 if (dataoff > isize) 471 return vfs_setpos(file, offset, maxsize);
473 return -ENXIO;
474
475 return vfs_setpos(file, dataoff, maxsize);
476} 472}
477 473
478/* 474/*
479 * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 475 * ext4_seek_hole() retrieves the offset for SEEK_HOLE
480 */ 476 */
481static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 477static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
482{ 478{
483 struct inode *inode = file->f_mapping->host; 479 struct inode *inode = file->f_mapping->host;
484 struct ext4_map_blocks map; 480 struct fiemap_extent_info fie;
485 struct extent_status es; 481 struct fiemap_extent ext[2];
486 ext4_lblk_t start, last, end; 482 loff_t next;
487 loff_t holeoff, isize; 483 int i, ret = 0;
488 int blkbits;
489 int ret = 0;
490 484
491 mutex_lock(&inode->i_mutex); 485 mutex_lock(&inode->i_mutex);
492 486 if (offset >= inode->i_size) {
493 isize = i_size_read(inode);
494 if (offset >= isize) {
495 mutex_unlock(&inode->i_mutex); 487 mutex_unlock(&inode->i_mutex);
496 return -ENXIO; 488 return -ENXIO;
497 } 489 }
498 490
499 blkbits = inode->i_sb->s_blocksize_bits; 491 fie.fi_flags = 0;
500 start = offset >> blkbits; 492 fie.fi_extents_max = 2;
501 last = start; 493 fie.fi_extents_start = (struct fiemap_extent __user *)&ext;
502 end = isize >> blkbits; 494 while (1) {
503 holeoff = offset; 495 mm_segment_t old_fs = get_fs();
504 496
505 do { 497 fie.fi_extents_mapped = 0;
506 map.m_lblk = last; 498 memset(ext, 0, sizeof(*ext));
507 map.m_len = end - last + 1;
508 ret = ext4_map_blocks(NULL, inode, &map, 0);
509 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
510 last += ret;
511 holeoff = (loff_t)last << blkbits;
512 continue;
513 }
514 499
515 /* 500 set_fs(get_ds());
516 * If there is a delay extent at this offset, 501 ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
517 * we will skip this extent. 502 set_fs(old_fs);
518 */ 503 if (ret)
519 ext4_es_find_delayed_extent_range(inode, last, last, &es); 504 break;
520 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
521 last = es.es_lblk + es.es_len;
522 holeoff = (loff_t)last << blkbits;
523 continue;
524 }
525 505
526 /* 506 /* No extents found */
527 * If there is a unwritten extent at this offset, 507 if (!fie.fi_extents_mapped)
528 * it will be as a data or a hole according to page 508 break;
529 * cache that has data or not. 509
530 */ 510 for (i = 0; i < fie.fi_extents_mapped; i++) {
531 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 511 next = (loff_t)(ext[i].fe_logical + ext[i].fe_length);
532 int unwritten; 512 /*
533 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, 513 * If extent is not unwritten, then it contains valid
534 &map, &holeoff); 514 * data, mapped or delayed.
535 if (!unwritten) { 515 */
536 last += ret; 516 if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) {
537 holeoff = (loff_t)last << blkbits; 517 if (offset < (loff_t)ext[i].fe_logical)
518 goto out;
519 offset = next;
538 continue; 520 continue;
539 } 521 }
540 } 522 /*
541 523 * If there is a unwritten extent at this offset,
542 /* find a hole */ 524 * it will be as a data or a hole according to page
543 break; 525 * cache that has data or not.
544 } while (last <= end); 526 */
527 if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
528 next, &offset))
529 goto out;
545 530
531 offset = next;
532 if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
533 goto out;
534 }
535 }
536 if (offset > inode->i_size)
537 offset = inode->i_size;
538out:
546 mutex_unlock(&inode->i_mutex); 539 mutex_unlock(&inode->i_mutex);
540 if (ret)
541 return ret;
547 542
548 if (holeoff > isize) 543 return vfs_setpos(file, offset, maxsize);
549 holeoff = isize;
550
551 return vfs_setpos(file, holeoff, maxsize);
552} 544}
553 545
554/* 546/*
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3ea62695abce..4b143febf21f 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -811,8 +811,11 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
811 ret = __block_write_begin(page, 0, inline_size, 811 ret = __block_write_begin(page, 0, inline_size,
812 ext4_da_get_block_prep); 812 ext4_da_get_block_prep);
813 if (ret) { 813 if (ret) {
814 up_read(&EXT4_I(inode)->xattr_sem);
815 unlock_page(page);
816 page_cache_release(page);
814 ext4_truncate_failed_write(inode); 817 ext4_truncate_failed_write(inode);
815 goto out; 818 return ret;
816 } 819 }
817 820
818 SetPageDirty(page); 821 SetPageDirty(page);
@@ -870,6 +873,12 @@ retry_journal:
870 goto out_journal; 873 goto out_journal;
871 } 874 }
872 875
876 /*
877 * We cannot recurse into the filesystem as the transaction
878 * is already started.
879 */
880 flags |= AOP_FLAG_NOFS;
881
873 if (ret == -ENOSPC) { 882 if (ret == -ENOSPC) {
874 ret = ext4_da_convert_inline_data_to_extent(mapping, 883 ret = ext4_da_convert_inline_data_to_extent(mapping,
875 inode, 884 inode,
@@ -882,11 +891,6 @@ retry_journal:
882 goto out; 891 goto out;
883 } 892 }
884 893
885 /*
886 * We cannot recurse into the filesystem as the transaction
887 * is already started.
888 */
889 flags |= AOP_FLAG_NOFS;
890 894
891 page = grab_cache_page_write_begin(mapping, 0, flags); 895 page = grab_cache_page_write_begin(mapping, 0, flags);
892 if (!page) { 896 if (!page) {
@@ -1807,11 +1811,12 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
1807 1811
1808int ext4_inline_data_fiemap(struct inode *inode, 1812int ext4_inline_data_fiemap(struct inode *inode,
1809 struct fiemap_extent_info *fieinfo, 1813 struct fiemap_extent_info *fieinfo,
1810 int *has_inline) 1814 int *has_inline, __u64 start, __u64 len)
1811{ 1815{
1812 __u64 physical = 0; 1816 __u64 physical = 0;
1813 __u64 length; 1817 __u64 inline_len;
1814 __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; 1818 __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
1819 FIEMAP_EXTENT_LAST;
1815 int error = 0; 1820 int error = 0;
1816 struct ext4_iloc iloc; 1821 struct ext4_iloc iloc;
1817 1822
@@ -1820,6 +1825,13 @@ int ext4_inline_data_fiemap(struct inode *inode,
1820 *has_inline = 0; 1825 *has_inline = 0;
1821 goto out; 1826 goto out;
1822 } 1827 }
1828 inline_len = min_t(size_t, ext4_get_inline_size(inode),
1829 i_size_read(inode));
1830 if (start >= inline_len)
1831 goto out;
1832 if (start + len < inline_len)
1833 inline_len = start + len;
1834 inline_len -= start;
1823 1835
1824 error = ext4_get_inode_loc(inode, &iloc); 1836 error = ext4_get_inode_loc(inode, &iloc);
1825 if (error) 1837 if (error)
@@ -1828,11 +1840,10 @@ int ext4_inline_data_fiemap(struct inode *inode,
1828 physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; 1840 physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
1829 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; 1841 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
1830 physical += offsetof(struct ext4_inode, i_block); 1842 physical += offsetof(struct ext4_inode, i_block);
1831 length = i_size_read(inode);
1832 1843
1833 if (physical) 1844 if (physical)
1834 error = fiemap_fill_next_extent(fieinfo, 0, physical, 1845 error = fiemap_fill_next_extent(fieinfo, start, physical,
1835 length, flags); 1846 inline_len, flags);
1836 brelse(iloc.bh); 1847 brelse(iloc.bh);
1837out: 1848out:
1838 up_read(&EXT4_I(inode)->xattr_sem); 1849 up_read(&EXT4_I(inode)->xattr_sem);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3356ab5395f4..5653fa42930b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -416,11 +416,6 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
416 } 416 }
417 if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 417 if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
418 up_read((&EXT4_I(inode)->i_data_sem)); 418 up_read((&EXT4_I(inode)->i_data_sem));
419 /*
420 * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag
421 * because it shouldn't be marked in es_map->m_flags.
422 */
423 map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
424 419
425 /* 420 /*
426 * We don't check m_len because extent will be collpased in status 421 * We don't check m_len because extent will be collpased in status
@@ -491,7 +486,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
491 486
492 /* Lookup extent status tree firstly */ 487 /* Lookup extent status tree firstly */
493 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 488 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
494 ext4_es_lru_add(inode);
495 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 489 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
496 map->m_pblk = ext4_es_pblock(&es) + 490 map->m_pblk = ext4_es_pblock(&es) +
497 map->m_lblk - es.es_lblk; 491 map->m_lblk - es.es_lblk;
@@ -1393,7 +1387,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1393 1387
1394 /* Lookup extent status tree firstly */ 1388 /* Lookup extent status tree firstly */
1395 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1389 if (ext4_es_lookup_extent(inode, iblock, &es)) {
1396 ext4_es_lru_add(inode);
1397 if (ext4_es_is_hole(&es)) { 1390 if (ext4_es_is_hole(&es)) {
1398 retval = 0; 1391 retval = 0;
1399 down_read(&EXT4_I(inode)->i_data_sem); 1392 down_read(&EXT4_I(inode)->i_data_sem);
@@ -1434,24 +1427,12 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1434 * file system block. 1427 * file system block.
1435 */ 1428 */
1436 down_read(&EXT4_I(inode)->i_data_sem); 1429 down_read(&EXT4_I(inode)->i_data_sem);
1437 if (ext4_has_inline_data(inode)) { 1430 if (ext4_has_inline_data(inode))
1438 /*
1439 * We will soon create blocks for this page, and let
1440 * us pretend as if the blocks aren't allocated yet.
1441 * In case of clusters, we have to handle the work
1442 * of mapping from cluster so that the reserved space
1443 * is calculated properly.
1444 */
1445 if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
1446 ext4_find_delalloc_cluster(inode, map->m_lblk))
1447 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
1448 retval = 0; 1431 retval = 0;
1449 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1432 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1450 retval = ext4_ext_map_blocks(NULL, inode, map, 1433 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1451 EXT4_GET_BLOCKS_NO_PUT_HOLE);
1452 else 1434 else
1453 retval = ext4_ind_map_blocks(NULL, inode, map, 1435 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
1454 EXT4_GET_BLOCKS_NO_PUT_HOLE);
1455 1436
1456add_delayed: 1437add_delayed:
1457 if (retval == 0) { 1438 if (retval == 0) {
@@ -1465,7 +1446,8 @@ add_delayed:
1465 * then we don't need to reserve it again. However we still need 1446 * then we don't need to reserve it again. However we still need
1466 * to reserve metadata for every block we're going to write. 1447 * to reserve metadata for every block we're going to write.
1467 */ 1448 */
1468 if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { 1449 if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 ||
1450 !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
1469 ret = ext4_da_reserve_space(inode, iblock); 1451 ret = ext4_da_reserve_space(inode, iblock);
1470 if (ret) { 1452 if (ret) {
1471 /* not enough space to reserve */ 1453 /* not enough space to reserve */
@@ -1481,11 +1463,6 @@ add_delayed:
1481 goto out_unlock; 1463 goto out_unlock;
1482 } 1464 }
1483 1465
1484 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
1485 * and it should not appear on the bh->b_state.
1486 */
1487 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
1488
1489 map_bh(bh, inode->i_sb, invalid_block); 1466 map_bh(bh, inode->i_sb, invalid_block);
1490 set_buffer_new(bh); 1467 set_buffer_new(bh);
1491 set_buffer_delay(bh); 1468 set_buffer_delay(bh);
@@ -3643,7 +3620,7 @@ out_stop:
3643 * If this was a simple ftruncate() and the file will remain alive, 3620 * If this was a simple ftruncate() and the file will remain alive,
3644 * then we need to clear up the orphan record which we created above. 3621 * then we need to clear up the orphan record which we created above.
3645 * However, if this was a real unlink then we were called by 3622 * However, if this was a real unlink then we were called by
3646 * ext4_delete_inode(), and we allow that function to clean up the 3623 * ext4_evict_inode(), and we allow that function to clean up the
3647 * orphan info for us. 3624 * orphan info for us.
3648 */ 3625 */
3649 if (inode->i_nlink) 3626 if (inode->i_nlink)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bfda18a15592..f58a0d106726 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -78,8 +78,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
78 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); 78 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
79 ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); 79 ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
80 ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); 80 ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
81 ext4_es_lru_del(inode1);
82 ext4_es_lru_del(inode2);
83 81
84 isize = i_size_read(inode1); 82 isize = i_size_read(inode1);
85 i_size_write(inode1, i_size_read(inode2)); 83 i_size_write(inode1, i_size_read(inode2));
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dbfe15c2533c..8d1e60214ef0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2358,7 +2358,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2358 if (sbi->s_group_info) { 2358 if (sbi->s_group_info) {
2359 memcpy(new_groupinfo, sbi->s_group_info, 2359 memcpy(new_groupinfo, sbi->s_group_info,
2360 sbi->s_group_info_size * sizeof(*sbi->s_group_info)); 2360 sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2361 ext4_kvfree(sbi->s_group_info); 2361 kvfree(sbi->s_group_info);
2362 } 2362 }
2363 sbi->s_group_info = new_groupinfo; 2363 sbi->s_group_info = new_groupinfo;
2364 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); 2364 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
@@ -2385,7 +2385,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2385 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2385 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2386 metalen = sizeof(*meta_group_info) << 2386 metalen = sizeof(*meta_group_info) <<
2387 EXT4_DESC_PER_BLOCK_BITS(sb); 2387 EXT4_DESC_PER_BLOCK_BITS(sb);
2388 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2388 meta_group_info = kmalloc(metalen, GFP_NOFS);
2389 if (meta_group_info == NULL) { 2389 if (meta_group_info == NULL) {
2390 ext4_msg(sb, KERN_ERR, "can't allocate mem " 2390 ext4_msg(sb, KERN_ERR, "can't allocate mem "
2391 "for a buddy group"); 2391 "for a buddy group");
@@ -2399,7 +2399,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2399 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2399 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2400 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2400 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2401 2401
2402 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); 2402 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
2403 if (meta_group_info[i] == NULL) { 2403 if (meta_group_info[i] == NULL) {
2404 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); 2404 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2405 goto exit_group_info; 2405 goto exit_group_info;
@@ -2428,7 +2428,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2428 { 2428 {
2429 struct buffer_head *bh; 2429 struct buffer_head *bh;
2430 meta_group_info[i]->bb_bitmap = 2430 meta_group_info[i]->bb_bitmap =
2431 kmalloc(sb->s_blocksize, GFP_KERNEL); 2431 kmalloc(sb->s_blocksize, GFP_NOFS);
2432 BUG_ON(meta_group_info[i]->bb_bitmap == NULL); 2432 BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2433 bh = ext4_read_block_bitmap(sb, group); 2433 bh = ext4_read_block_bitmap(sb, group);
2434 BUG_ON(bh == NULL); 2434 BUG_ON(bh == NULL);
@@ -2495,7 +2495,7 @@ err_freebuddy:
2495 kfree(sbi->s_group_info[i]); 2495 kfree(sbi->s_group_info[i]);
2496 iput(sbi->s_buddy_cache); 2496 iput(sbi->s_buddy_cache);
2497err_freesgi: 2497err_freesgi:
2498 ext4_kvfree(sbi->s_group_info); 2498 kvfree(sbi->s_group_info);
2499 return -ENOMEM; 2499 return -ENOMEM;
2500} 2500}
2501 2501
@@ -2708,12 +2708,11 @@ int ext4_mb_release(struct super_block *sb)
2708 EXT4_DESC_PER_BLOCK_BITS(sb); 2708 EXT4_DESC_PER_BLOCK_BITS(sb);
2709 for (i = 0; i < num_meta_group_infos; i++) 2709 for (i = 0; i < num_meta_group_infos; i++)
2710 kfree(sbi->s_group_info[i]); 2710 kfree(sbi->s_group_info[i]);
2711 ext4_kvfree(sbi->s_group_info); 2711 kvfree(sbi->s_group_info);
2712 } 2712 }
2713 kfree(sbi->s_mb_offsets); 2713 kfree(sbi->s_mb_offsets);
2714 kfree(sbi->s_mb_maxs); 2714 kfree(sbi->s_mb_maxs);
2715 if (sbi->s_buddy_cache) 2715 iput(sbi->s_buddy_cache);
2716 iput(sbi->s_buddy_cache);
2717 if (sbi->s_mb_stats) { 2716 if (sbi->s_mb_stats) {
2718 ext4_msg(sb, KERN_INFO, 2717 ext4_msg(sb, KERN_INFO,
2719 "mballoc: %u blocks %u reqs (%u success)", 2718 "mballoc: %u blocks %u reqs (%u success)",
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a432634f2e6a..3cb267aee802 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -592,7 +592,7 @@ err_out:
592 592
593 /* 593 /*
594 * set the i_blocks count to zero 594 * set the i_blocks count to zero
595 * so that the ext4_delete_inode does the 595 * so that the ext4_evict_inode() does the
596 * right job 596 * right job
597 * 597 *
598 * We don't need to take the i_lock because 598 * We don't need to take the i_lock because
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 9f2311bc9c4f..503ea15dc5db 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -273,6 +273,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
273 int replaced_count = 0; 273 int replaced_count = 0;
274 int from = data_offset_in_page << orig_inode->i_blkbits; 274 int from = data_offset_in_page << orig_inode->i_blkbits;
275 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 275 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
276 struct super_block *sb = orig_inode->i_sb;
276 277
277 /* 278 /*
278 * It needs twice the amount of ordinary journal buffers because 279 * It needs twice the amount of ordinary journal buffers because
@@ -405,10 +406,13 @@ unlock_pages:
405 page_cache_release(pagep[1]); 406 page_cache_release(pagep[1]);
406stop_journal: 407stop_journal:
407 ext4_journal_stop(handle); 408 ext4_journal_stop(handle);
409 if (*err == -ENOSPC &&
410 ext4_should_retry_alloc(sb, &retries))
411 goto again;
408 /* Buffer was busy because probably is pinned to journal transaction, 412 /* Buffer was busy because probably is pinned to journal transaction,
409 * force transaction commit may help to free it. */ 413 * force transaction commit may help to free it. */
410 if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, 414 if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
411 &retries)) 415 jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
412 goto again; 416 goto again;
413 return replaced_count; 417 return replaced_count;
414 418
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 426211882f72..2291923dae4e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2814,7 +2814,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2814 ext4_orphan_add(handle, inode); 2814 ext4_orphan_add(handle, inode);
2815 inode->i_ctime = ext4_current_time(inode); 2815 inode->i_ctime = ext4_current_time(inode);
2816 ext4_mark_inode_dirty(handle, inode); 2816 ext4_mark_inode_dirty(handle, inode);
2817 retval = 0;
2818 2817
2819end_unlink: 2818end_unlink:
2820 brelse(bh); 2819 brelse(bh);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ca4588388fc3..bf76f405a5f9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -856,7 +856,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
856 n_group_desc[gdb_num] = gdb_bh; 856 n_group_desc[gdb_num] = gdb_bh;
857 EXT4_SB(sb)->s_group_desc = n_group_desc; 857 EXT4_SB(sb)->s_group_desc = n_group_desc;
858 EXT4_SB(sb)->s_gdb_count++; 858 EXT4_SB(sb)->s_gdb_count++;
859 ext4_kvfree(o_group_desc); 859 kvfree(o_group_desc);
860 860
861 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 861 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
862 err = ext4_handle_dirty_super(handle, sb); 862 err = ext4_handle_dirty_super(handle, sb);
@@ -866,7 +866,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
866 return err; 866 return err;
867 867
868exit_inode: 868exit_inode:
869 ext4_kvfree(n_group_desc); 869 kvfree(n_group_desc);
870 brelse(iloc.bh); 870 brelse(iloc.bh);
871exit_dind: 871exit_dind:
872 brelse(dind); 872 brelse(dind);
@@ -909,7 +909,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
909 n_group_desc[gdb_num] = gdb_bh; 909 n_group_desc[gdb_num] = gdb_bh;
910 EXT4_SB(sb)->s_group_desc = n_group_desc; 910 EXT4_SB(sb)->s_group_desc = n_group_desc;
911 EXT4_SB(sb)->s_gdb_count++; 911 EXT4_SB(sb)->s_gdb_count++;
912 ext4_kvfree(o_group_desc); 912 kvfree(o_group_desc);
913 BUFFER_TRACE(gdb_bh, "get_write_access"); 913 BUFFER_TRACE(gdb_bh, "get_write_access");
914 err = ext4_journal_get_write_access(handle, gdb_bh); 914 err = ext4_journal_get_write_access(handle, gdb_bh);
915 if (unlikely(err)) 915 if (unlikely(err))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 63e802b8ec68..43c92b1685cb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -176,15 +176,6 @@ void *ext4_kvzalloc(size_t size, gfp_t flags)
176 return ret; 176 return ret;
177} 177}
178 178
179void ext4_kvfree(void *ptr)
180{
181 if (is_vmalloc_addr(ptr))
182 vfree(ptr);
183 else
184 kfree(ptr);
185
186}
187
188ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 179ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
189 struct ext4_group_desc *bg) 180 struct ext4_group_desc *bg)
190{ 181{
@@ -811,8 +802,8 @@ static void ext4_put_super(struct super_block *sb)
811 802
812 for (i = 0; i < sbi->s_gdb_count; i++) 803 for (i = 0; i < sbi->s_gdb_count; i++)
813 brelse(sbi->s_group_desc[i]); 804 brelse(sbi->s_group_desc[i]);
814 ext4_kvfree(sbi->s_group_desc); 805 kvfree(sbi->s_group_desc);
815 ext4_kvfree(sbi->s_flex_groups); 806 kvfree(sbi->s_flex_groups);
816 percpu_counter_destroy(&sbi->s_freeclusters_counter); 807 percpu_counter_destroy(&sbi->s_freeclusters_counter);
817 percpu_counter_destroy(&sbi->s_freeinodes_counter); 808 percpu_counter_destroy(&sbi->s_freeinodes_counter);
818 percpu_counter_destroy(&sbi->s_dirs_counter); 809 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -880,10 +871,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
880 spin_lock_init(&ei->i_prealloc_lock); 871 spin_lock_init(&ei->i_prealloc_lock);
881 ext4_es_init_tree(&ei->i_es_tree); 872 ext4_es_init_tree(&ei->i_es_tree);
882 rwlock_init(&ei->i_es_lock); 873 rwlock_init(&ei->i_es_lock);
883 INIT_LIST_HEAD(&ei->i_es_lru); 874 INIT_LIST_HEAD(&ei->i_es_list);
884 ei->i_es_all_nr = 0; 875 ei->i_es_all_nr = 0;
885 ei->i_es_lru_nr = 0; 876 ei->i_es_shk_nr = 0;
886 ei->i_touch_when = 0; 877 ei->i_es_shrink_lblk = 0;
887 ei->i_reserved_data_blocks = 0; 878 ei->i_reserved_data_blocks = 0;
888 ei->i_reserved_meta_blocks = 0; 879 ei->i_reserved_meta_blocks = 0;
889 ei->i_allocated_meta_blocks = 0; 880 ei->i_allocated_meta_blocks = 0;
@@ -973,7 +964,6 @@ void ext4_clear_inode(struct inode *inode)
973 dquot_drop(inode); 964 dquot_drop(inode);
974 ext4_discard_preallocations(inode); 965 ext4_discard_preallocations(inode);
975 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); 966 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
976 ext4_es_lru_del(inode);
977 if (EXT4_I(inode)->jinode) { 967 if (EXT4_I(inode)->jinode) {
978 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), 968 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
979 EXT4_I(inode)->jinode); 969 EXT4_I(inode)->jinode);
@@ -1153,7 +1143,7 @@ enum {
1153 Opt_inode_readahead_blks, Opt_journal_ioprio, 1143 Opt_inode_readahead_blks, Opt_journal_ioprio,
1154 Opt_dioread_nolock, Opt_dioread_lock, 1144 Opt_dioread_nolock, Opt_dioread_lock,
1155 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 1145 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
1156 Opt_max_dir_size_kb, 1146 Opt_max_dir_size_kb, Opt_nojournal_checksum,
1157}; 1147};
1158 1148
1159static const match_table_t tokens = { 1149static const match_table_t tokens = {
@@ -1187,6 +1177,7 @@ static const match_table_t tokens = {
1187 {Opt_journal_dev, "journal_dev=%u"}, 1177 {Opt_journal_dev, "journal_dev=%u"},
1188 {Opt_journal_path, "journal_path=%s"}, 1178 {Opt_journal_path, "journal_path=%s"},
1189 {Opt_journal_checksum, "journal_checksum"}, 1179 {Opt_journal_checksum, "journal_checksum"},
1180 {Opt_nojournal_checksum, "nojournal_checksum"},
1190 {Opt_journal_async_commit, "journal_async_commit"}, 1181 {Opt_journal_async_commit, "journal_async_commit"},
1191 {Opt_abort, "abort"}, 1182 {Opt_abort, "abort"},
1192 {Opt_data_journal, "data=journal"}, 1183 {Opt_data_journal, "data=journal"},
@@ -1368,6 +1359,8 @@ static const struct mount_opts {
1368 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, 1359 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1369 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, 1360 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1370 MOPT_EXT4_ONLY | MOPT_CLEAR}, 1361 MOPT_EXT4_ONLY | MOPT_CLEAR},
1362 {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1363 MOPT_EXT4_ONLY | MOPT_CLEAR},
1371 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, 1364 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1372 MOPT_EXT4_ONLY | MOPT_SET}, 1365 MOPT_EXT4_ONLY | MOPT_SET},
1373 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | 1366 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
@@ -1709,6 +1702,12 @@ static int parse_options(char *options, struct super_block *sb,
1709 return 0; 1702 return 0;
1710 } 1703 }
1711 } 1704 }
1705 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
1706 test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
1707 ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
1708 "in data=ordered mode");
1709 return 0;
1710 }
1712 return 1; 1711 return 1;
1713} 1712}
1714 1713
@@ -1946,7 +1945,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
1946 memcpy(new_groups, sbi->s_flex_groups, 1945 memcpy(new_groups, sbi->s_flex_groups,
1947 (sbi->s_flex_groups_allocated * 1946 (sbi->s_flex_groups_allocated *
1948 sizeof(struct flex_groups))); 1947 sizeof(struct flex_groups)));
1949 ext4_kvfree(sbi->s_flex_groups); 1948 kvfree(sbi->s_flex_groups);
1950 } 1949 }
1951 sbi->s_flex_groups = new_groups; 1950 sbi->s_flex_groups = new_groups;
1952 sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); 1951 sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
@@ -3317,7 +3316,7 @@ int ext4_calculate_overhead(struct super_block *sb)
3317 struct ext4_super_block *es = sbi->s_es; 3316 struct ext4_super_block *es = sbi->s_es;
3318 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 3317 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3319 ext4_fsblk_t overhead = 0; 3318 ext4_fsblk_t overhead = 0;
3320 char *buf = (char *) get_zeroed_page(GFP_KERNEL); 3319 char *buf = (char *) get_zeroed_page(GFP_NOFS);
3321 3320
3322 if (!buf) 3321 if (!buf)
3323 return -ENOMEM; 3322 return -ENOMEM;
@@ -3345,8 +3344,8 @@ int ext4_calculate_overhead(struct super_block *sb)
3345 memset(buf, 0, PAGE_SIZE); 3344 memset(buf, 0, PAGE_SIZE);
3346 cond_resched(); 3345 cond_resched();
3347 } 3346 }
3348 /* Add the journal blocks as well */ 3347 /* Add the internal journal blocks as well */
3349 if (sbi->s_journal) 3348 if (sbi->s_journal && !sbi->journal_bdev)
3350 overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); 3349 overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
3351 3350
3352 sbi->s_overhead = overhead; 3351 sbi->s_overhead = overhead;
@@ -4232,7 +4231,7 @@ failed_mount7:
4232failed_mount6: 4231failed_mount6:
4233 ext4_mb_release(sb); 4232 ext4_mb_release(sb);
4234 if (sbi->s_flex_groups) 4233 if (sbi->s_flex_groups)
4235 ext4_kvfree(sbi->s_flex_groups); 4234 kvfree(sbi->s_flex_groups);
4236 percpu_counter_destroy(&sbi->s_freeclusters_counter); 4235 percpu_counter_destroy(&sbi->s_freeclusters_counter);
4237 percpu_counter_destroy(&sbi->s_freeinodes_counter); 4236 percpu_counter_destroy(&sbi->s_freeinodes_counter);
4238 percpu_counter_destroy(&sbi->s_dirs_counter); 4237 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -4261,7 +4260,7 @@ failed_mount3:
4261failed_mount2: 4260failed_mount2:
4262 for (i = 0; i < db_count; i++) 4261 for (i = 0; i < db_count; i++)
4263 brelse(sbi->s_group_desc[i]); 4262 brelse(sbi->s_group_desc[i]);
4264 ext4_kvfree(sbi->s_group_desc); 4263 kvfree(sbi->s_group_desc);
4265failed_mount: 4264failed_mount:
4266 if (sbi->s_chksum_driver) 4265 if (sbi->s_chksum_driver)
4267 crypto_free_shash(sbi->s_chksum_driver); 4266 crypto_free_shash(sbi->s_chksum_driver);
@@ -4862,6 +4861,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4862 goto restore_opts; 4861 goto restore_opts;
4863 } 4862 }
4864 4863
4864 if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
4865 test_opt(sb, JOURNAL_CHECKSUM)) {
4866 ext4_msg(sb, KERN_ERR, "changing journal_checksum "
4867 "during remount not supported");
4868 err = -EINVAL;
4869 goto restore_opts;
4870 }
4871
4865 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 4872 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4866 if (test_opt2(sb, EXPLICIT_DELALLOC)) { 4873 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4867 ext4_msg(sb, KERN_ERR, "can't mount with " 4874 ext4_msg(sb, KERN_ERR, "can't mount with "
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1df94fabe4eb..b96bd8076b70 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1714,8 +1714,7 @@ int jbd2_journal_destroy(journal_t *journal)
1714 1714
1715 if (journal->j_proc_entry) 1715 if (journal->j_proc_entry)
1716 jbd2_stats_proc_exit(journal); 1716 jbd2_stats_proc_exit(journal);
1717 if (journal->j_inode) 1717 iput(journal->j_inode);
1718 iput(journal->j_inode);
1719 if (journal->j_revoke) 1718 if (journal->j_revoke)
1720 jbd2_journal_destroy_revoke(journal); 1719 jbd2_journal_destroy_revoke(journal);
1721 if (journal->j_chksum_driver) 1720 if (journal->j_chksum_driver)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index ec881b312700..2f389ce5023c 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -61,6 +61,11 @@ module_param(mem_size, ulong, 0400);
61MODULE_PARM_DESC(mem_size, 61MODULE_PARM_DESC(mem_size,
62 "size of reserved RAM used to store oops/panic logs"); 62 "size of reserved RAM used to store oops/panic logs");
63 63
64static unsigned int mem_type;
65module_param(mem_type, uint, 0600);
66MODULE_PARM_DESC(mem_type,
67 "set to 1 to try to use unbuffered memory (default 0)");
68
64static int dump_oops = 1; 69static int dump_oops = 1;
65module_param(dump_oops, int, 0600); 70module_param(dump_oops, int, 0600);
66MODULE_PARM_DESC(dump_oops, 71MODULE_PARM_DESC(dump_oops,
@@ -79,6 +84,7 @@ struct ramoops_context {
79 struct persistent_ram_zone *fprz; 84 struct persistent_ram_zone *fprz;
80 phys_addr_t phys_addr; 85 phys_addr_t phys_addr;
81 unsigned long size; 86 unsigned long size;
87 unsigned int memtype;
82 size_t record_size; 88 size_t record_size;
83 size_t console_size; 89 size_t console_size;
84 size_t ftrace_size; 90 size_t ftrace_size;
@@ -366,7 +372,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
366 size_t sz = cxt->record_size; 372 size_t sz = cxt->record_size;
367 373
368 cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, 374 cxt->przs[i] = persistent_ram_new(*paddr, sz, 0,
369 &cxt->ecc_info); 375 &cxt->ecc_info,
376 cxt->memtype);
370 if (IS_ERR(cxt->przs[i])) { 377 if (IS_ERR(cxt->przs[i])) {
371 err = PTR_ERR(cxt->przs[i]); 378 err = PTR_ERR(cxt->przs[i]);
372 dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", 379 dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
@@ -396,7 +403,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
396 return -ENOMEM; 403 return -ENOMEM;
397 } 404 }
398 405
399 *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info); 406 *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype);
400 if (IS_ERR(*prz)) { 407 if (IS_ERR(*prz)) {
401 int err = PTR_ERR(*prz); 408 int err = PTR_ERR(*prz);
402 409
@@ -443,6 +450,7 @@ static int ramoops_probe(struct platform_device *pdev)
443 450
444 cxt->size = pdata->mem_size; 451 cxt->size = pdata->mem_size;
445 cxt->phys_addr = pdata->mem_address; 452 cxt->phys_addr = pdata->mem_address;
453 cxt->memtype = pdata->mem_type;
446 cxt->record_size = pdata->record_size; 454 cxt->record_size = pdata->record_size;
447 cxt->console_size = pdata->console_size; 455 cxt->console_size = pdata->console_size;
448 cxt->ftrace_size = pdata->ftrace_size; 456 cxt->ftrace_size = pdata->ftrace_size;
@@ -572,6 +580,7 @@ static void ramoops_register_dummy(void)
572 580
573 dummy_data->mem_size = mem_size; 581 dummy_data->mem_size = mem_size;
574 dummy_data->mem_address = mem_address; 582 dummy_data->mem_address = mem_address;
583 dummy_data->mem_type = 0;
575 dummy_data->record_size = record_size; 584 dummy_data->record_size = record_size;
576 dummy_data->console_size = ramoops_console_size; 585 dummy_data->console_size = ramoops_console_size;
577 dummy_data->ftrace_size = ramoops_ftrace_size; 586 dummy_data->ftrace_size = ramoops_ftrace_size;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 9d7b9a83699e..76c3f80efdfa 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -380,7 +380,8 @@ void persistent_ram_zap(struct persistent_ram_zone *prz)
380 persistent_ram_update_header_ecc(prz); 380 persistent_ram_update_header_ecc(prz);
381} 381}
382 382
383static void *persistent_ram_vmap(phys_addr_t start, size_t size) 383static void *persistent_ram_vmap(phys_addr_t start, size_t size,
384 unsigned int memtype)
384{ 385{
385 struct page **pages; 386 struct page **pages;
386 phys_addr_t page_start; 387 phys_addr_t page_start;
@@ -392,7 +393,10 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
392 page_start = start - offset_in_page(start); 393 page_start = start - offset_in_page(start);
393 page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE); 394 page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE);
394 395
395 prot = pgprot_noncached(PAGE_KERNEL); 396 if (memtype)
397 prot = pgprot_noncached(PAGE_KERNEL);
398 else
399 prot = pgprot_writecombine(PAGE_KERNEL);
396 400
397 pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); 401 pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
398 if (!pages) { 402 if (!pages) {
@@ -411,8 +415,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
411 return vaddr; 415 return vaddr;
412} 416}
413 417
414static void *persistent_ram_iomap(phys_addr_t start, size_t size) 418static void *persistent_ram_iomap(phys_addr_t start, size_t size,
419 unsigned int memtype)
415{ 420{
421 void *va;
422
416 if (!request_mem_region(start, size, "persistent_ram")) { 423 if (!request_mem_region(start, size, "persistent_ram")) {
417 pr_err("request mem region (0x%llx@0x%llx) failed\n", 424 pr_err("request mem region (0x%llx@0x%llx) failed\n",
418 (unsigned long long)size, (unsigned long long)start); 425 (unsigned long long)size, (unsigned long long)start);
@@ -422,19 +429,24 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
422 buffer_start_add = buffer_start_add_locked; 429 buffer_start_add = buffer_start_add_locked;
423 buffer_size_add = buffer_size_add_locked; 430 buffer_size_add = buffer_size_add_locked;
424 431
425 return ioremap(start, size); 432 if (memtype)
433 va = ioremap(start, size);
434 else
435 va = ioremap_wc(start, size);
436
437 return va;
426} 438}
427 439
428static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, 440static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
429 struct persistent_ram_zone *prz) 441 struct persistent_ram_zone *prz, int memtype)
430{ 442{
431 prz->paddr = start; 443 prz->paddr = start;
432 prz->size = size; 444 prz->size = size;
433 445
434 if (pfn_valid(start >> PAGE_SHIFT)) 446 if (pfn_valid(start >> PAGE_SHIFT))
435 prz->vaddr = persistent_ram_vmap(start, size); 447 prz->vaddr = persistent_ram_vmap(start, size, memtype);
436 else 448 else
437 prz->vaddr = persistent_ram_iomap(start, size); 449 prz->vaddr = persistent_ram_iomap(start, size, memtype);
438 450
439 if (!prz->vaddr) { 451 if (!prz->vaddr) {
440 pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__, 452 pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__,
@@ -500,7 +512,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
500} 512}
501 513
502struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, 514struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
503 u32 sig, struct persistent_ram_ecc_info *ecc_info) 515 u32 sig, struct persistent_ram_ecc_info *ecc_info,
516 unsigned int memtype)
504{ 517{
505 struct persistent_ram_zone *prz; 518 struct persistent_ram_zone *prz;
506 int ret = -ENOMEM; 519 int ret = -ENOMEM;
@@ -511,7 +524,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
511 goto err; 524 goto err;
512 } 525 }
513 526
514 ret = persistent_ram_buffer_map(start, size, prz); 527 ret = persistent_ram_buffer_map(start, size, prz, memtype);
515 if (ret) 528 if (ret)
516 goto err; 529 goto err;
517 530
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index d571e173a990..9d6486d416a3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2772,7 +2772,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2772 2772
2773 if (journal_init_dev(sb, journal, j_dev_name) != 0) { 2773 if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2774 reiserfs_warning(sb, "sh-462", 2774 reiserfs_warning(sb, "sh-462",
2775 "unable to initialize jornal device"); 2775 "unable to initialize journal device");
2776 goto free_and_return; 2776 goto free_and_return;
2777 } 2777 }
2778 2778
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b5b593c45270..538519ee37d9 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -262,6 +262,7 @@ static int write_begin_slow(struct address_space *mapping,
262 if (err) { 262 if (err) {
263 unlock_page(page); 263 unlock_page(page);
264 page_cache_release(page); 264 page_cache_release(page);
265 ubifs_release_budget(c, &req);
265 return err; 266 return err;
266 } 267 }
267 } 268 }
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index fb166e204441..f6ac3f29323c 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -571,7 +571,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
571 571
572 aligned_dlen = ALIGN(dlen, 8); 572 aligned_dlen = ALIGN(dlen, 8);
573 aligned_ilen = ALIGN(ilen, 8); 573 aligned_ilen = ALIGN(ilen, 8);
574
574 len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; 575 len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
576 /* Make sure to also account for extended attributes */
577 len += host_ui->data_len;
578
575 dent = kmalloc(len, GFP_NOFS); 579 dent = kmalloc(len, GFP_NOFS);
576 if (!dent) 580 if (!dent)
577 return -ENOMEM; 581 return -ENOMEM;
@@ -648,7 +652,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
648 652
649 ino_key_init(c, &ino_key, dir->i_ino); 653 ino_key_init(c, &ino_key, dir->i_ino);
650 ino_offs += aligned_ilen; 654 ino_offs += aligned_ilen;
651 err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ); 655 err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs,
656 UBIFS_INO_NODE_SZ + host_ui->data_len);
652 if (err) 657 if (err)
653 goto out_ro; 658 goto out_ro;
654 659
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
deleted file mode 100644
index 6e247a99f5db..000000000000
--- a/fs/xfs/libxfs/xfs_ag.h
+++ /dev/null
@@ -1,281 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_AG_H__
19#define __XFS_AG_H__
20
21/*
22 * Allocation group header
23 * This is divided into three structures, placed in sequential 512-byte
24 * buffers after a copy of the superblock (also in a 512-byte buffer).
25 */
26
27struct xfs_buf;
28struct xfs_mount;
29struct xfs_trans;
30
31#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
32#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
33#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
34#define XFS_AGF_VERSION 1
35#define XFS_AGI_VERSION 1
36
37#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
38#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
39
40/*
41 * Btree number 0 is bno, 1 is cnt. This value gives the size of the
42 * arrays below.
43 */
44#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
45
46/*
47 * The second word of agf_levels in the first a.g. overlaps the EFS
48 * superblock's magic number. Since the magic numbers valid for EFS
49 * are > 64k, our value cannot be confused for an EFS superblock's.
50 */
51
52typedef struct xfs_agf {
53 /*
54 * Common allocation group header information
55 */
56 __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */
57 __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */
58 __be32 agf_seqno; /* sequence # starting from 0 */
59 __be32 agf_length; /* size in blocks of a.g. */
60 /*
61 * Freespace information
62 */
63 __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
64 __be32 agf_spare0; /* spare field */
65 __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
66 __be32 agf_spare1; /* spare field */
67
68 __be32 agf_flfirst; /* first freelist block's index */
69 __be32 agf_fllast; /* last freelist block's index */
70 __be32 agf_flcount; /* count of blocks in freelist */
71 __be32 agf_freeblks; /* total free blocks */
72
73 __be32 agf_longest; /* longest free space */
74 __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
75 uuid_t agf_uuid; /* uuid of filesystem */
76
77 /*
78 * reserve some contiguous space for future logged fields before we add
79 * the unlogged fields. This makes the range logging via flags and
80 * structure offsets much simpler.
81 */
82 __be64 agf_spare64[16];
83
84 /* unlogged fields, written during buffer writeback. */
85 __be64 agf_lsn; /* last write sequence */
86 __be32 agf_crc; /* crc of agf sector */
87 __be32 agf_spare2;
88
89 /* structure must be padded to 64 bit alignment */
90} xfs_agf_t;
91
92#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
93
94#define XFS_AGF_MAGICNUM 0x00000001
95#define XFS_AGF_VERSIONNUM 0x00000002
96#define XFS_AGF_SEQNO 0x00000004
97#define XFS_AGF_LENGTH 0x00000008
98#define XFS_AGF_ROOTS 0x00000010
99#define XFS_AGF_LEVELS 0x00000020
100#define XFS_AGF_FLFIRST 0x00000040
101#define XFS_AGF_FLLAST 0x00000080
102#define XFS_AGF_FLCOUNT 0x00000100
103#define XFS_AGF_FREEBLKS 0x00000200
104#define XFS_AGF_LONGEST 0x00000400
105#define XFS_AGF_BTREEBLKS 0x00000800
106#define XFS_AGF_UUID 0x00001000
107#define XFS_AGF_NUM_BITS 13
108#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
109
110#define XFS_AGF_FLAGS \
111 { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
112 { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
113 { XFS_AGF_SEQNO, "SEQNO" }, \
114 { XFS_AGF_LENGTH, "LENGTH" }, \
115 { XFS_AGF_ROOTS, "ROOTS" }, \
116 { XFS_AGF_LEVELS, "LEVELS" }, \
117 { XFS_AGF_FLFIRST, "FLFIRST" }, \
118 { XFS_AGF_FLLAST, "FLLAST" }, \
119 { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
120 { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
121 { XFS_AGF_LONGEST, "LONGEST" }, \
122 { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
123 { XFS_AGF_UUID, "UUID" }
124
125/* disk block (xfs_daddr_t) in the AG */
126#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
127#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
128#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
129
130extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
131 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
132
133/*
134 * Size of the unlinked inode hash table in the agi.
135 */
136#define XFS_AGI_UNLINKED_BUCKETS 64
137
138typedef struct xfs_agi {
139 /*
140 * Common allocation group header information
141 */
142 __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */
143 __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */
144 __be32 agi_seqno; /* sequence # starting from 0 */
145 __be32 agi_length; /* size in blocks of a.g. */
146 /*
147 * Inode information
148 * Inodes are mapped by interpreting the inode number, so no
149 * mapping data is needed here.
150 */
151 __be32 agi_count; /* count of allocated inodes */
152 __be32 agi_root; /* root of inode btree */
153 __be32 agi_level; /* levels in inode btree */
154 __be32 agi_freecount; /* number of free inodes */
155
156 __be32 agi_newino; /* new inode just allocated */
157 __be32 agi_dirino; /* last directory inode chunk */
158 /*
159 * Hash table of inodes which have been unlinked but are
160 * still being referenced.
161 */
162 __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
163 /*
164 * This marks the end of logging region 1 and start of logging region 2.
165 */
166 uuid_t agi_uuid; /* uuid of filesystem */
167 __be32 agi_crc; /* crc of agi sector */
168 __be32 agi_pad32;
169 __be64 agi_lsn; /* last write sequence */
170
171 __be32 agi_free_root; /* root of the free inode btree */
172 __be32 agi_free_level;/* levels in free inode btree */
173
174 /* structure must be padded to 64 bit alignment */
175} xfs_agi_t;
176
177#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
178
179#define XFS_AGI_MAGICNUM (1 << 0)
180#define XFS_AGI_VERSIONNUM (1 << 1)
181#define XFS_AGI_SEQNO (1 << 2)
182#define XFS_AGI_LENGTH (1 << 3)
183#define XFS_AGI_COUNT (1 << 4)
184#define XFS_AGI_ROOT (1 << 5)
185#define XFS_AGI_LEVEL (1 << 6)
186#define XFS_AGI_FREECOUNT (1 << 7)
187#define XFS_AGI_NEWINO (1 << 8)
188#define XFS_AGI_DIRINO (1 << 9)
189#define XFS_AGI_UNLINKED (1 << 10)
190#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
191#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
192#define XFS_AGI_FREE_ROOT (1 << 11)
193#define XFS_AGI_FREE_LEVEL (1 << 12)
194#define XFS_AGI_NUM_BITS_R2 13
195
196/* disk block (xfs_daddr_t) in the AG */
197#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
198#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
199#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
200
201extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
202 xfs_agnumber_t agno, struct xfs_buf **bpp);
203
204/*
205 * The third a.g. block contains the a.g. freelist, an array
206 * of block pointers to blocks owned by the allocation btree code.
207 */
208#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
209#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
210#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
211
212#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
213 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
214 &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
215 (__be32 *)(bp)->b_addr)
216
217/*
218 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of
219 * slots in the beginning of the block for a proper header with the
220 * location information and CRC.
221 */
222#define XFS_AGFL_SIZE(mp) \
223 (((mp)->m_sb.sb_sectsize - \
224 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
225 sizeof(struct xfs_agfl) : 0)) / \
226 sizeof(xfs_agblock_t))
227
228typedef struct xfs_agfl {
229 __be32 agfl_magicnum;
230 __be32 agfl_seqno;
231 uuid_t agfl_uuid;
232 __be64 agfl_lsn;
233 __be32 agfl_crc;
234 __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
235} xfs_agfl_t;
236
237#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
238
239/*
240 * tags for inode radix tree
241 */
242#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
243 in xfs_inode_ag_iterator */
244#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
245#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
246
247#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
248#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
249 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
250#define XFS_MIN_FREELIST(a,mp) \
251 (XFS_MIN_FREELIST_RAW( \
252 be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
253 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
254#define XFS_MIN_FREELIST_PAG(pag,mp) \
255 (XFS_MIN_FREELIST_RAW( \
256 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
257 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
258
259#define XFS_AGB_TO_FSB(mp,agno,agbno) \
260 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
261#define XFS_FSB_TO_AGNO(mp,fsbno) \
262 ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
263#define XFS_FSB_TO_AGBNO(mp,fsbno) \
264 ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
265#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
266 ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
267 (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
268#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
269
270/*
271 * For checking for bad ranges of xfs_daddr_t's, covering multiple
272 * allocation groups or a single xfs_daddr_t that's a superblock copy.
273 */
274#define XFS_AG_CHECK_DADDR(mp,d,len) \
275 ((len) == 1 ? \
276 ASSERT((d) == XFS_SB_DADDR || \
277 xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
278 ASSERT(xfs_daddr_to_agno(mp, d) == \
279 xfs_daddr_to_agno(mp, (d) + (len) - 1)))
280
281#endif /* __XFS_AG_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index eff34218f405..a6fbf4472017 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -23,7 +23,6 @@
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_inode.h" 27#include "xfs_inode.h"
29#include "xfs_btree.h" 28#include "xfs_btree.h"
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index feacb061bab7..d1b4b6a5c894 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -231,4 +231,7 @@ xfs_alloc_get_rec(
231 xfs_extlen_t *len, /* output: length of extent */ 231 xfs_extlen_t *len, /* output: length of extent */
232 int *stat); /* output: success/failure */ 232 int *stat); /* output: success/failure */
233 233
234int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
235 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
236
234#endif /* __XFS_ALLOC_H__ */ 237#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index e0e83e24d3ef..59d521c09a17 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -22,7 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 25#include "xfs_mount.h"
27#include "xfs_btree.h" 26#include "xfs_btree.h"
28#include "xfs_alloc_btree.h" 27#include "xfs_alloc_btree.h"
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 353fb425faef..0a472fbe06d4 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
@@ -42,7 +40,6 @@
42#include "xfs_quota.h" 40#include "xfs_quota.h"
43#include "xfs_trans_space.h" 41#include "xfs_trans_space.h"
44#include "xfs_trace.h" 42#include "xfs_trace.h"
45#include "xfs_dinode.h"
46 43
47/* 44/*
48 * xfs_attr.c 45 * xfs_attr.c
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index b1f73dbbf3d8..5d38e8b8a913 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -24,7 +24,6 @@
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_format.h" 28#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
@@ -41,7 +40,6 @@
41#include "xfs_trace.h" 40#include "xfs_trace.h"
42#include "xfs_buf_item.h" 41#include "xfs_buf_item.h"
43#include "xfs_cksum.h" 42#include "xfs_cksum.h"
44#include "xfs_dinode.h"
45#include "xfs_dir2.h" 43#include "xfs_dir2.h"
46 44
47 45
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 7510ab8058a4..20de88d1bf86 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -23,8 +23,6 @@
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 79c981984dca..b5eb4743f75a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -22,9 +22,7 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
@@ -46,7 +44,6 @@
46#include "xfs_trace.h" 44#include "xfs_trace.h"
47#include "xfs_symlink.h" 45#include "xfs_symlink.h"
48#include "xfs_attr_leaf.h" 46#include "xfs_attr_leaf.h"
49#include "xfs_dinode.h"
50#include "xfs_filestream.h" 47#include "xfs_filestream.h"
51 48
52 49
@@ -5450,13 +5447,11 @@ xfs_bmse_merge(
5450 struct xfs_btree_cur *cur, 5447 struct xfs_btree_cur *cur,
5451 int *logflags) /* output */ 5448 int *logflags) /* output */
5452{ 5449{
5453 struct xfs_ifork *ifp;
5454 struct xfs_bmbt_irec got; 5450 struct xfs_bmbt_irec got;
5455 struct xfs_bmbt_irec left; 5451 struct xfs_bmbt_irec left;
5456 xfs_filblks_t blockcount; 5452 xfs_filblks_t blockcount;
5457 int error, i; 5453 int error, i;
5458 5454
5459 ifp = XFS_IFORK_PTR(ip, whichfork);
5460 xfs_bmbt_get_all(gotp, &got); 5455 xfs_bmbt_get_all(gotp, &got);
5461 xfs_bmbt_get_all(leftp, &left); 5456 xfs_bmbt_get_all(leftp, &left);
5462 blockcount = left.br_blockcount + got.br_blockcount; 5457 blockcount = left.br_blockcount + got.br_blockcount;
@@ -5489,32 +5484,25 @@ xfs_bmse_merge(
5489 error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, 5484 error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
5490 got.br_blockcount, &i); 5485 got.br_blockcount, &i);
5491 if (error) 5486 if (error)
5492 goto out_error; 5487 return error;
5493 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); 5488 XFS_WANT_CORRUPTED_RETURN(i == 1);
5494 5489
5495 error = xfs_btree_delete(cur, &i); 5490 error = xfs_btree_delete(cur, &i);
5496 if (error) 5491 if (error)
5497 goto out_error; 5492 return error;
5498 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); 5493 XFS_WANT_CORRUPTED_RETURN(i == 1);
5499 5494
5500 /* lookup and update size of the previous extent */ 5495 /* lookup and update size of the previous extent */
5501 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, 5496 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
5502 left.br_blockcount, &i); 5497 left.br_blockcount, &i);
5503 if (error) 5498 if (error)
5504 goto out_error; 5499 return error;
5505 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); 5500 XFS_WANT_CORRUPTED_RETURN(i == 1);
5506 5501
5507 left.br_blockcount = blockcount; 5502 left.br_blockcount = blockcount;
5508 5503
5509 error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, 5504 return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
5510 left.br_blockcount, left.br_state); 5505 left.br_blockcount, left.br_state);
5511 if (error)
5512 goto out_error;
5513
5514 return 0;
5515
5516out_error:
5517 return error;
5518} 5506}
5519 5507
5520/* 5508/*
@@ -5544,35 +5532,29 @@ xfs_bmse_shift_one(
5544 startoff = got.br_startoff - offset_shift_fsb; 5532 startoff = got.br_startoff - offset_shift_fsb;
5545 5533
5546 /* delalloc extents should be prevented by caller */ 5534 /* delalloc extents should be prevented by caller */
5547 XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock), 5535 XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock));
5548 out_error);
5549 5536
5550 /* 5537 /*
5551 * If this is the first extent in the file, make sure there's enough 5538 * Check for merge if we've got an extent to the left, otherwise make
5552 * room at the start of the file and jump right to the shift as there's 5539 * sure there's enough room at the start of the file for the shift.
5553 * no left extent to merge.
5554 */ 5540 */
5555 if (*current_ext == 0) { 5541 if (*current_ext) {
5556 if (got.br_startoff < offset_shift_fsb) 5542 /* grab the left extent and check for a large enough hole */
5557 return -EINVAL; 5543 leftp = xfs_iext_get_ext(ifp, *current_ext - 1);
5558 goto shift_extent; 5544 xfs_bmbt_get_all(leftp, &left);
5559 }
5560 5545
5561 /* grab the left extent and check for a large enough hole */ 5546 if (startoff < left.br_startoff + left.br_blockcount)
5562 leftp = xfs_iext_get_ext(ifp, *current_ext - 1); 5547 return -EINVAL;
5563 xfs_bmbt_get_all(leftp, &left);
5564 5548
5565 if (startoff < left.br_startoff + left.br_blockcount) 5549 /* check whether to merge the extent or shift it down */
5550 if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) {
5551 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
5552 *current_ext, gotp, leftp, cur,
5553 logflags);
5554 }
5555 } else if (got.br_startoff < offset_shift_fsb)
5566 return -EINVAL; 5556 return -EINVAL;
5567 5557
5568 /* check whether to merge the extent or shift it down */
5569 if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb))
5570 goto shift_extent;
5571
5572 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext,
5573 gotp, leftp, cur, logflags);
5574
5575shift_extent:
5576 /* 5558 /*
5577 * Increment the extent index for the next iteration, update the start 5559 * Increment the extent index for the next iteration, update the start
5578 * offset of the in-core extent and update the btree if applicable. 5560 * offset of the in-core extent and update the btree if applicable.
@@ -5589,18 +5571,11 @@ shift_extent:
5589 got.br_blockcount, &i); 5571 got.br_blockcount, &i);
5590 if (error) 5572 if (error)
5591 return error; 5573 return error;
5592 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); 5574 XFS_WANT_CORRUPTED_RETURN(i == 1);
5593 5575
5594 got.br_startoff = startoff; 5576 got.br_startoff = startoff;
5595 error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, 5577 return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
5596 got.br_blockcount, got.br_state); 5578 got.br_blockcount, got.br_state);
5597 if (error)
5598 return error;
5599
5600 return 0;
5601
5602out_error:
5603 return error;
5604} 5579}
5605 5580
5606/* 5581/*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fba753308f31..2c44c8e50782 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_trans.h" 27#include "xfs_trans.h"
@@ -36,7 +34,6 @@
36#include "xfs_quota.h" 34#include "xfs_quota.h"
37#include "xfs_trace.h" 35#include "xfs_trace.h"
38#include "xfs_cksum.h" 36#include "xfs_cksum.h"
39#include "xfs_dinode.h"
40 37
41/* 38/*
42 * Determine the extent state. 39 * Determine the extent state.
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 8fe6a93ff473..81cad433df85 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_trans.h" 27#include "xfs_trans.h"
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index fd827530afec..9cb0115c6bd1 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -23,8 +23,6 @@
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
@@ -514,7 +512,6 @@ xfs_da3_root_split(
514 struct xfs_buf *bp; 512 struct xfs_buf *bp;
515 struct xfs_inode *dp; 513 struct xfs_inode *dp;
516 struct xfs_trans *tp; 514 struct xfs_trans *tp;
517 struct xfs_mount *mp;
518 struct xfs_dir2_leaf *leaf; 515 struct xfs_dir2_leaf *leaf;
519 xfs_dablk_t blkno; 516 xfs_dablk_t blkno;
520 int level; 517 int level;
@@ -534,7 +531,6 @@ xfs_da3_root_split(
534 531
535 dp = args->dp; 532 dp = args->dp;
536 tp = args->trans; 533 tp = args->trans;
537 mp = state->mp;
538 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); 534 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
539 if (error) 535 if (error)
540 return error; 536 return error;
@@ -2342,14 +2338,12 @@ xfs_da_shrink_inode(
2342 xfs_inode_t *dp; 2338 xfs_inode_t *dp;
2343 int done, error, w, count; 2339 int done, error, w, count;
2344 xfs_trans_t *tp; 2340 xfs_trans_t *tp;
2345 xfs_mount_t *mp;
2346 2341
2347 trace_xfs_da_shrink_inode(args); 2342 trace_xfs_da_shrink_inode(args);
2348 2343
2349 dp = args->dp; 2344 dp = args->dp;
2350 w = args->whichfork; 2345 w = args->whichfork;
2351 tp = args->trans; 2346 tp = args->trans;
2352 mp = dp->i_mount;
2353 count = args->geo->fsbcount; 2347 count = args->geo->fsbcount;
2354 for (;;) { 2348 for (;;) {
2355 /* 2349 /*
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index 7e42fdfd2f1d..9d624a622946 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -22,8 +22,6 @@
22#include "xfs_format.h" 22#include "xfs_format.h"
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
diff --git a/fs/xfs/libxfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h
deleted file mode 100644
index 623bbe8fd921..000000000000
--- a/fs/xfs/libxfs/xfs_dinode.h
+++ /dev/null
@@ -1,243 +0,0 @@
1/*
2 * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DINODE_H__
19#define __XFS_DINODE_H__
20
21#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
22#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
23
24typedef struct xfs_timestamp {
25 __be32 t_sec; /* timestamp seconds */
26 __be32 t_nsec; /* timestamp nanoseconds */
27} xfs_timestamp_t;
28
29/*
30 * On-disk inode structure.
31 *
32 * This is just the header or "dinode core", the inode is expanded to fill a
33 * variable size the leftover area split into a data and an attribute fork.
34 * The format of the data and attribute fork depends on the format of the
35 * inode as indicated by di_format and di_aformat. To access the data and
36 * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
37 * below.
38 *
39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian.
42 *
43 * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
44 * padding field for v3 inodes.
45 */
46typedef struct xfs_dinode {
47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
48 __be16 di_mode; /* mode and type of file */
49 __u8 di_version; /* inode version */
50 __u8 di_format; /* format of di_c data */
51 __be16 di_onlink; /* old number of links to file */
52 __be32 di_uid; /* owner's user id */
53 __be32 di_gid; /* owner's group id */
54 __be32 di_nlink; /* number of links to file */
55 __be16 di_projid_lo; /* lower part of owner's project id */
56 __be16 di_projid_hi; /* higher part owner's project id */
57 __u8 di_pad[6]; /* unused, zeroed space */
58 __be16 di_flushiter; /* incremented on flush */
59 xfs_timestamp_t di_atime; /* time last accessed */
60 xfs_timestamp_t di_mtime; /* time last modified */
61 xfs_timestamp_t di_ctime; /* time created/inode modified */
62 __be64 di_size; /* number of bytes in file */
63 __be64 di_nblocks; /* # of direct & btree blocks used */
64 __be32 di_extsize; /* basic/minimum extent size for file */
65 __be32 di_nextents; /* number of extents in data fork */
66 __be16 di_anextents; /* number of extents in attribute fork*/
67 __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
68 __s8 di_aformat; /* format of attr fork's data */
69 __be32 di_dmevmask; /* DMIG event mask */
70 __be16 di_dmstate; /* DMIG state info */
71 __be16 di_flags; /* random flags, XFS_DIFLAG_... */
72 __be32 di_gen; /* generation number */
73
74 /* di_next_unlinked is the only non-core field in the old dinode */
75 __be32 di_next_unlinked;/* agi unlinked list ptr */
76
77 /* start of the extended dinode, writable fields */
78 __le32 di_crc; /* CRC of the inode */
79 __be64 di_changecount; /* number of attribute changes */
80 __be64 di_lsn; /* flush sequence */
81 __be64 di_flags2; /* more random flags */
82 __u8 di_pad2[16]; /* more padding for future expansion */
83
84 /* fields only written to during inode creation */
85 xfs_timestamp_t di_crtime; /* time created */
86 __be64 di_ino; /* inode number */
87 uuid_t di_uuid; /* UUID of the filesystem */
88
89 /* structure must be padded to 64 bit alignment */
90} xfs_dinode_t;
91
92#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
93
94#define DI_MAX_FLUSH 0xffff
95
96/*
97 * Size of the core inode on disk. Version 1 and 2 inodes have
98 * the same size, but version 3 has grown a few additional fields.
99 */
100static inline uint xfs_dinode_size(int version)
101{
102 if (version == 3)
103 return sizeof(struct xfs_dinode);
104 return offsetof(struct xfs_dinode, di_crc);
105}
106
107/*
108 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
109 * Since the pathconf interface is signed, we use 2^31 - 1 instead.
110 * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
111 */
112#define XFS_MAXLINK ((1U << 31) - 1U)
113#define XFS_MAXLINK_1 65535U
114
115/*
116 * Values for di_format
117 */
118typedef enum xfs_dinode_fmt {
119 XFS_DINODE_FMT_DEV, /* xfs_dev_t */
120 XFS_DINODE_FMT_LOCAL, /* bulk data */
121 XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
122 XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
123 XFS_DINODE_FMT_UUID /* uuid_t */
124} xfs_dinode_fmt_t;
125
126/*
127 * Inode minimum and maximum sizes.
128 */
129#define XFS_DINODE_MIN_LOG 8
130#define XFS_DINODE_MAX_LOG 11
131#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG)
132#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG)
133
134/*
135 * Inode size for given fs.
136 */
137#define XFS_LITINO(mp, version) \
138 ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
139
140/*
141 * Inode data & attribute fork sizes, per inode.
142 */
143#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
144#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
145
146#define XFS_DFORK_DSIZE(dip,mp) \
147 (XFS_DFORK_Q(dip) ? \
148 XFS_DFORK_BOFF(dip) : \
149 XFS_LITINO(mp, (dip)->di_version))
150#define XFS_DFORK_ASIZE(dip,mp) \
151 (XFS_DFORK_Q(dip) ? \
152 XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
153 0)
154#define XFS_DFORK_SIZE(dip,mp,w) \
155 ((w) == XFS_DATA_FORK ? \
156 XFS_DFORK_DSIZE(dip, mp) : \
157 XFS_DFORK_ASIZE(dip, mp))
158
159/*
160 * Return pointers to the data or attribute forks.
161 */
162#define XFS_DFORK_DPTR(dip) \
163 ((char *)dip + xfs_dinode_size(dip->di_version))
164#define XFS_DFORK_APTR(dip) \
165 (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
166#define XFS_DFORK_PTR(dip,w) \
167 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
168
169#define XFS_DFORK_FORMAT(dip,w) \
170 ((w) == XFS_DATA_FORK ? \
171 (dip)->di_format : \
172 (dip)->di_aformat)
173#define XFS_DFORK_NEXTENTS(dip,w) \
174 ((w) == XFS_DATA_FORK ? \
175 be32_to_cpu((dip)->di_nextents) : \
176 be16_to_cpu((dip)->di_anextents))
177
178#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr))
179
180/*
181 * For block and character special files the 32bit dev_t is stored at the
182 * beginning of the data fork.
183 */
184static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
185{
186 return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
187}
188
189static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
190{
191 *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
192}
193
194/*
195 * Values for di_flags
196 * There should be a one-to-one correspondence between these flags and the
197 * XFS_XFLAG_s.
198 */
199#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
200#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
201#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */
202#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */
203#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */
204#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */
205#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
206#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
207#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
208#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
209#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
210#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
211#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
212#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
213#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
214#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
215#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
216#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
217#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT)
218#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT)
219#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT)
220#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT)
221#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT)
222#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
223#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
224#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
225#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
226#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
227#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
228#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
229
230#ifdef CONFIG_XFS_RT
231#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
232#else
233#define XFS_IS_REALTIME_INODE(ip) (0)
234#endif
235
236#define XFS_DIFLAG_ANY \
237 (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
238 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
239 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
240 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
241 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
242
243#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 7075aaf131f4..a69fb3a1e161 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -20,9 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_inum.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 23#include "xfs_mount.h"
27#include "xfs_da_format.h" 24#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 25#include "xfs_da_btree.h"
@@ -34,10 +31,25 @@
34#include "xfs_dir2_priv.h" 31#include "xfs_dir2_priv.h"
35#include "xfs_error.h" 32#include "xfs_error.h"
36#include "xfs_trace.h" 33#include "xfs_trace.h"
37#include "xfs_dinode.h"
38 34
39struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; 35struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
40 36
37/*
38 * @mode, if set, indicates that the type field needs to be set up.
39 * This uses the transformation from file mode to DT_* as defined in linux/fs.h
40 * for file type specification. This will be propagated into the directory
41 * structure if appropriate for the given operation and filesystem config.
42 */
43const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
44 [0] = XFS_DIR3_FT_UNKNOWN,
45 [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
46 [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
47 [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
48 [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
49 [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
50 [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
51 [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
52};
41 53
42/* 54/*
43 * ASCII case-insensitive (ie. A-Z) support for directories that was 55 * ASCII case-insensitive (ie. A-Z) support for directories that was
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 4dff261e6ed5..e55353651f5b 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -32,6 +32,12 @@ struct xfs_dir2_data_unused;
32extern struct xfs_name xfs_name_dotdot; 32extern struct xfs_name xfs_name_dotdot;
33 33
34/* 34/*
35 * directory filetype conversion tables.
36 */
37#define S_SHIFT 12
38extern const unsigned char xfs_mode_to_ftype[];
39
40/*
35 * directory operations vector for encode/decode routines 41 * directory operations vector for encode/decode routines
36 */ 42 */
37struct xfs_dir_ops { 43struct xfs_dir_ops {
@@ -177,4 +183,138 @@ extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
177extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; 183extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
178extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; 184extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
179 185
186/*
187 * Directory offset/block conversion functions.
188 *
189 * DB blocks here are logical directory block numbers, not filesystem blocks.
190 */
191
192/*
193 * Convert dataptr to byte in file space
194 */
195static inline xfs_dir2_off_t
196xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
197{
198 return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
199}
200
201/*
202 * Convert byte in file space to dataptr. It had better be aligned.
203 */
204static inline xfs_dir2_dataptr_t
205xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
206{
207 return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
208}
209
210/*
211 * Convert byte in space to (DB) block
212 */
213static inline xfs_dir2_db_t
214xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
215{
216 return (xfs_dir2_db_t)(by >> geo->blklog);
217}
218
219/*
220 * Convert dataptr to a block number
221 */
222static inline xfs_dir2_db_t
223xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
224{
225 return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
226}
227
228/*
229 * Convert byte in space to offset in a block
230 */
231static inline xfs_dir2_data_aoff_t
232xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
233{
234 return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
235}
236
237/*
238 * Convert dataptr to a byte offset in a block
239 */
240static inline xfs_dir2_data_aoff_t
241xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
242{
243 return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
244}
245
246/*
247 * Convert block and offset to byte in space
248 */
249static inline xfs_dir2_off_t
250xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
251 xfs_dir2_data_aoff_t o)
252{
253 return ((xfs_dir2_off_t)db << geo->blklog) + o;
254}
255
256/*
257 * Convert block (DB) to block (dablk)
258 */
259static inline xfs_dablk_t
260xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
261{
262 return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
263}
264
265/*
266 * Convert byte in space to (DA) block
267 */
268static inline xfs_dablk_t
269xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
270{
271 return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
272}
273
274/*
275 * Convert block and offset to dataptr
276 */
277static inline xfs_dir2_dataptr_t
278xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
279 xfs_dir2_data_aoff_t o)
280{
281 return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
282}
283
284/*
285 * Convert block (dablk) to block (DB)
286 */
287static inline xfs_dir2_db_t
288xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
289{
290 return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
291}
292
293/*
294 * Convert block (dablk) to byte offset in space
295 */
296static inline xfs_dir2_off_t
297xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
298{
299 return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
300}
301
302/*
303 * Directory tail pointer accessor functions. Based on block geometry.
304 */
305static inline struct xfs_dir2_block_tail *
306xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
307{
308 return ((struct xfs_dir2_block_tail *)
309 ((char *)hdr + geo->blksize)) - 1;
310}
311
312static inline struct xfs_dir2_leaf_tail *
313xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
314{
315 return (struct xfs_dir2_leaf_tail *)
316 ((char *)lp + geo->blksize -
317 sizeof(struct xfs_dir2_leaf_tail));
318}
319
180#endif /* __XFS_DIR2_H__ */ 320#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9628ceccfa02..9354e190b82e 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 26#include "xfs_da_btree.h"
@@ -36,7 +34,6 @@
36#include "xfs_error.h" 34#include "xfs_error.h"
37#include "xfs_trace.h" 35#include "xfs_trace.h"
38#include "xfs_cksum.h" 36#include "xfs_cksum.h"
39#include "xfs_dinode.h"
40 37
41/* 38/*
42 * Local function prototypes. 39 * Local function prototypes.
@@ -353,7 +350,6 @@ xfs_dir2_block_addname(
353 int low; /* low index for binary srch */ 350 int low; /* low index for binary srch */
354 int lowstale; /* low stale index */ 351 int lowstale; /* low stale index */
355 int mid=0; /* midpoint for binary srch */ 352 int mid=0; /* midpoint for binary srch */
356 xfs_mount_t *mp; /* filesystem mount point */
357 int needlog; /* need to log header */ 353 int needlog; /* need to log header */
358 int needscan; /* need to rescan freespace */ 354 int needscan; /* need to rescan freespace */
359 __be16 *tagp; /* pointer to tag value */ 355 __be16 *tagp; /* pointer to tag value */
@@ -363,7 +359,6 @@ xfs_dir2_block_addname(
363 359
364 dp = args->dp; 360 dp = args->dp;
365 tp = args->trans; 361 tp = args->trans;
366 mp = dp->i_mount;
367 362
368 /* Read the (one and only) directory block into bp. */ 363 /* Read the (one and only) directory block into bp. */
369 error = xfs_dir3_block_read(tp, dp, &bp); 364 error = xfs_dir3_block_read(tp, dp, &bp);
@@ -618,7 +613,6 @@ xfs_dir2_block_lookup(
618 xfs_inode_t *dp; /* incore inode */ 613 xfs_inode_t *dp; /* incore inode */
619 int ent; /* entry index */ 614 int ent; /* entry index */
620 int error; /* error return value */ 615 int error; /* error return value */
621 xfs_mount_t *mp; /* filesystem mount point */
622 616
623 trace_xfs_dir2_block_lookup(args); 617 trace_xfs_dir2_block_lookup(args);
624 618
@@ -629,7 +623,6 @@ xfs_dir2_block_lookup(
629 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) 623 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
630 return error; 624 return error;
631 dp = args->dp; 625 dp = args->dp;
632 mp = dp->i_mount;
633 hdr = bp->b_addr; 626 hdr = bp->b_addr;
634 xfs_dir3_data_check(dp, bp); 627 xfs_dir3_data_check(dp, bp);
635 btp = xfs_dir2_block_tail_p(args->geo, hdr); 628 btp = xfs_dir2_block_tail_p(args->geo, hdr);
@@ -770,7 +763,6 @@ xfs_dir2_block_removename(
770 xfs_inode_t *dp; /* incore inode */ 763 xfs_inode_t *dp; /* incore inode */
771 int ent; /* block leaf entry index */ 764 int ent; /* block leaf entry index */
772 int error; /* error return value */ 765 int error; /* error return value */
773 xfs_mount_t *mp; /* filesystem mount point */
774 int needlog; /* need to log block header */ 766 int needlog; /* need to log block header */
775 int needscan; /* need to fixup bestfree */ 767 int needscan; /* need to fixup bestfree */
776 xfs_dir2_sf_hdr_t sfh; /* shortform header */ 768 xfs_dir2_sf_hdr_t sfh; /* shortform header */
@@ -788,7 +780,6 @@ xfs_dir2_block_removename(
788 } 780 }
789 dp = args->dp; 781 dp = args->dp;
790 tp = args->trans; 782 tp = args->trans;
791 mp = dp->i_mount;
792 hdr = bp->b_addr; 783 hdr = bp->b_addr;
793 btp = xfs_dir2_block_tail_p(args->geo, hdr); 784 btp = xfs_dir2_block_tail_p(args->geo, hdr);
794 blp = xfs_dir2_block_leaf_p(btp); 785 blp = xfs_dir2_block_leaf_p(btp);
@@ -852,7 +843,6 @@ xfs_dir2_block_replace(
852 xfs_inode_t *dp; /* incore inode */ 843 xfs_inode_t *dp; /* incore inode */
853 int ent; /* leaf entry index */ 844 int ent; /* leaf entry index */
854 int error; /* error return value */ 845 int error; /* error return value */
855 xfs_mount_t *mp; /* filesystem mount point */
856 846
857 trace_xfs_dir2_block_replace(args); 847 trace_xfs_dir2_block_replace(args);
858 848
@@ -864,7 +854,6 @@ xfs_dir2_block_replace(
864 return error; 854 return error;
865 } 855 }
866 dp = args->dp; 856 dp = args->dp;
867 mp = dp->i_mount;
868 hdr = bp->b_addr; 857 hdr = bp->b_addr;
869 btp = xfs_dir2_block_tail_p(args->geo, hdr); 858 btp = xfs_dir2_block_tail_p(args->geo, hdr);
870 blp = xfs_dir2_block_leaf_p(btp); 859 blp = xfs_dir2_block_leaf_p(btp);
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index fdd803fecb8e..5ff31be9b1cd 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 26#include "xfs_da_btree.h"
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index a19174eb3cb2..106119955400 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 26#include "xfs_da_btree.h"
@@ -384,7 +382,6 @@ xfs_dir2_block_to_leaf(
384 xfs_dir2_db_t ldb; /* leaf block's bno */ 382 xfs_dir2_db_t ldb; /* leaf block's bno */
385 xfs_dir2_leaf_t *leaf; /* leaf structure */ 383 xfs_dir2_leaf_t *leaf; /* leaf structure */
386 xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ 384 xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
387 xfs_mount_t *mp; /* filesystem mount point */
388 int needlog; /* need to log block header */ 385 int needlog; /* need to log block header */
389 int needscan; /* need to rescan bestfree */ 386 int needscan; /* need to rescan bestfree */
390 xfs_trans_t *tp; /* transaction pointer */ 387 xfs_trans_t *tp; /* transaction pointer */
@@ -395,7 +392,6 @@ xfs_dir2_block_to_leaf(
395 trace_xfs_dir2_block_to_leaf(args); 392 trace_xfs_dir2_block_to_leaf(args);
396 393
397 dp = args->dp; 394 dp = args->dp;
398 mp = dp->i_mount;
399 tp = args->trans; 395 tp = args->trans;
400 /* 396 /*
401 * Add the leaf block to the inode. 397 * Add the leaf block to the inode.
@@ -626,7 +622,6 @@ xfs_dir2_leaf_addname(
626 int lfloghigh; /* high leaf logging index */ 622 int lfloghigh; /* high leaf logging index */
627 int lowstale; /* index of prev stale leaf */ 623 int lowstale; /* index of prev stale leaf */
628 xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ 624 xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
629 xfs_mount_t *mp; /* filesystem mount point */
630 int needbytes; /* leaf block bytes needed */ 625 int needbytes; /* leaf block bytes needed */
631 int needlog; /* need to log data header */ 626 int needlog; /* need to log data header */
632 int needscan; /* need to rescan data free */ 627 int needscan; /* need to rescan data free */
@@ -641,7 +636,6 @@ xfs_dir2_leaf_addname(
641 636
642 dp = args->dp; 637 dp = args->dp;
643 tp = args->trans; 638 tp = args->trans;
644 mp = dp->i_mount;
645 639
646 error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); 640 error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
647 if (error) 641 if (error)
@@ -1356,11 +1350,9 @@ xfs_dir2_leaf_removename(
1356 xfs_dir2_leaf_t *leaf; /* leaf structure */ 1350 xfs_dir2_leaf_t *leaf; /* leaf structure */
1357 xfs_dir2_leaf_entry_t *lep; /* leaf entry */ 1351 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1358 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ 1352 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1359 xfs_mount_t *mp; /* filesystem mount point */
1360 int needlog; /* need to log data header */ 1353 int needlog; /* need to log data header */
1361 int needscan; /* need to rescan data frees */ 1354 int needscan; /* need to rescan data frees */
1362 xfs_dir2_data_off_t oldbest; /* old value of best free */ 1355 xfs_dir2_data_off_t oldbest; /* old value of best free */
1363 xfs_trans_t *tp; /* transaction pointer */
1364 struct xfs_dir2_data_free *bf; /* bestfree table */ 1356 struct xfs_dir2_data_free *bf; /* bestfree table */
1365 struct xfs_dir2_leaf_entry *ents; 1357 struct xfs_dir2_leaf_entry *ents;
1366 struct xfs_dir3_icleaf_hdr leafhdr; 1358 struct xfs_dir3_icleaf_hdr leafhdr;
@@ -1374,8 +1366,6 @@ xfs_dir2_leaf_removename(
1374 return error; 1366 return error;
1375 } 1367 }
1376 dp = args->dp; 1368 dp = args->dp;
1377 tp = args->trans;
1378 mp = dp->i_mount;
1379 leaf = lbp->b_addr; 1369 leaf = lbp->b_addr;
1380 hdr = dbp->b_addr; 1370 hdr = dbp->b_addr;
1381 xfs_dir3_data_check(dp, dbp); 1371 xfs_dir3_data_check(dp, dbp);
@@ -1607,11 +1597,9 @@ xfs_dir2_leaf_trim_data(
1607 int error; /* error return value */ 1597 int error; /* error return value */
1608 xfs_dir2_leaf_t *leaf; /* leaf structure */ 1598 xfs_dir2_leaf_t *leaf; /* leaf structure */
1609 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ 1599 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1610 xfs_mount_t *mp; /* filesystem mount point */
1611 xfs_trans_t *tp; /* transaction pointer */ 1600 xfs_trans_t *tp; /* transaction pointer */
1612 1601
1613 dp = args->dp; 1602 dp = args->dp;
1614 mp = dp->i_mount;
1615 tp = args->trans; 1603 tp = args->trans;
1616 /* 1604 /*
1617 * Read the offending data block. We need its buffer. 1605 * Read the offending data block. We need its buffer.
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 2ae6ac2c11ae..41b80d3d3877 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 26#include "xfs_da_btree.h"
@@ -297,7 +295,6 @@ xfs_dir2_leaf_to_node(
297 int i; /* leaf freespace index */ 295 int i; /* leaf freespace index */
298 xfs_dir2_leaf_t *leaf; /* leaf structure */ 296 xfs_dir2_leaf_t *leaf; /* leaf structure */
299 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ 297 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
300 xfs_mount_t *mp; /* filesystem mount point */
301 int n; /* count of live freespc ents */ 298 int n; /* count of live freespc ents */
302 xfs_dir2_data_off_t off; /* freespace entry value */ 299 xfs_dir2_data_off_t off; /* freespace entry value */
303 __be16 *to; /* pointer to freespace entry */ 300 __be16 *to; /* pointer to freespace entry */
@@ -307,7 +304,6 @@ xfs_dir2_leaf_to_node(
307 trace_xfs_dir2_leaf_to_node(args); 304 trace_xfs_dir2_leaf_to_node(args);
308 305
309 dp = args->dp; 306 dp = args->dp;
310 mp = dp->i_mount;
311 tp = args->trans; 307 tp = args->trans;
312 /* 308 /*
313 * Add a freespace block to the directory. 309 * Add a freespace block to the directory.
@@ -387,16 +383,12 @@ xfs_dir2_leafn_add(
387 int lfloghigh; /* high leaf entry logging */ 383 int lfloghigh; /* high leaf entry logging */
388 int lfloglow; /* low leaf entry logging */ 384 int lfloglow; /* low leaf entry logging */
389 int lowstale; /* previous stale entry */ 385 int lowstale; /* previous stale entry */
390 xfs_mount_t *mp; /* filesystem mount point */
391 xfs_trans_t *tp; /* transaction pointer */
392 struct xfs_dir3_icleaf_hdr leafhdr; 386 struct xfs_dir3_icleaf_hdr leafhdr;
393 struct xfs_dir2_leaf_entry *ents; 387 struct xfs_dir2_leaf_entry *ents;
394 388
395 trace_xfs_dir2_leafn_add(args, index); 389 trace_xfs_dir2_leafn_add(args, index);
396 390
397 dp = args->dp; 391 dp = args->dp;
398 mp = dp->i_mount;
399 tp = args->trans;
400 leaf = bp->b_addr; 392 leaf = bp->b_addr;
401 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); 393 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
402 ents = dp->d_ops->leaf_ents_p(leaf); 394 ents = dp->d_ops->leaf_ents_p(leaf);
@@ -1170,7 +1162,6 @@ xfs_dir2_leafn_remove(
1170 xfs_dir2_leaf_entry_t *lep; /* leaf entry */ 1162 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1171 int longest; /* longest data free entry */ 1163 int longest; /* longest data free entry */
1172 int off; /* data block entry offset */ 1164 int off; /* data block entry offset */
1173 xfs_mount_t *mp; /* filesystem mount point */
1174 int needlog; /* need to log data header */ 1165 int needlog; /* need to log data header */
1175 int needscan; /* need to rescan data frees */ 1166 int needscan; /* need to rescan data frees */
1176 xfs_trans_t *tp; /* transaction pointer */ 1167 xfs_trans_t *tp; /* transaction pointer */
@@ -1182,7 +1173,6 @@ xfs_dir2_leafn_remove(
1182 1173
1183 dp = args->dp; 1174 dp = args->dp;
1184 tp = args->trans; 1175 tp = args->trans;
1185 mp = dp->i_mount;
1186 leaf = bp->b_addr; 1176 leaf = bp->b_addr;
1187 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); 1177 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
1188 ents = dp->d_ops->leaf_ents_p(leaf); 1178 ents = dp->d_ops->leaf_ents_p(leaf);
@@ -1323,7 +1313,6 @@ xfs_dir2_leafn_split(
1323 xfs_da_args_t *args; /* operation arguments */ 1313 xfs_da_args_t *args; /* operation arguments */
1324 xfs_dablk_t blkno; /* new leaf block number */ 1314 xfs_dablk_t blkno; /* new leaf block number */
1325 int error; /* error return value */ 1315 int error; /* error return value */
1326 xfs_mount_t *mp; /* filesystem mount point */
1327 struct xfs_inode *dp; 1316 struct xfs_inode *dp;
1328 1317
1329 /* 1318 /*
@@ -1331,7 +1320,6 @@ xfs_dir2_leafn_split(
1331 */ 1320 */
1332 args = state->args; 1321 args = state->args;
1333 dp = args->dp; 1322 dp = args->dp;
1334 mp = dp->i_mount;
1335 ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); 1323 ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
1336 error = xfs_da_grow_inode(args, &blkno); 1324 error = xfs_da_grow_inode(args, &blkno);
1337 if (error) { 1325 if (error) {
@@ -2231,12 +2219,10 @@ xfs_dir2_node_trim_free(
2231 xfs_inode_t *dp; /* incore directory inode */ 2219 xfs_inode_t *dp; /* incore directory inode */
2232 int error; /* error return code */ 2220 int error; /* error return code */
2233 xfs_dir2_free_t *free; /* freespace structure */ 2221 xfs_dir2_free_t *free; /* freespace structure */
2234 xfs_mount_t *mp; /* filesystem mount point */
2235 xfs_trans_t *tp; /* transaction pointer */ 2222 xfs_trans_t *tp; /* transaction pointer */
2236 struct xfs_dir3_icfree_hdr freehdr; 2223 struct xfs_dir3_icfree_hdr freehdr;
2237 2224
2238 dp = args->dp; 2225 dp = args->dp;
2239 mp = dp->i_mount;
2240 tp = args->trans; 2226 tp = args->trans;
2241 /* 2227 /*
2242 * Read the freespace block. 2228 * Read the freespace block.
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 27ce0794d196..ef9f6ead96a4 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -20,140 +20,6 @@
20 20
21struct dir_context; 21struct dir_context;
22 22
23/*
24 * Directory offset/block conversion functions.
25 *
26 * DB blocks here are logical directory block numbers, not filesystem blocks.
27 */
28
29/*
30 * Convert dataptr to byte in file space
31 */
32static inline xfs_dir2_off_t
33xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
34{
35 return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
36}
37
38/*
39 * Convert byte in file space to dataptr. It had better be aligned.
40 */
41static inline xfs_dir2_dataptr_t
42xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
43{
44 return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
45}
46
47/*
48 * Convert byte in space to (DB) block
49 */
50static inline xfs_dir2_db_t
51xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
52{
53 return (xfs_dir2_db_t)(by >> geo->blklog);
54}
55
56/*
57 * Convert dataptr to a block number
58 */
59static inline xfs_dir2_db_t
60xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
61{
62 return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
63}
64
65/*
66 * Convert byte in space to offset in a block
67 */
68static inline xfs_dir2_data_aoff_t
69xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
70{
71 return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
72}
73
74/*
75 * Convert dataptr to a byte offset in a block
76 */
77static inline xfs_dir2_data_aoff_t
78xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
79{
80 return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
81}
82
83/*
84 * Convert block and offset to byte in space
85 */
86static inline xfs_dir2_off_t
87xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
88 xfs_dir2_data_aoff_t o)
89{
90 return ((xfs_dir2_off_t)db << geo->blklog) + o;
91}
92
93/*
94 * Convert block (DB) to block (dablk)
95 */
96static inline xfs_dablk_t
97xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
98{
99 return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
100}
101
102/*
103 * Convert byte in space to (DA) block
104 */
105static inline xfs_dablk_t
106xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
107{
108 return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
109}
110
111/*
112 * Convert block and offset to dataptr
113 */
114static inline xfs_dir2_dataptr_t
115xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
116 xfs_dir2_data_aoff_t o)
117{
118 return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
119}
120
121/*
122 * Convert block (dablk) to block (DB)
123 */
124static inline xfs_dir2_db_t
125xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
126{
127 return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
128}
129
130/*
131 * Convert block (dablk) to byte offset in space
132 */
133static inline xfs_dir2_off_t
134xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
135{
136 return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
137}
138
139/*
140 * Directory tail pointer accessor functions. Based on block geometry.
141 */
142static inline struct xfs_dir2_block_tail *
143xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
144{
145 return ((struct xfs_dir2_block_tail *)
146 ((char *)hdr + geo->blksize)) - 1;
147}
148
149static inline struct xfs_dir2_leaf_tail *
150xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
151{
152 return (struct xfs_dir2_leaf_tail *)
153 ((char *)lp + geo->blksize -
154 sizeof(struct xfs_dir2_leaf_tail));
155}
156
157/* xfs_dir2.c */ 23/* xfs_dir2.c */
158extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); 24extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
159extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, 25extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
@@ -161,12 +27,6 @@ extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
161extern int xfs_dir_cilookup_result(struct xfs_da_args *args, 27extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
162 const unsigned char *name, int len); 28 const unsigned char *name, int len);
163 29
164#define S_SHIFT 12
165extern const unsigned char xfs_mode_to_ftype[];
166
167extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
168 __uint8_t filetype);
169
170 30
171/* xfs_dir2_block.c */ 31/* xfs_dir2_block.c */
172extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, 32extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 5079e051ef08..974d62e677f4 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_da_format.h" 24#include "xfs_da_format.h"
27#include "xfs_da_btree.h" 25#include "xfs_da_btree.h"
@@ -32,7 +30,6 @@
32#include "xfs_dir2.h" 30#include "xfs_dir2.h"
33#include "xfs_dir2_priv.h" 31#include "xfs_dir2_priv.h"
34#include "xfs_trace.h" 32#include "xfs_trace.h"
35#include "xfs_dinode.h"
36 33
37/* 34/*
38 * Prototypes for internal functions. 35 * Prototypes for internal functions.
@@ -455,13 +452,11 @@ xfs_dir2_sf_addname_hard(
455 xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ 452 xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */
456 xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ 453 xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
457 xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ 454 xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */
458 struct xfs_mount *mp;
459 455
460 /* 456 /*
461 * Copy the old directory to the stack buffer. 457 * Copy the old directory to the stack buffer.
462 */ 458 */
463 dp = args->dp; 459 dp = args->dp;
464 mp = dp->i_mount;
465 460
466 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 461 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
467 old_isize = (int)dp->i_d.di_size; 462 old_isize = (int)dp->i_d.di_size;
@@ -542,7 +537,6 @@ xfs_dir2_sf_addname_pick(
542 xfs_inode_t *dp; /* incore directory inode */ 537 xfs_inode_t *dp; /* incore directory inode */
543 int holefit; /* found hole it will fit in */ 538 int holefit; /* found hole it will fit in */
544 int i; /* entry number */ 539 int i; /* entry number */
545 xfs_mount_t *mp; /* filesystem mount point */
546 xfs_dir2_data_aoff_t offset; /* data block offset */ 540 xfs_dir2_data_aoff_t offset; /* data block offset */
547 xfs_dir2_sf_entry_t *sfep; /* shortform entry */ 541 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
548 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ 542 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
@@ -550,7 +544,6 @@ xfs_dir2_sf_addname_pick(
550 int used; /* data bytes used */ 544 int used; /* data bytes used */
551 545
552 dp = args->dp; 546 dp = args->dp;
553 mp = dp->i_mount;
554 547
555 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 548 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
556 size = dp->d_ops->data_entsize(args->namelen); 549 size = dp->d_ops->data_entsize(args->namelen);
@@ -616,10 +609,8 @@ xfs_dir2_sf_check(
616 int offset; /* data offset */ 609 int offset; /* data offset */
617 xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ 610 xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
618 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ 611 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
619 struct xfs_mount *mp;
620 612
621 dp = args->dp; 613 dp = args->dp;
622 mp = dp->i_mount;
623 614
624 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 615 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
625 offset = dp->d_ops->data_first_offset; 616 offset = dp->d_ops->data_first_offset;
@@ -1016,12 +1007,10 @@ xfs_dir2_sf_toino4(
1016 int oldsize; /* old inode size */ 1007 int oldsize; /* old inode size */
1017 xfs_dir2_sf_entry_t *sfep; /* new sf entry */ 1008 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1018 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ 1009 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
1019 struct xfs_mount *mp;
1020 1010
1021 trace_xfs_dir2_sf_toino4(args); 1011 trace_xfs_dir2_sf_toino4(args);
1022 1012
1023 dp = args->dp; 1013 dp = args->dp;
1024 mp = dp->i_mount;
1025 1014
1026 /* 1015 /*
1027 * Copy the old directory to the buffer. 1016 * Copy the old directory to the buffer.
@@ -1094,12 +1083,10 @@ xfs_dir2_sf_toino8(
1094 int oldsize; /* old inode size */ 1083 int oldsize; /* old inode size */
1095 xfs_dir2_sf_entry_t *sfep; /* new sf entry */ 1084 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1096 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ 1085 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
1097 struct xfs_mount *mp;
1098 1086
1099 trace_xfs_dir2_sf_toino8(args); 1087 trace_xfs_dir2_sf_toino8(args);
1100 1088
1101 dp = args->dp; 1089 dp = args->dp;
1102 mp = dp->i_mount;
1103 1090
1104 /* 1091 /*
1105 * Copy the old directory to the buffer. 1092 * Copy the old directory to the buffer.
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index bb969337efc8..6fbf2d853a54 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -22,8 +22,6 @@
22#include "xfs_format.h" 22#include "xfs_format.h"
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 7e42bba9a420..fbd6da263571 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -34,6 +34,1077 @@ struct xfs_buf;
34struct xfs_ifork; 34struct xfs_ifork;
35 35
36/* 36/*
37 * Super block
38 * Fits into a sector-sized buffer at address 0 of each allocation group.
39 * Only the first of these is ever updated except during growfs.
40 */
41#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
42#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
43#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
44#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
45#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
46#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
47#define XFS_SB_VERSION_NUMBITS 0x000f
48#define XFS_SB_VERSION_ALLFBITS 0xfff0
49#define XFS_SB_VERSION_ATTRBIT 0x0010
50#define XFS_SB_VERSION_NLINKBIT 0x0020
51#define XFS_SB_VERSION_QUOTABIT 0x0040
52#define XFS_SB_VERSION_ALIGNBIT 0x0080
53#define XFS_SB_VERSION_DALIGNBIT 0x0100
54#define XFS_SB_VERSION_SHAREDBIT 0x0200
55#define XFS_SB_VERSION_LOGV2BIT 0x0400
56#define XFS_SB_VERSION_SECTORBIT 0x0800
57#define XFS_SB_VERSION_EXTFLGBIT 0x1000
58#define XFS_SB_VERSION_DIRV2BIT 0x2000
59#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
60#define XFS_SB_VERSION_MOREBITSBIT 0x8000
61
62/*
63 * Supported feature bit list is just all bits in the versionnum field because
64 * we've used them all up and understand them all. Except, of course, for the
65 * shared superblock bit, which nobody knows what it does and so is unsupported.
66 */
67#define XFS_SB_VERSION_OKBITS \
68 ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
69 ~XFS_SB_VERSION_SHAREDBIT)
70
71/*
72 * There are two words to hold XFS "feature" bits: the original
73 * word, sb_versionnum, and sb_features2. Whenever a bit is set in
74 * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
75 *
76 * These defines represent bits in sb_features2.
77 */
78#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
79#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
84#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
85#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
86
87#define XFS_SB_VERSION2_OKBITS \
88 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
89 XFS_SB_VERSION2_ATTR2BIT | \
90 XFS_SB_VERSION2_PROJID32BIT | \
91 XFS_SB_VERSION2_FTYPE)
92
93/*
94 * Superblock - in core version. Must match the ondisk version below.
95 * Must be padded to 64 bit alignment.
96 */
97typedef struct xfs_sb {
98 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
99 __uint32_t sb_blocksize; /* logical block size, bytes */
100 xfs_rfsblock_t sb_dblocks; /* number of data blocks */
101 xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
102 xfs_rtblock_t sb_rextents; /* number of realtime extents */
103 uuid_t sb_uuid; /* file system unique id */
104 xfs_fsblock_t sb_logstart; /* starting block of log if internal */
105 xfs_ino_t sb_rootino; /* root inode number */
106 xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
107 xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
108 xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
109 xfs_agblock_t sb_agblocks; /* size of an allocation group */
110 xfs_agnumber_t sb_agcount; /* number of allocation groups */
111 xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
112 xfs_extlen_t sb_logblocks; /* number of log blocks */
113 __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
114 __uint16_t sb_sectsize; /* volume sector size, bytes */
115 __uint16_t sb_inodesize; /* inode size, bytes */
116 __uint16_t sb_inopblock; /* inodes per block */
117 char sb_fname[12]; /* file system name */
118 __uint8_t sb_blocklog; /* log2 of sb_blocksize */
119 __uint8_t sb_sectlog; /* log2 of sb_sectsize */
120 __uint8_t sb_inodelog; /* log2 of sb_inodesize */
121 __uint8_t sb_inopblog; /* log2 of sb_inopblock */
122 __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
123 __uint8_t sb_rextslog; /* log2 of sb_rextents */
124 __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
125 __uint8_t sb_imax_pct; /* max % of fs for inode space */
126 /* statistics */
127 /*
128 * These fields must remain contiguous. If you really
129 * want to change their layout, make sure you fix the
130 * code in xfs_trans_apply_sb_deltas().
131 */
132 __uint64_t sb_icount; /* allocated inodes */
133 __uint64_t sb_ifree; /* free inodes */
134 __uint64_t sb_fdblocks; /* free data blocks */
135 __uint64_t sb_frextents; /* free realtime extents */
136 /*
137 * End contiguous fields.
138 */
139 xfs_ino_t sb_uquotino; /* user quota inode */
140 xfs_ino_t sb_gquotino; /* group quota inode */
141 __uint16_t sb_qflags; /* quota flags */
142 __uint8_t sb_flags; /* misc. flags */
143 __uint8_t sb_shared_vn; /* shared version number */
144 xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
145 __uint32_t sb_unit; /* stripe or raid unit */
146 __uint32_t sb_width; /* stripe or raid width */
147 __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
148 __uint8_t sb_logsectlog; /* log2 of the log sector size */
149 __uint16_t sb_logsectsize; /* sector size for the log, bytes */
150 __uint32_t sb_logsunit; /* stripe unit size for the log */
151 __uint32_t sb_features2; /* additional feature bits */
152
153 /*
154 * bad features2 field as a result of failing to pad the sb
155 * structure to 64 bits. Some machines will be using this field
156 * for features2 bits. Easiest just to mark it bad and not use
157 * it for anything else.
158 */
159 __uint32_t sb_bad_features2;
160
161 /* version 5 superblock fields start here */
162
163 /* feature masks */
164 __uint32_t sb_features_compat;
165 __uint32_t sb_features_ro_compat;
166 __uint32_t sb_features_incompat;
167 __uint32_t sb_features_log_incompat;
168
169 __uint32_t sb_crc; /* superblock crc */
170 __uint32_t sb_pad;
171
172 xfs_ino_t sb_pquotino; /* project quota inode */
173 xfs_lsn_t sb_lsn; /* last write sequence */
174
175 /* must be padded to 64 bit alignment */
176} xfs_sb_t;
177
178#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
179
180/*
181 * Superblock - on disk version. Must match the in core version above.
182 * Must be padded to 64 bit alignment.
183 */
184typedef struct xfs_dsb {
185 __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
186 __be32 sb_blocksize; /* logical block size, bytes */
187 __be64 sb_dblocks; /* number of data blocks */
188 __be64 sb_rblocks; /* number of realtime blocks */
189 __be64 sb_rextents; /* number of realtime extents */
190 uuid_t sb_uuid; /* file system unique id */
191 __be64 sb_logstart; /* starting block of log if internal */
192 __be64 sb_rootino; /* root inode number */
193 __be64 sb_rbmino; /* bitmap inode for realtime extents */
194 __be64 sb_rsumino; /* summary inode for rt bitmap */
195 __be32 sb_rextsize; /* realtime extent size, blocks */
196 __be32 sb_agblocks; /* size of an allocation group */
197 __be32 sb_agcount; /* number of allocation groups */
198 __be32 sb_rbmblocks; /* number of rt bitmap blocks */
199 __be32 sb_logblocks; /* number of log blocks */
200 __be16 sb_versionnum; /* header version == XFS_SB_VERSION */
201 __be16 sb_sectsize; /* volume sector size, bytes */
202 __be16 sb_inodesize; /* inode size, bytes */
203 __be16 sb_inopblock; /* inodes per block */
204 char sb_fname[12]; /* file system name */
205 __u8 sb_blocklog; /* log2 of sb_blocksize */
206 __u8 sb_sectlog; /* log2 of sb_sectsize */
207 __u8 sb_inodelog; /* log2 of sb_inodesize */
208 __u8 sb_inopblog; /* log2 of sb_inopblock */
209 __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */
210 __u8 sb_rextslog; /* log2 of sb_rextents */
211 __u8 sb_inprogress; /* mkfs is in progress, don't mount */
212 __u8 sb_imax_pct; /* max % of fs for inode space */
213 /* statistics */
214 /*
215 * These fields must remain contiguous. If you really
216 * want to change their layout, make sure you fix the
217 * code in xfs_trans_apply_sb_deltas().
218 */
219 __be64 sb_icount; /* allocated inodes */
220 __be64 sb_ifree; /* free inodes */
221 __be64 sb_fdblocks; /* free data blocks */
222 __be64 sb_frextents; /* free realtime extents */
223 /*
224 * End contiguous fields.
225 */
226 __be64 sb_uquotino; /* user quota inode */
227 __be64 sb_gquotino; /* group quota inode */
228 __be16 sb_qflags; /* quota flags */
229 __u8 sb_flags; /* misc. flags */
230 __u8 sb_shared_vn; /* shared version number */
231 __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */
232 __be32 sb_unit; /* stripe or raid unit */
233 __be32 sb_width; /* stripe or raid width */
234 __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */
235 __u8 sb_logsectlog; /* log2 of the log sector size */
236 __be16 sb_logsectsize; /* sector size for the log, bytes */
237 __be32 sb_logsunit; /* stripe unit size for the log */
238 __be32 sb_features2; /* additional feature bits */
239 /*
240 * bad features2 field as a result of failing to pad the sb
241 * structure to 64 bits. Some machines will be using this field
242 * for features2 bits. Easiest just to mark it bad and not use
243 * it for anything else.
244 */
245 __be32 sb_bad_features2;
246
247 /* version 5 superblock fields start here */
248
249 /* feature masks */
250 __be32 sb_features_compat;
251 __be32 sb_features_ro_compat;
252 __be32 sb_features_incompat;
253 __be32 sb_features_log_incompat;
254
255 __le32 sb_crc; /* superblock crc */
256 __be32 sb_pad;
257
258 __be64 sb_pquotino; /* project quota inode */
259 __be64 sb_lsn; /* last write sequence */
260
261 /* must be padded to 64 bit alignment */
262} xfs_dsb_t;
263
264/*
265 * Sequence number values for the fields.
266 */
267typedef enum {
268 XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
269 XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
270 XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
271 XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
272 XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
273 XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
274 XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
275 XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
276 XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
277 XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
278 XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
279 XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
280 XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
281 XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
282 XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
283 XFS_SBS_PQUOTINO, XFS_SBS_LSN,
284 XFS_SBS_FIELDCOUNT
285} xfs_sb_field_t;
286
287/*
288 * Mask values, defined based on the xfs_sb_field_t values.
289 * Only define the ones we're using.
290 */
291#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
292#define XFS_SB_UUID XFS_SB_MVAL(UUID)
293#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
294#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
295#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
296#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
297#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
298#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
299#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
300#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
301#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
302#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
303#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
304#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
305#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
306#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
307#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2)
308#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2)
309#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
310#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
311#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
312#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
313#define XFS_SB_CRC XFS_SB_MVAL(CRC)
314#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
315#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
316#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
317#define XFS_SB_MOD_BITS \
318 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
319 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
320 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
321 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
322 XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
323 XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
324 XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
325
326
327/*
328 * Misc. Flags - warning - these will be cleared by xfs_repair unless
329 * a feature bit is set when the flag is used.
330 */
331#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
332#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
333
334/*
335 * define max. shared version we can interoperate with
336 */
337#define XFS_SB_MAX_SHARED_VN 0
338
339#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
340
341/*
342 * The first XFS version we support is a v4 superblock with V2 directories.
343 */
344static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
345{
346 if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
347 return false;
348
349 /* check for unknown features in the fs */
350 if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
351 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
352 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
353 return false;
354
355 return true;
356}
357
358static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
359{
360 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
361 return true;
362 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
363 return xfs_sb_good_v4_features(sbp);
364 return false;
365}
366
367/*
368 * Detect a mismatched features2 field. Older kernels read/wrote
369 * this into the wrong slot, so to be safe we keep them in sync.
370 */
371static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
372{
373 return sbp->sb_bad_features2 != sbp->sb_features2;
374}
375
376static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
377{
378 return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
379}
380
381static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
382{
383 sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
384}
385
386static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
387{
388 return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
389}
390
391static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
392{
393 sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
394}
395
396static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
397{
398 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
399 (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
400}
401
402static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
403{
404 return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
405}
406
407static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
408{
409 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
410 (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
411}
412
413static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
414{
415 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
416 (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
417}
418
419static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
420{
421 return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
422}
423
424static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
425{
426 return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
427}
428
429static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
430{
431 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
432 (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
433}
434
435/*
436 * sb_features2 bit version macros.
437 */
438static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
439{
440 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
441 (xfs_sb_version_hasmorebits(sbp) &&
442 (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
443}
444
445static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
446{
447 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
448 (xfs_sb_version_hasmorebits(sbp) &&
449 (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
450}
451
452static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
453{
454 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
455 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
456 sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
457}
458
459static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
460{
461 sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
462 sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
463 if (!sbp->sb_features2)
464 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
465}
466
467static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
468{
469 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
470 (xfs_sb_version_hasmorebits(sbp) &&
471 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
472}
473
474static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
475{
476 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
477 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
478 sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
479}
480
481/*
482 * Extended v5 superblock feature masks. These are to be used for new v5
483 * superblock features only.
484 *
485 * Compat features are new features that old kernels will not notice or affect
486 * and so can mount read-write without issues.
487 *
488 * RO-Compat (read only) are features that old kernels can read but will break
489 * if they write. Hence only read-only mounts of such filesystems are allowed on
490 * kernels that don't support the feature bit.
491 *
492 * InCompat features are features which old kernels will not understand and so
493 * must not mount.
494 *
495 * Log-InCompat features are for changes to log formats or new transactions that
496 * can't be replayed on older kernels. The fields are set when the filesystem is
497 * mounted, and a clean unmount clears the fields.
498 */
499#define XFS_SB_FEAT_COMPAT_ALL 0
500#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
501static inline bool
502xfs_sb_has_compat_feature(
503 struct xfs_sb *sbp,
504 __uint32_t feature)
505{
506 return (sbp->sb_features_compat & feature) != 0;
507}
508
509#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
510#define XFS_SB_FEAT_RO_COMPAT_ALL \
511 (XFS_SB_FEAT_RO_COMPAT_FINOBT)
512#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
513static inline bool
514xfs_sb_has_ro_compat_feature(
515 struct xfs_sb *sbp,
516 __uint32_t feature)
517{
518 return (sbp->sb_features_ro_compat & feature) != 0;
519}
520
521#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
522#define XFS_SB_FEAT_INCOMPAT_ALL \
523 (XFS_SB_FEAT_INCOMPAT_FTYPE)
524
525#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
526static inline bool
527xfs_sb_has_incompat_feature(
528 struct xfs_sb *sbp,
529 __uint32_t feature)
530{
531 return (sbp->sb_features_incompat & feature) != 0;
532}
533
534#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
535#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
536static inline bool
537xfs_sb_has_incompat_log_feature(
538 struct xfs_sb *sbp,
539 __uint32_t feature)
540{
541 return (sbp->sb_features_log_incompat & feature) != 0;
542}
543
544/*
545 * V5 superblock specific feature checks
546 */
547static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
548{
549 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
550}
551
552static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
553{
554 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
555}
556
557static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
558{
559 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
560 xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
561 (xfs_sb_version_hasmorebits(sbp) &&
562 (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
563}
564
565static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
566{
567 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
568 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
569}
570
571/*
572 * end of superblock version macros
573 */
574
575static inline bool
576xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
577{
578 return (ino == sbp->sb_uquotino ||
579 ino == sbp->sb_gquotino ||
580 ino == sbp->sb_pquotino);
581}
582
583#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
584#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
585#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
586
587#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
588#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
589 xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
590#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
591 XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
592
593/*
594 * File system sector to basic block conversions.
595 */
596#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
597
598/*
599 * File system block to basic block conversions.
600 */
601#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
602#define XFS_BB_TO_FSB(mp,bb) \
603 (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
604#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
605
606/*
607 * File system block to byte conversions.
608 */
609#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
610#define XFS_B_TO_FSB(mp,b) \
611 ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
612#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
613#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
614
615/*
616 * Allocation group header
617 *
618 * This is divided into three structures, placed in sequential 512-byte
619 * buffers after a copy of the superblock (also in a 512-byte buffer).
620 */
621#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
622#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
623#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */
624#define XFS_AGF_VERSION 1
625#define XFS_AGI_VERSION 1
626
627#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
628#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
629
630/*
631 * Btree number 0 is bno, 1 is cnt. This value gives the size of the
632 * arrays below.
633 */
634#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
635
636/*
637 * The second word of agf_levels in the first a.g. overlaps the EFS
638 * superblock's magic number. Since the magic numbers valid for EFS
639 * are > 64k, our value cannot be confused for an EFS superblock's.
640 */
641
642typedef struct xfs_agf {
643 /*
644 * Common allocation group header information
645 */
646 __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */
647 __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */
648 __be32 agf_seqno; /* sequence # starting from 0 */
649 __be32 agf_length; /* size in blocks of a.g. */
650 /*
651 * Freespace information
652 */
653 __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
654 __be32 agf_spare0; /* spare field */
655 __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
656 __be32 agf_spare1; /* spare field */
657
658 __be32 agf_flfirst; /* first freelist block's index */
659 __be32 agf_fllast; /* last freelist block's index */
660 __be32 agf_flcount; /* count of blocks in freelist */
661 __be32 agf_freeblks; /* total free blocks */
662
663 __be32 agf_longest; /* longest free space */
664 __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
665 uuid_t agf_uuid; /* uuid of filesystem */
666
667 /*
668 * reserve some contiguous space for future logged fields before we add
669 * the unlogged fields. This makes the range logging via flags and
670 * structure offsets much simpler.
671 */
672 __be64 agf_spare64[16];
673
674 /* unlogged fields, written during buffer writeback. */
675 __be64 agf_lsn; /* last write sequence */
676 __be32 agf_crc; /* crc of agf sector */
677 __be32 agf_spare2;
678
679 /* structure must be padded to 64 bit alignment */
680} xfs_agf_t;
681
682#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
683
684#define XFS_AGF_MAGICNUM 0x00000001
685#define XFS_AGF_VERSIONNUM 0x00000002
686#define XFS_AGF_SEQNO 0x00000004
687#define XFS_AGF_LENGTH 0x00000008
688#define XFS_AGF_ROOTS 0x00000010
689#define XFS_AGF_LEVELS 0x00000020
690#define XFS_AGF_FLFIRST 0x00000040
691#define XFS_AGF_FLLAST 0x00000080
692#define XFS_AGF_FLCOUNT 0x00000100
693#define XFS_AGF_FREEBLKS 0x00000200
694#define XFS_AGF_LONGEST 0x00000400
695#define XFS_AGF_BTREEBLKS 0x00000800
696#define XFS_AGF_UUID 0x00001000
697#define XFS_AGF_NUM_BITS 13
698#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
699
700#define XFS_AGF_FLAGS \
701 { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
702 { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
703 { XFS_AGF_SEQNO, "SEQNO" }, \
704 { XFS_AGF_LENGTH, "LENGTH" }, \
705 { XFS_AGF_ROOTS, "ROOTS" }, \
706 { XFS_AGF_LEVELS, "LEVELS" }, \
707 { XFS_AGF_FLFIRST, "FLFIRST" }, \
708 { XFS_AGF_FLLAST, "FLLAST" }, \
709 { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
710 { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
711 { XFS_AGF_LONGEST, "LONGEST" }, \
712 { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
713 { XFS_AGF_UUID, "UUID" }
714
715/* disk block (xfs_daddr_t) in the AG */
716#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
717#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
718#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
719
720/*
721 * Size of the unlinked inode hash table in the agi.
722 */
723#define XFS_AGI_UNLINKED_BUCKETS 64
724
725typedef struct xfs_agi {
726 /*
727 * Common allocation group header information
728 */
729 __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */
730 __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */
731 __be32 agi_seqno; /* sequence # starting from 0 */
732 __be32 agi_length; /* size in blocks of a.g. */
733 /*
734 * Inode information
735 * Inodes are mapped by interpreting the inode number, so no
736 * mapping data is needed here.
737 */
738 __be32 agi_count; /* count of allocated inodes */
739 __be32 agi_root; /* root of inode btree */
740 __be32 agi_level; /* levels in inode btree */
741 __be32 agi_freecount; /* number of free inodes */
742
743 __be32 agi_newino; /* new inode just allocated */
744 __be32 agi_dirino; /* last directory inode chunk */
745 /*
746 * Hash table of inodes which have been unlinked but are
747 * still being referenced.
748 */
749 __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
750 /*
751 * This marks the end of logging region 1 and start of logging region 2.
752 */
753 uuid_t agi_uuid; /* uuid of filesystem */
754 __be32 agi_crc; /* crc of agi sector */
755 __be32 agi_pad32;
756 __be64 agi_lsn; /* last write sequence */
757
758 __be32 agi_free_root; /* root of the free inode btree */
759 __be32 agi_free_level;/* levels in free inode btree */
760
761 /* structure must be padded to 64 bit alignment */
762} xfs_agi_t;
763
764#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
765
766#define XFS_AGI_MAGICNUM (1 << 0)
767#define XFS_AGI_VERSIONNUM (1 << 1)
768#define XFS_AGI_SEQNO (1 << 2)
769#define XFS_AGI_LENGTH (1 << 3)
770#define XFS_AGI_COUNT (1 << 4)
771#define XFS_AGI_ROOT (1 << 5)
772#define XFS_AGI_LEVEL (1 << 6)
773#define XFS_AGI_FREECOUNT (1 << 7)
774#define XFS_AGI_NEWINO (1 << 8)
775#define XFS_AGI_DIRINO (1 << 9)
776#define XFS_AGI_UNLINKED (1 << 10)
777#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
778#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
779#define XFS_AGI_FREE_ROOT (1 << 11)
780#define XFS_AGI_FREE_LEVEL (1 << 12)
781#define XFS_AGI_NUM_BITS_R2 13
782
783/* disk block (xfs_daddr_t) in the AG */
784#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
785#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
786#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
787
788/*
789 * The third a.g. block contains the a.g. freelist, an array
790 * of block pointers to blocks owned by the allocation btree code.
791 */
792#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
793#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
794#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
795
796#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
797 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
798 &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
799 (__be32 *)(bp)->b_addr)
800
801/*
802 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of
803 * slots in the beginning of the block for a proper header with the
804 * location information and CRC.
805 */
806#define XFS_AGFL_SIZE(mp) \
807 (((mp)->m_sb.sb_sectsize - \
808 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
809 sizeof(struct xfs_agfl) : 0)) / \
810 sizeof(xfs_agblock_t))
811
812typedef struct xfs_agfl {
813 __be32 agfl_magicnum;
814 __be32 agfl_seqno;
815 uuid_t agfl_uuid;
816 __be64 agfl_lsn;
817 __be32 agfl_crc;
818 __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
819} xfs_agfl_t;
820
821#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
822
823
824#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
825#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
826 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
827#define XFS_MIN_FREELIST(a,mp) \
828 (XFS_MIN_FREELIST_RAW( \
829 be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
830 be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
831#define XFS_MIN_FREELIST_PAG(pag,mp) \
832 (XFS_MIN_FREELIST_RAW( \
833 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
834 (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
835
836#define XFS_AGB_TO_FSB(mp,agno,agbno) \
837 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
838#define XFS_FSB_TO_AGNO(mp,fsbno) \
839 ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
840#define XFS_FSB_TO_AGBNO(mp,fsbno) \
841 ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
842#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
843 ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
844 (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
845#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
846
847/*
848 * For checking for bad ranges of xfs_daddr_t's, covering multiple
849 * allocation groups or a single xfs_daddr_t that's a superblock copy.
850 */
851#define XFS_AG_CHECK_DADDR(mp,d,len) \
852 ((len) == 1 ? \
853 ASSERT((d) == XFS_SB_DADDR || \
854 xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
855 ASSERT(xfs_daddr_to_agno(mp, d) == \
856 xfs_daddr_to_agno(mp, (d) + (len) - 1)))
857
858typedef struct xfs_timestamp {
859 __be32 t_sec; /* timestamp seconds */
860 __be32 t_nsec; /* timestamp nanoseconds */
861} xfs_timestamp_t;
862
/*
 * On-disk inode structure.
 *
 * This is just the header or "dinode core", the inode is expanded to fill a
 * variable size the leftover area split into a data and an attribute fork.
 * The format of the data and attribute fork depends on the format of the
 * inode as indicated by di_format and di_aformat.  To access the data and
 * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
 * below.
 *
 * There is a very similar struct icdinode in xfs_inode which matches the
 * layout of the first 96 bytes of this structure, but is kept in native
 * format instead of big endian.
 *
 * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
 * padding field for v3 inodes.
 */
#define	XFS_DINODE_MAGIC		0x494e	/* 'IN' */
#define XFS_DINODE_GOOD_VERSION(v)	((v) >= 1 && (v) <= 3)
typedef struct xfs_dinode {
	__be16		di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
	__be16		di_mode;	/* mode and type of file */
	__u8		di_version;	/* inode version */
	__u8		di_format;	/* format of di_c data */
	__be16		di_onlink;	/* old number of links to file */
	__be32		di_uid;		/* owner's user id */
	__be32		di_gid;		/* owner's group id */
	__be32		di_nlink;	/* number of links to file */
	__be16		di_projid_lo;	/* lower part of owner's project id */
	__be16		di_projid_hi;	/* higher part owner's project id */
	__u8		di_pad[6];	/* unused, zeroed space */
	__be16		di_flushiter;	/* incremented on flush (v1/v2 only) */
	xfs_timestamp_t	di_atime;	/* time last accessed */
	xfs_timestamp_t	di_mtime;	/* time last modified */
	xfs_timestamp_t	di_ctime;	/* time created/inode modified */
	__be64		di_size;	/* number of bytes in file */
	__be64		di_nblocks;	/* # of direct & btree blocks used */
	__be32		di_extsize;	/* basic/minimum extent size for file */
	__be32		di_nextents;	/* number of extents in data fork */
	__be16		di_anextents;	/* number of extents in attribute fork*/
	__u8		di_forkoff;	/* attr fork offs, <<3 for 64b align */
	__s8		di_aformat;	/* format of attr fork's data */
	__be32		di_dmevmask;	/* DMIG event mask */
	__be16		di_dmstate;	/* DMIG state info */
	__be16		di_flags;	/* random flags, XFS_DIFLAG_... */
	__be32		di_gen;		/* generation number */

	/* di_next_unlinked is the only non-core field in the old dinode */
	__be32		di_next_unlinked;/* agi unlinked list ptr */

	/*
	 * Start of the extended dinode - everything from di_crc onwards is
	 * present on version 3 inodes only (see xfs_dinode_size()).
	 * These are the writable fields.
	 */
	__le32		di_crc;		/* CRC of the inode */
	__be64		di_changecount;	/* number of attribute changes */
	__be64		di_lsn;		/* flush sequence */
	__be64		di_flags2;	/* more random flags */
	__u8		di_pad2[16];	/* more padding for future expansion */

	/* fields only written to during inode creation */
	xfs_timestamp_t	di_crtime;	/* time created */
	__be64		di_ino;		/* inode number */
	uuid_t		di_uuid;	/* UUID of the filesystem */

	/* structure must be padded to 64 bit alignment */
} xfs_dinode_t;
927
/* Byte offset of the CRC field within the on-disk inode */
#define XFS_DINODE_CRC_OFF	offsetof(struct xfs_dinode, di_crc)

/* Maximum value of the 16 bit di_flushiter counter before it wraps */
#define DI_MAX_FLUSH 0xffff
931
932/*
933 * Size of the core inode on disk. Version 1 and 2 inodes have
934 * the same size, but version 3 has grown a few additional fields.
935 */
936static inline uint xfs_dinode_size(int version)
937{
938 if (version == 3)
939 return sizeof(struct xfs_dinode);
940 return offsetof(struct xfs_dinode, di_crc);
941}
942
/*
 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
 * Since the pathconf interface is signed, we use 2^31 - 1 instead.
 * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
 */
#define	XFS_MAXLINK		((1U << 31) - 1U)
#define	XFS_MAXLINK_1		65535U	/* v1 inodes: 16 bit di_onlink */
950
/*
 * Values for di_format.
 * These are on-disk format values stored in di_format/di_aformat, so
 * existing entries must not be renumbered.
 */
typedef enum xfs_dinode_fmt {
	XFS_DINODE_FMT_DEV,		/* xfs_dev_t */
	XFS_DINODE_FMT_LOCAL,		/* bulk data */
	XFS_DINODE_FMT_EXTENTS,		/* struct xfs_bmbt_rec */
	XFS_DINODE_FMT_BTREE,		/* struct xfs_bmdr_block */
	XFS_DINODE_FMT_UUID		/* uuid_t */
} xfs_dinode_fmt_t;
961
/*
 * Inode minimum and maximum sizes (log2 of bytes, and bytes).
 */
#define	XFS_DINODE_MIN_LOG	8
#define	XFS_DINODE_MAX_LOG	11
#define	XFS_DINODE_MIN_SIZE	(1 << XFS_DINODE_MIN_LOG)	/* 256 bytes */
#define	XFS_DINODE_MAX_SIZE	(1 << XFS_DINODE_MAX_LOG)	/* 2048 bytes */

/*
 * Inode size for given fs: the bytes remaining in the inode after the
 * dinode core, shared between the data and attribute forks.
 */
#define	XFS_LITINO(mp, version) \
	((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
975
/*
 * Inode data & attribute fork sizes, per inode.
 */
/* Does the inode have an attribute fork? */
#define XFS_DFORK_Q(dip)		((dip)->di_forkoff != 0)
/* Byte offset of the attribute fork within the fork area (<<3 for 64 bit) */
#define XFS_DFORK_BOFF(dip)		((int)((dip)->di_forkoff << 3))

/* Data fork size: up to the attr fork if present, else the whole area */
#define XFS_DFORK_DSIZE(dip,mp) \
	(XFS_DFORK_Q(dip) ? \
		XFS_DFORK_BOFF(dip) : \
		XFS_LITINO(mp, (dip)->di_version))
/* Attr fork size: what remains past the attr fork offset, 0 if absent */
#define XFS_DFORK_ASIZE(dip,mp) \
	(XFS_DFORK_Q(dip) ? \
		XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
		0)
/* Size of the fork selected by w (XFS_DATA_FORK or the attr fork) */
#define XFS_DFORK_SIZE(dip,mp,w) \
	((w) == XFS_DATA_FORK ? \
		XFS_DFORK_DSIZE(dip, mp) : \
		XFS_DFORK_ASIZE(dip, mp))
994
/*
 * Return pointers to the data or attribute forks.
 *
 * Note: the macro argument is fully parenthesised.  The previous
 * definition expanded (dip) bare as "(char *)dip + ...", which misparses
 * for any non-trivial pointer expression passed as the argument.
 */
#define XFS_DFORK_DPTR(dip) \
	((char *)(dip) + xfs_dinode_size((dip)->di_version))
#define XFS_DFORK_APTR(dip)	\
	(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
#define XFS_DFORK_PTR(dip,w)	\
	((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
1004
/* Format (di_format or di_aformat) of the fork selected by w */
#define XFS_DFORK_FORMAT(dip,w) \
	((w) == XFS_DATA_FORK ? \
		(dip)->di_format : \
		(dip)->di_aformat)
/* Extent count of the selected fork, converted from big endian */
#define XFS_DFORK_NEXTENTS(dip,w) \
	((w) == XFS_DATA_FORK ? \
		be32_to_cpu((dip)->di_nextents) : \
		be16_to_cpu((dip)->di_anextents))
1013
1014/*
1015 * For block and character special files the 32bit dev_t is stored at the
1016 * beginning of the data fork.
1017 */
1018static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
1019{
1020 return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
1021}
1022
1023static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
1024{
1025 *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
1026}
1027
/*
 * Values for di_flags.
 * There should be a one-to-one correspondence between these flags and the
 * XFS_XFLAG_s.  All of the bit values fit in the 16 bit on-disk di_flags
 * field.
 */
#define XFS_DIFLAG_REALTIME_BIT  0	/* file's blocks come from rt area */
#define XFS_DIFLAG_PREALLOC_BIT  1	/* file space has been preallocated */
#define XFS_DIFLAG_NEWRTBM_BIT   2	/* for rtbitmap inode, new format */
#define XFS_DIFLAG_IMMUTABLE_BIT 3	/* inode is immutable */
#define XFS_DIFLAG_APPEND_BIT    4	/* inode is append-only */
#define XFS_DIFLAG_SYNC_BIT      5	/* inode is written synchronously */
#define XFS_DIFLAG_NOATIME_BIT   6	/* do not update atime */
#define XFS_DIFLAG_NODUMP_BIT    7	/* do not dump */
#define XFS_DIFLAG_RTINHERIT_BIT 8	/* create with realtime bit set */
#define XFS_DIFLAG_PROJINHERIT_BIT   9	/* create with parents projid */
#define XFS_DIFLAG_NOSYMLINKS_BIT   10	/* disallow symlink creation */
#define XFS_DIFLAG_EXTSIZE_BIT      11	/* inode extent size allocator hint */
#define XFS_DIFLAG_EXTSZINHERIT_BIT 12	/* inherit inode extent size */
#define XFS_DIFLAG_NODEFRAG_BIT     13	/* do not reorganize/defragment */
#define XFS_DIFLAG_FILESTREAM_BIT   14  /* use filestream allocator */
#define XFS_DIFLAG_REALTIME      (1 << XFS_DIFLAG_REALTIME_BIT)
#define XFS_DIFLAG_PREALLOC      (1 << XFS_DIFLAG_PREALLOC_BIT)
#define XFS_DIFLAG_NEWRTBM       (1 << XFS_DIFLAG_NEWRTBM_BIT)
#define XFS_DIFLAG_IMMUTABLE     (1 << XFS_DIFLAG_IMMUTABLE_BIT)
#define XFS_DIFLAG_APPEND        (1 << XFS_DIFLAG_APPEND_BIT)
#define XFS_DIFLAG_SYNC          (1 << XFS_DIFLAG_SYNC_BIT)
#define XFS_DIFLAG_NOATIME       (1 << XFS_DIFLAG_NOATIME_BIT)
#define XFS_DIFLAG_NODUMP        (1 << XFS_DIFLAG_NODUMP_BIT)
#define XFS_DIFLAG_RTINHERIT     (1 << XFS_DIFLAG_RTINHERIT_BIT)
#define XFS_DIFLAG_PROJINHERIT   (1 << XFS_DIFLAG_PROJINHERIT_BIT)
#define XFS_DIFLAG_NOSYMLINKS    (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
#define XFS_DIFLAG_EXTSIZE       (1 << XFS_DIFLAG_EXTSIZE_BIT)
#define XFS_DIFLAG_EXTSZINHERIT  (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
#define XFS_DIFLAG_NODEFRAG      (1 << XFS_DIFLAG_NODEFRAG_BIT)
#define XFS_DIFLAG_FILESTREAM    (1 << XFS_DIFLAG_FILESTREAM_BIT)

/* Mask of every defined di_flags bit */
#define XFS_DIFLAG_ANY \
	(XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
	 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
	 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
	 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
	 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
1070
/*
 * Inode number format:
 * low inopblog bits - offset in block
 * next agblklog bits - block number in ag
 * next agno_log bits - ag number
 * high agno_log-agblklog-inopblog bits - 0
 */
/* Mask with the low-order k bits set */
#define	XFS_INO_MASK(k)			(__uint32_t)((1ULL << (k)) - 1)
#define	XFS_INO_OFFSET_BITS(mp)		(mp)->m_sb.sb_inopblog
#define	XFS_INO_AGBNO_BITS(mp)		(mp)->m_sb.sb_agblklog
#define	XFS_INO_AGINO_BITS(mp)		(mp)->m_agino_log
#define	XFS_INO_AGNO_BITS(mp)		(mp)->m_agno_log
#define	XFS_INO_BITS(mp)		\
	XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
#define	XFS_INO_TO_AGNO(mp,i)		\
	((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
#define	XFS_INO_TO_AGINO(mp,i)		\
	((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
#define	XFS_INO_TO_AGBNO(mp,i)		\
	(((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
		XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
#define	XFS_INO_TO_OFFSET(mp,i)		\
	((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
#define	XFS_INO_TO_FSB(mp,i)		\
	XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
/* Combine an AG number and AG-relative inode number into a global inode # */
#define	XFS_AGINO_TO_INO(mp,a,i)	\
	(((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
#define	XFS_AGINO_TO_AGBNO(mp,i)	((i) >> XFS_INO_OFFSET_BITS(mp))
#define	XFS_AGINO_TO_OFFSET(mp,i)	\
	((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
#define	XFS_OFFBNO_TO_AGINO(mp,b,o)	\
	((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))

/* Largest representable inode numbers: 56 bits, and the 32 bit variant */
#define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
#define	XFS_MAXINUMBER_32	((xfs_ino_t)((1ULL << 32) - 1ULL))
1106
1107/*
37 * RealTime Device format definitions 1108 * RealTime Device format definitions
38 */ 1109 */
39 1110
@@ -413,4 +1484,40 @@ struct xfs_btree_block {
413#define XFS_BTREE_LBLOCK_CRC_OFF \ 1484#define XFS_BTREE_LBLOCK_CRC_OFF \
414 offsetof(struct xfs_btree_block, bb_u.l.bb_crc) 1485 offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
415 1486
/*
 * On-disk XFS access control list structure: one ACL entry.
 * All fields are big-endian; the entry is explicitly padded to 12 bytes.
 */
struct xfs_acl_entry {
	__be32	ae_tag;
	__be32	ae_id;
	__be16	ae_perm;
	__be16	ae_pad;		/* fill the implicit hole in the structure */
};
1496
1497struct xfs_acl {
1498 __be32 acl_cnt;
1499 struct xfs_acl_entry acl_entry[0];
1500};
1501
/*
 * The number of ACL entries allowed is defined by the on-disk format.
 * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
 * limited only by the maximum size of the xattr that stores the information.
 *
 * (mp) is parenthesised so the macro expands correctly for any
 * mount-pointer expression, not just a plain identifier.
 */
#define XFS_ACL_MAX_ENTRIES(mp)	\
	(xfs_sb_version_hascrc(&(mp)->m_sb) \
		?  (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
					sizeof(struct xfs_acl_entry) \
		: 25)

#define XFS_ACL_MAX_SIZE(mp) \
	(sizeof(struct xfs_acl) + \
		sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
1516
/*
 * On-disk XFS extended attribute names.
 *
 * The *_SIZE macros must take sizeof the string literals directly:
 * the previous definitions applied sizeof to the cast SGI_ACL_* macros,
 * which yields sizeof(unsigned char *) - 1 — a pointer size (7 on LP64,
 * 3 on ILP32), not the attribute name length (12 and 15).
 */
#define SGI_ACL_FILE		(unsigned char *)"SGI_ACL_FILE"
#define SGI_ACL_DEFAULT		(unsigned char *)"SGI_ACL_DEFAULT"
#define SGI_ACL_FILE_SIZE	(sizeof("SGI_ACL_FILE") - 1)
#define SGI_ACL_DEFAULT_SIZE	(sizeof("SGI_ACL_DEFAULT") - 1)
1522
416#endif /* __XFS_FORMAT_H__ */ 1523#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 23dcb72fc5e6..116ef1ddb3e3 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -22,9 +22,7 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_inode.h" 27#include "xfs_inode.h"
30#include "xfs_btree.h" 28#include "xfs_btree.h"
@@ -39,7 +37,6 @@
39#include "xfs_buf_item.h" 37#include "xfs_buf_item.h"
40#include "xfs_icreate_item.h" 38#include "xfs_icreate_item.h"
41#include "xfs_icache.h" 39#include "xfs_icache.h"
42#include "xfs_dinode.h"
43#include "xfs_trace.h" 40#include "xfs_trace.h"
44 41
45 42
@@ -48,12 +45,12 @@
48 */ 45 */
49static inline int 46static inline int
50xfs_ialloc_cluster_alignment( 47xfs_ialloc_cluster_alignment(
51 xfs_alloc_arg_t *args) 48 struct xfs_mount *mp)
52{ 49{
53 if (xfs_sb_version_hasalign(&args->mp->m_sb) && 50 if (xfs_sb_version_hasalign(&mp->m_sb) &&
54 args->mp->m_sb.sb_inoalignmt >= 51 mp->m_sb.sb_inoalignmt >=
55 XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size)) 52 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
56 return args->mp->m_sb.sb_inoalignmt; 53 return mp->m_sb.sb_inoalignmt;
57 return 1; 54 return 1;
58} 55}
59 56
@@ -412,7 +409,7 @@ xfs_ialloc_ag_alloc(
412 * but not to use them in the actual exact allocation. 409 * but not to use them in the actual exact allocation.
413 */ 410 */
414 args.alignment = 1; 411 args.alignment = 1;
415 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; 412 args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1;
416 413
417 /* Allow space for the inode btree to split. */ 414 /* Allow space for the inode btree to split. */
418 args.minleft = args.mp->m_in_maxlevels - 1; 415 args.minleft = args.mp->m_in_maxlevels - 1;
@@ -448,7 +445,7 @@ xfs_ialloc_ag_alloc(
448 args.alignment = args.mp->m_dalign; 445 args.alignment = args.mp->m_dalign;
449 isaligned = 1; 446 isaligned = 1;
450 } else 447 } else
451 args.alignment = xfs_ialloc_cluster_alignment(&args); 448 args.alignment = xfs_ialloc_cluster_alignment(args.mp);
452 /* 449 /*
453 * Need to figure out where to allocate the inode blocks. 450 * Need to figure out where to allocate the inode blocks.
454 * Ideally they should be spaced out through the a.g. 451 * Ideally they should be spaced out through the a.g.
@@ -477,7 +474,7 @@ xfs_ialloc_ag_alloc(
477 args.type = XFS_ALLOCTYPE_NEAR_BNO; 474 args.type = XFS_ALLOCTYPE_NEAR_BNO;
478 args.agbno = be32_to_cpu(agi->agi_root); 475 args.agbno = be32_to_cpu(agi->agi_root);
479 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); 476 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
480 args.alignment = xfs_ialloc_cluster_alignment(&args); 477 args.alignment = xfs_ialloc_cluster_alignment(args.mp);
481 if ((error = xfs_alloc_vextent(&args))) 478 if ((error = xfs_alloc_vextent(&args)))
482 return error; 479 return error;
483 } 480 }
@@ -632,10 +629,24 @@ xfs_ialloc_ag_select(
632 } 629 }
633 630
634 /* 631 /*
635 * Is there enough free space for the file plus a block of 632 * Check that there is enough free space for the file plus a
636 * inodes? (if we need to allocate some)? 633 * chunk of inodes if we need to allocate some. If this is the
634 * first pass across the AGs, take into account the potential
635 * space needed for alignment of inode chunks when checking the
636 * longest contiguous free space in the AG - this prevents us
637 * from getting ENOSPC because we have free space larger than
638 * m_ialloc_blks but alignment constraints prevent us from using
639 * it.
640 *
641 * If we can't find an AG with space for full alignment slack to
642 * be taken into account, we must be near ENOSPC in all AGs.
643 * Hence we don't include alignment for the second pass and so
644 * if we fail allocation due to alignment issues then it is most
645 * likely a real ENOSPC condition.
637 */ 646 */
638 ineed = mp->m_ialloc_blks; 647 ineed = mp->m_ialloc_blks;
648 if (flags && ineed > 1)
649 ineed += xfs_ialloc_cluster_alignment(mp);
639 longest = pag->pagf_longest; 650 longest = pag->pagf_longest;
640 if (!longest) 651 if (!longest)
641 longest = pag->pagf_flcount > 0; 652 longest = pag->pagf_flcount > 0;
@@ -1137,11 +1148,7 @@ xfs_dialloc_ag_update_inobt(
1137 XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && 1148 XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
1138 (rec.ir_freecount == frec->ir_freecount)); 1149 (rec.ir_freecount == frec->ir_freecount));
1139 1150
1140 error = xfs_inobt_update(cur, &rec); 1151 return xfs_inobt_update(cur, &rec);
1141 if (error)
1142 return error;
1143
1144 return 0;
1145} 1152}
1146 1153
1147/* 1154/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 95ad1c002d60..100007d56449 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -160,4 +160,8 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
160 xfs_agnumber_t agno, xfs_agblock_t agbno, 160 xfs_agnumber_t agno, xfs_agblock_t agbno,
161 xfs_agblock_t length, unsigned int gen); 161 xfs_agblock_t length, unsigned int gen);
162 162
163int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
164 xfs_agnumber_t agno, struct xfs_buf **bpp);
165
166
163#endif /* __XFS_IALLOC_H__ */ 167#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index c9b06f30fe86..964c465ca69c 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_btree.h" 27#include "xfs_btree.h"
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index f18fd2da49f7..002b6b3a1988 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_error.h" 26#include "xfs_error.h"
@@ -30,7 +28,6 @@
30#include "xfs_icache.h" 28#include "xfs_icache.h"
31#include "xfs_trans.h" 29#include "xfs_trans.h"
32#include "xfs_ialloc.h" 30#include "xfs_ialloc.h"
33#include "xfs_dinode.h"
34 31
35/* 32/*
36 * Check that none of the inode's in the buffer have a next 33 * Check that none of the inode's in the buffer have a next
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 6a00f7fed69d..0defbd02f62d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -22,9 +22,6 @@
22#include "xfs_format.h" 22#include "xfs_format.h"
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 25#include "xfs_mount.h"
29#include "xfs_inode.h" 26#include "xfs_inode.h"
30#include "xfs_trans.h" 27#include "xfs_trans.h"
@@ -34,7 +31,6 @@
34#include "xfs_error.h" 31#include "xfs_error.h"
35#include "xfs_trace.h" 32#include "xfs_trace.h"
36#include "xfs_attr_sf.h" 33#include "xfs_attr_sf.h"
37#include "xfs_dinode.h"
38 34
39kmem_zone_t *xfs_ifork_zone; 35kmem_zone_t *xfs_ifork_zone;
40 36
diff --git a/fs/xfs/libxfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h
deleted file mode 100644
index 4ff2278e147a..000000000000
--- a/fs/xfs/libxfs/xfs_inum.h
+++ /dev/null
@@ -1,60 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_INUM_H__
19#define __XFS_INUM_H__
20
21/*
22 * Inode number format:
23 * low inopblog bits - offset in block
24 * next agblklog bits - block number in ag
25 * next agno_log bits - ag number
26 * high agno_log-agblklog-inopblog bits - 0
27 */
28
29struct xfs_mount;
30
31#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1)
32#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog
33#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog
34#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log
35#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log
36#define XFS_INO_BITS(mp) \
37 XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
38#define XFS_INO_TO_AGNO(mp,i) \
39 ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
40#define XFS_INO_TO_AGINO(mp,i) \
41 ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
42#define XFS_INO_TO_AGBNO(mp,i) \
43 (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
44 XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
45#define XFS_INO_TO_OFFSET(mp,i) \
46 ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
47#define XFS_INO_TO_FSB(mp,i) \
48 XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
49#define XFS_AGINO_TO_INO(mp,a,i) \
50 (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
51#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp))
52#define XFS_AGINO_TO_OFFSET(mp,i) \
53 ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
54#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
55 ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
56
57#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
58#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
59
60#endif /* __XFS_INUM_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index aff12f2d4428..265314690415 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -361,7 +361,7 @@ typedef struct xfs_ictimestamp {
361 361
362/* 362/*
363 * NOTE: This structure must be kept identical to struct xfs_dinode 363 * NOTE: This structure must be kept identical to struct xfs_dinode
364 * in xfs_dinode.h except for the endianness annotations. 364 * except for the endianness annotations.
365 */ 365 */
366typedef struct xfs_icdinode { 366typedef struct xfs_icdinode {
367 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 367 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index ee7e0e80246b..c10597973333 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_ag.h"
25#include "xfs_sb.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_trans_space.h" 26#include "xfs_trans_space.h"
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 7c818f1e4484..9b59ffa1fc19 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_bmap.h" 27#include "xfs_bmap.h"
@@ -36,7 +34,6 @@
36#include "xfs_trace.h" 34#include "xfs_trace.h"
37#include "xfs_buf.h" 35#include "xfs_buf.h"
38#include "xfs_icache.h" 36#include "xfs_icache.h"
39#include "xfs_dinode.h"
40#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
41 38
42 39
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5f902fa7913f..752915fa775a 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -23,7 +23,6 @@
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_inode.h" 27#include "xfs_inode.h"
29#include "xfs_ialloc.h" 28#include "xfs_ialloc.h"
@@ -33,7 +32,6 @@
33#include "xfs_cksum.h" 32#include "xfs_cksum.h"
34#include "xfs_trans.h" 33#include "xfs_trans.h"
35#include "xfs_buf_item.h" 34#include "xfs_buf_item.h"
36#include "xfs_dinode.h"
37#include "xfs_bmap_btree.h" 35#include "xfs_bmap_btree.h"
38#include "xfs_alloc_btree.h" 36#include "xfs_alloc_btree.h"
39#include "xfs_ialloc_btree.h" 37#include "xfs_ialloc_btree.h"
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 2e739708afd3..8eb1c54bafbf 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -19,590 +19,6 @@
19#define __XFS_SB_H__ 19#define __XFS_SB_H__
20 20
21/* 21/*
22 * Super block
23 * Fits into a sector-sized buffer at address 0 of each allocation group.
24 * Only the first of these is ever updated except during growfs.
25 */
26
27struct xfs_buf;
28struct xfs_mount;
29struct xfs_trans;
30
31#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
32#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
33#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
34#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
35#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
36#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
37#define XFS_SB_VERSION_NUMBITS 0x000f
38#define XFS_SB_VERSION_ALLFBITS 0xfff0
39#define XFS_SB_VERSION_ATTRBIT 0x0010
40#define XFS_SB_VERSION_NLINKBIT 0x0020
41#define XFS_SB_VERSION_QUOTABIT 0x0040
42#define XFS_SB_VERSION_ALIGNBIT 0x0080
43#define XFS_SB_VERSION_DALIGNBIT 0x0100
44#define XFS_SB_VERSION_SHAREDBIT 0x0200
45#define XFS_SB_VERSION_LOGV2BIT 0x0400
46#define XFS_SB_VERSION_SECTORBIT 0x0800
47#define XFS_SB_VERSION_EXTFLGBIT 0x1000
48#define XFS_SB_VERSION_DIRV2BIT 0x2000
49#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
50#define XFS_SB_VERSION_MOREBITSBIT 0x8000
51
52/*
53 * Supported feature bit list is just all bits in the versionnum field because
54 * we've used them all up and understand them all. Except, of course, for the
55 * shared superblock bit, which nobody knows what it does and so is unsupported.
56 */
57#define XFS_SB_VERSION_OKBITS \
58 ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
59 ~XFS_SB_VERSION_SHAREDBIT)
60
61/*
62 * There are two words to hold XFS "feature" bits: the original
63 * word, sb_versionnum, and sb_features2. Whenever a bit is set in
64 * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
65 *
66 * These defines represent bits in sb_features2.
67 */
68#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
69#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
70#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
71#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
72#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
73#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
74#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
75#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
76
77#define XFS_SB_VERSION2_OKBITS \
78 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
79 XFS_SB_VERSION2_ATTR2BIT | \
80 XFS_SB_VERSION2_PROJID32BIT | \
81 XFS_SB_VERSION2_FTYPE)
82
83/*
84 * Superblock - in core version. Must match the ondisk version below.
85 * Must be padded to 64 bit alignment.
86 */
87typedef struct xfs_sb {
88 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
89 __uint32_t sb_blocksize; /* logical block size, bytes */
90 xfs_rfsblock_t sb_dblocks; /* number of data blocks */
91 xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
92 xfs_rtblock_t sb_rextents; /* number of realtime extents */
93 uuid_t sb_uuid; /* file system unique id */
94 xfs_fsblock_t sb_logstart; /* starting block of log if internal */
95 xfs_ino_t sb_rootino; /* root inode number */
96 xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
97 xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
98 xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
99 xfs_agblock_t sb_agblocks; /* size of an allocation group */
100 xfs_agnumber_t sb_agcount; /* number of allocation groups */
101 xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
102 xfs_extlen_t sb_logblocks; /* number of log blocks */
103 __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
104 __uint16_t sb_sectsize; /* volume sector size, bytes */
105 __uint16_t sb_inodesize; /* inode size, bytes */
106 __uint16_t sb_inopblock; /* inodes per block */
107 char sb_fname[12]; /* file system name */
108 __uint8_t sb_blocklog; /* log2 of sb_blocksize */
109 __uint8_t sb_sectlog; /* log2 of sb_sectsize */
110 __uint8_t sb_inodelog; /* log2 of sb_inodesize */
111 __uint8_t sb_inopblog; /* log2 of sb_inopblock */
112 __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
113 __uint8_t sb_rextslog; /* log2 of sb_rextents */
114 __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
115 __uint8_t sb_imax_pct; /* max % of fs for inode space */
116 /* statistics */
117 /*
118 * These fields must remain contiguous. If you really
119 * want to change their layout, make sure you fix the
120 * code in xfs_trans_apply_sb_deltas().
121 */
122 __uint64_t sb_icount; /* allocated inodes */
123 __uint64_t sb_ifree; /* free inodes */
124 __uint64_t sb_fdblocks; /* free data blocks */
125 __uint64_t sb_frextents; /* free realtime extents */
126 /*
127 * End contiguous fields.
128 */
129 xfs_ino_t sb_uquotino; /* user quota inode */
130 xfs_ino_t sb_gquotino; /* group quota inode */
131 __uint16_t sb_qflags; /* quota flags */
132 __uint8_t sb_flags; /* misc. flags */
133 __uint8_t sb_shared_vn; /* shared version number */
134 xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
135 __uint32_t sb_unit; /* stripe or raid unit */
136 __uint32_t sb_width; /* stripe or raid width */
137 __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
138 __uint8_t sb_logsectlog; /* log2 of the log sector size */
139 __uint16_t sb_logsectsize; /* sector size for the log, bytes */
140 __uint32_t sb_logsunit; /* stripe unit size for the log */
141 __uint32_t sb_features2; /* additional feature bits */
142
143 /*
144 * bad features2 field as a result of failing to pad the sb
145 * structure to 64 bits. Some machines will be using this field
146 * for features2 bits. Easiest just to mark it bad and not use
147 * it for anything else.
148 */
149 __uint32_t sb_bad_features2;
150
151 /* version 5 superblock fields start here */
152
153 /* feature masks */
154 __uint32_t sb_features_compat;
155 __uint32_t sb_features_ro_compat;
156 __uint32_t sb_features_incompat;
157 __uint32_t sb_features_log_incompat;
158
159 __uint32_t sb_crc; /* superblock crc */
160 __uint32_t sb_pad;
161
162 xfs_ino_t sb_pquotino; /* project quota inode */
163 xfs_lsn_t sb_lsn; /* last write sequence */
164
165 /* must be padded to 64 bit alignment */
166} xfs_sb_t;
167
168#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
169
170/*
171 * Superblock - on disk version. Must match the in core version above.
172 * Must be padded to 64 bit alignment.
173 */
174typedef struct xfs_dsb {
175 __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
176 __be32 sb_blocksize; /* logical block size, bytes */
177 __be64 sb_dblocks; /* number of data blocks */
178 __be64 sb_rblocks; /* number of realtime blocks */
179 __be64 sb_rextents; /* number of realtime extents */
180 uuid_t sb_uuid; /* file system unique id */
181 __be64 sb_logstart; /* starting block of log if internal */
182 __be64 sb_rootino; /* root inode number */
183 __be64 sb_rbmino; /* bitmap inode for realtime extents */
184 __be64 sb_rsumino; /* summary inode for rt bitmap */
185 __be32 sb_rextsize; /* realtime extent size, blocks */
186 __be32 sb_agblocks; /* size of an allocation group */
187 __be32 sb_agcount; /* number of allocation groups */
188 __be32 sb_rbmblocks; /* number of rt bitmap blocks */
189 __be32 sb_logblocks; /* number of log blocks */
190 __be16 sb_versionnum; /* header version == XFS_SB_VERSION */
191 __be16 sb_sectsize; /* volume sector size, bytes */
192 __be16 sb_inodesize; /* inode size, bytes */
193 __be16 sb_inopblock; /* inodes per block */
194 char sb_fname[12]; /* file system name */
195 __u8 sb_blocklog; /* log2 of sb_blocksize */
196 __u8 sb_sectlog; /* log2 of sb_sectsize */
197 __u8 sb_inodelog; /* log2 of sb_inodesize */
198 __u8 sb_inopblog; /* log2 of sb_inopblock */
199 __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */
200 __u8 sb_rextslog; /* log2 of sb_rextents */
201 __u8 sb_inprogress; /* mkfs is in progress, don't mount */
202 __u8 sb_imax_pct; /* max % of fs for inode space */
203 /* statistics */
204 /*
205 * These fields must remain contiguous. If you really
206 * want to change their layout, make sure you fix the
207 * code in xfs_trans_apply_sb_deltas().
208 */
209 __be64 sb_icount; /* allocated inodes */
210 __be64 sb_ifree; /* free inodes */
211 __be64 sb_fdblocks; /* free data blocks */
212 __be64 sb_frextents; /* free realtime extents */
213 /*
214 * End contiguous fields.
215 */
216 __be64 sb_uquotino; /* user quota inode */
217 __be64 sb_gquotino; /* group quota inode */
218 __be16 sb_qflags; /* quota flags */
219 __u8 sb_flags; /* misc. flags */
220 __u8 sb_shared_vn; /* shared version number */
221 __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */
222 __be32 sb_unit; /* stripe or raid unit */
223 __be32 sb_width; /* stripe or raid width */
224 __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */
225 __u8 sb_logsectlog; /* log2 of the log sector size */
226 __be16 sb_logsectsize; /* sector size for the log, bytes */
227 __be32 sb_logsunit; /* stripe unit size for the log */
228 __be32 sb_features2; /* additional feature bits */
229 /*
230 * bad features2 field as a result of failing to pad the sb
231 * structure to 64 bits. Some machines will be using this field
232 * for features2 bits. Easiest just to mark it bad and not use
233 * it for anything else.
234 */
235 __be32 sb_bad_features2;
236
237 /* version 5 superblock fields start here */
238
239 /* feature masks */
240 __be32 sb_features_compat;
241 __be32 sb_features_ro_compat;
242 __be32 sb_features_incompat;
243 __be32 sb_features_log_incompat;
244
245 __le32 sb_crc; /* superblock crc */
246 __be32 sb_pad;
247
248 __be64 sb_pquotino; /* project quota inode */
249 __be64 sb_lsn; /* last write sequence */
250
251 /* must be padded to 64 bit alignment */
252} xfs_dsb_t;
253
254/*
255 * Sequence number values for the fields.
256 */
257typedef enum {
258 XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
259 XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
260 XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
261 XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
262 XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
263 XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
264 XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
265 XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
266 XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
267 XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
268 XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
269 XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
270 XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
271 XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
272 XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
273 XFS_SBS_PQUOTINO, XFS_SBS_LSN,
274 XFS_SBS_FIELDCOUNT
275} xfs_sb_field_t;
276
277/*
278 * Mask values, defined based on the xfs_sb_field_t values.
279 * Only define the ones we're using.
280 */
281#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
282#define XFS_SB_UUID XFS_SB_MVAL(UUID)
283#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
284#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
285#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
286#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
287#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
288#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
289#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
290#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
291#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
292#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
293#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
294#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
295#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
296#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
297#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2)
298#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2)
299#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
300#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
301#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
302#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
303#define XFS_SB_CRC XFS_SB_MVAL(CRC)
304#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
305#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
306#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
307#define XFS_SB_MOD_BITS \
308 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
309 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
310 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
311 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
312 XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
313 XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
314 XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
315
316
317/*
318 * Misc. Flags - warning - these will be cleared by xfs_repair unless
319 * a feature bit is set when the flag is used.
320 */
321#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
322#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
323
324/*
325 * define max. shared version we can interoperate with
326 */
327#define XFS_SB_MAX_SHARED_VN 0
328
329#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
330
331/*
332 * The first XFS version we support is a v4 superblock with V2 directories.
333 */
334static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
335{
336 if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
337 return false;
338
339 /* check for unknown features in the fs */
340 if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
341 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
342 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
343 return false;
344
345 return true;
346}
347
348static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
349{
350 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
351 return true;
352 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
353 return xfs_sb_good_v4_features(sbp);
354 return false;
355}
356
357/*
358 * Detect a mismatched features2 field. Older kernels read/wrote
359 * this into the wrong slot, so to be safe we keep them in sync.
360 */
361static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
362{
363 return sbp->sb_bad_features2 != sbp->sb_features2;
364}
365
366static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
367{
368 return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
369}
370
371static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
372{
373 sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
374}
375
376static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
377{
378 return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
379}
380
381static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
382{
383 sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
384}
385
386static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
387{
388 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
389 (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
390}
391
392static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
393{
394 return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
395}
396
397static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
398{
399 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
400 (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
401}
402
403static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
404{
405 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
406 (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
407}
408
409static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
410{
411 return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
412}
413
414static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
415{
416 return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
417}
418
419static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
420{
421 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
422 (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
423}
424
425/*
426 * sb_features2 bit version macros.
427 */
428static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
429{
430 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
431 (xfs_sb_version_hasmorebits(sbp) &&
432 (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
433}
434
435static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
436{
437 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
438 (xfs_sb_version_hasmorebits(sbp) &&
439 (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
440}
441
442static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
443{
444 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
445 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
446 sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
447}
448
449static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
450{
451 sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
452 sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
453 if (!sbp->sb_features2)
454 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
455}
456
457static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
458{
459 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
460 (xfs_sb_version_hasmorebits(sbp) &&
461 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
462}
463
464static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
465{
466 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
467 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
468 sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
469}
470
471/*
472 * Extended v5 superblock feature masks. These are to be used for new v5
473 * superblock features only.
474 *
475 * Compat features are new features that old kernels will not notice or affect
476 * and so can mount read-write without issues.
477 *
478 * RO-Compat (read only) are features that old kernels can read but will break
479 * if they write. Hence only read-only mounts of such filesystems are allowed on
480 * kernels that don't support the feature bit.
481 *
482 * InCompat features are features which old kernels will not understand and so
483 * must not mount.
484 *
485 * Log-InCompat features are for changes to log formats or new transactions that
486 * can't be replayed on older kernels. The fields are set when the filesystem is
487 * mounted, and a clean unmount clears the fields.
488 */
489#define XFS_SB_FEAT_COMPAT_ALL 0
490#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
491static inline bool
492xfs_sb_has_compat_feature(
493 struct xfs_sb *sbp,
494 __uint32_t feature)
495{
496 return (sbp->sb_features_compat & feature) != 0;
497}
498
499#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
500#define XFS_SB_FEAT_RO_COMPAT_ALL \
501 (XFS_SB_FEAT_RO_COMPAT_FINOBT)
502#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
503static inline bool
504xfs_sb_has_ro_compat_feature(
505 struct xfs_sb *sbp,
506 __uint32_t feature)
507{
508 return (sbp->sb_features_ro_compat & feature) != 0;
509}
510
511#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
512#define XFS_SB_FEAT_INCOMPAT_ALL \
513 (XFS_SB_FEAT_INCOMPAT_FTYPE)
514
515#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
516static inline bool
517xfs_sb_has_incompat_feature(
518 struct xfs_sb *sbp,
519 __uint32_t feature)
520{
521 return (sbp->sb_features_incompat & feature) != 0;
522}
523
524#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
525#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
526static inline bool
527xfs_sb_has_incompat_log_feature(
528 struct xfs_sb *sbp,
529 __uint32_t feature)
530{
531 return (sbp->sb_features_log_incompat & feature) != 0;
532}
533
534/*
535 * V5 superblock specific feature checks
536 */
537static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
538{
539 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
540}
541
542static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
543{
544 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
545}
546
547static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
548{
549 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
550 xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
551 (xfs_sb_version_hasmorebits(sbp) &&
552 (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
553}
554
555static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
556{
557 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
558 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
559}
560
561/*
562 * end of superblock version macros
563 */
564
565static inline bool
566xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
567{
568 return (ino == sbp->sb_uquotino ||
569 ino == sbp->sb_gquotino ||
570 ino == sbp->sb_pquotino);
571}
572
573#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
574#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
575#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
576
577#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
578#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
579 xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
580#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
581 XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
582
583/*
584 * File system sector to basic block conversions.
585 */
586#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
587
588/*
589 * File system block to basic block conversions.
590 */
591#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
592#define XFS_BB_TO_FSB(mp,bb) \
593 (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
594#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
595
596/*
597 * File system block to byte conversions.
598 */
599#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
600#define XFS_B_TO_FSB(mp,b) \
601 ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
602#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
603#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
604
605/*
606 * perag get/put wrappers for ref counting 22 * perag get/put wrappers for ref counting
607 */ 23 */
608extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); 24extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 5782f037eab4..c80c5236c3da 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_shared.h" 23#include "xfs_shared.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_ag.h"
26#include "xfs_sb.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
29#include "xfs_inode.h" 27#include "xfs_inode.h"
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f2bda7c76b8a..6c1330f29050 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -22,8 +22,6 @@
22#include "xfs_format.h" 22#include "xfs_format.h"
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index a65fa5dde6e9..4b641676f258 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -19,8 +19,6 @@
19#include "xfs_format.h" 19#include "xfs_format.h"
20#include "xfs_log_format.h" 20#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 21#include "xfs_trans_resv.h"
22#include "xfs_ag.h"
23#include "xfs_sb.h"
24#include "xfs_mount.h" 22#include "xfs_mount.h"
25#include "xfs_inode.h" 23#include "xfs_inode.h"
26#include "xfs_acl.h" 24#include "xfs_acl.h"
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 5dc163744511..3841b07f27bf 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,42 +22,6 @@ struct inode;
22struct posix_acl; 22struct posix_acl;
23struct xfs_inode; 23struct xfs_inode;
24 24
25#define XFS_ACL_NOT_PRESENT (-1)
26
27/* On-disk XFS access control list structure */
28struct xfs_acl_entry {
29 __be32 ae_tag;
30 __be32 ae_id;
31 __be16 ae_perm;
32 __be16 ae_pad; /* fill the implicit hole in the structure */
33};
34
35struct xfs_acl {
36 __be32 acl_cnt;
37 struct xfs_acl_entry acl_entry[0];
38};
39
40/*
41 * The number of ACL entries allowed is defined by the on-disk format.
42 * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
43 * limited only by the maximum size of the xattr that stores the information.
44 */
45#define XFS_ACL_MAX_ENTRIES(mp) \
46 (xfs_sb_version_hascrc(&mp->m_sb) \
47 ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
48 sizeof(struct xfs_acl_entry) \
49 : 25)
50
51#define XFS_ACL_MAX_SIZE(mp) \
52 (sizeof(struct xfs_acl) + \
53 sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
54
55/* On-disk XFS extended attribute names */
56#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
57#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
58#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
59#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
60
61#ifdef CONFIG_XFS_POSIX_ACL 25#ifdef CONFIG_XFS_POSIX_ACL
62extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); 26extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
63extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); 27extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f5b2453a43b2..18e2f3bbae5e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_inode.h" 24#include "xfs_inode.h"
27#include "xfs_trans.h" 25#include "xfs_trans.h"
@@ -33,7 +31,6 @@
33#include "xfs_bmap.h" 31#include "xfs_bmap.h"
34#include "xfs_bmap_util.h" 32#include "xfs_bmap_util.h"
35#include "xfs_bmap_btree.h" 33#include "xfs_bmap_btree.h"
36#include "xfs_dinode.h"
37#include <linux/aio.h> 34#include <linux/aio.h>
38#include <linux/gfp.h> 35#include <linux/gfp.h>
39#include <linux/mpage.h> 36#include <linux/mpage.h>
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index aa2a8b1838a2..83af4c149635 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -23,8 +23,6 @@
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
@@ -39,7 +37,6 @@
39#include "xfs_error.h" 37#include "xfs_error.h"
40#include "xfs_quota.h" 38#include "xfs_quota.h"
41#include "xfs_trace.h" 39#include "xfs_trace.h"
42#include "xfs_dinode.h"
43#include "xfs_dir2.h" 40#include "xfs_dir2.h"
44 41
45/* 42/*
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 62db83ab6cbc..a43d370d2c58 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
@@ -39,7 +37,6 @@
39#include "xfs_trace.h" 37#include "xfs_trace.h"
40#include "xfs_buf_item.h" 38#include "xfs_buf_item.h"
41#include "xfs_cksum.h" 39#include "xfs_cksum.h"
42#include "xfs_dinode.h"
43#include "xfs_dir2.h" 40#include "xfs_dir2.h"
44 41
45STATIC int 42STATIC int
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 281002689d64..22a5dcb70b32 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -23,8 +23,6 @@
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_inode.h" 28#include "xfs_inode.h"
@@ -42,7 +40,6 @@
42#include "xfs_trace.h" 40#include "xfs_trace.h"
43#include "xfs_icache.h" 41#include "xfs_icache.h"
44#include "xfs_log.h" 42#include "xfs_log.h"
45#include "xfs_dinode.h"
46 43
47/* Kernel only BMAP related definitions and functions */ 44/* Kernel only BMAP related definitions and functions */
48 45
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 24b4ebea0d4d..bb502a391792 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -34,18 +34,16 @@
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36 36
37#include "xfs_format.h"
37#include "xfs_log_format.h" 38#include "xfs_log_format.h"
38#include "xfs_trans_resv.h" 39#include "xfs_trans_resv.h"
39#include "xfs_sb.h" 40#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_mount.h" 41#include "xfs_mount.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_log.h" 43#include "xfs_log.h"
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46 46
47static struct workqueue_struct *xfslogd_workqueue;
48
49#ifdef XFS_BUF_LOCK_TRACKING 47#ifdef XFS_BUF_LOCK_TRACKING
50# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) 48# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
51# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) 49# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
@@ -463,7 +461,7 @@ _xfs_buf_find(
463 * have to check that the buffer falls within the filesystem bounds. 461 * have to check that the buffer falls within the filesystem bounds.
464 */ 462 */
465 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 463 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
466 if (blkno >= eofs) { 464 if (blkno < 0 || blkno >= eofs) {
467 /* 465 /*
468 * XXX (dgc): we should really be returning -EFSCORRUPTED here, 466 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
469 * but none of the higher level infrastructure supports 467 * but none of the higher level infrastructure supports
@@ -1043,7 +1041,7 @@ xfs_buf_ioend_work(
1043 struct work_struct *work) 1041 struct work_struct *work)
1044{ 1042{
1045 struct xfs_buf *bp = 1043 struct xfs_buf *bp =
1046 container_of(work, xfs_buf_t, b_iodone_work); 1044 container_of(work, xfs_buf_t, b_ioend_work);
1047 1045
1048 xfs_buf_ioend(bp); 1046 xfs_buf_ioend(bp);
1049} 1047}
@@ -1052,8 +1050,8 @@ void
1052xfs_buf_ioend_async( 1050xfs_buf_ioend_async(
1053 struct xfs_buf *bp) 1051 struct xfs_buf *bp)
1054{ 1052{
1055 INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work); 1053 INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
1056 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1054 queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
1057} 1055}
1058 1056
1059void 1057void
@@ -1222,6 +1220,13 @@ _xfs_buf_ioapply(
1222 */ 1220 */
1223 bp->b_error = 0; 1221 bp->b_error = 0;
1224 1222
1223 /*
1224 * Initialize the I/O completion workqueue if we haven't yet or the
1225 * submitter has not opted to specify a custom one.
1226 */
1227 if (!bp->b_ioend_wq)
1228 bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
1229
1225 if (bp->b_flags & XBF_WRITE) { 1230 if (bp->b_flags & XBF_WRITE) {
1226 if (bp->b_flags & XBF_SYNCIO) 1231 if (bp->b_flags & XBF_SYNCIO)
1227 rw = WRITE_SYNC; 1232 rw = WRITE_SYNC;
@@ -1882,15 +1887,8 @@ xfs_buf_init(void)
1882 if (!xfs_buf_zone) 1887 if (!xfs_buf_zone)
1883 goto out; 1888 goto out;
1884 1889
1885 xfslogd_workqueue = alloc_workqueue("xfslogd",
1886 WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1);
1887 if (!xfslogd_workqueue)
1888 goto out_free_buf_zone;
1889
1890 return 0; 1890 return 0;
1891 1891
1892 out_free_buf_zone:
1893 kmem_zone_destroy(xfs_buf_zone);
1894 out: 1892 out:
1895 return -ENOMEM; 1893 return -ENOMEM;
1896} 1894}
@@ -1898,6 +1896,5 @@ xfs_buf_init(void)
1898void 1896void
1899xfs_buf_terminate(void) 1897xfs_buf_terminate(void)
1900{ 1898{
1901 destroy_workqueue(xfslogd_workqueue);
1902 kmem_zone_destroy(xfs_buf_zone); 1899 kmem_zone_destroy(xfs_buf_zone);
1903} 1900}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 82002c00af90..75ff5d5a7d2e 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -164,7 +164,8 @@ typedef struct xfs_buf {
164 struct xfs_perag *b_pag; /* contains rbtree root */ 164 struct xfs_perag *b_pag; /* contains rbtree root */
165 xfs_buftarg_t *b_target; /* buffer target (device) */ 165 xfs_buftarg_t *b_target; /* buffer target (device) */
166 void *b_addr; /* virtual address of buffer */ 166 void *b_addr; /* virtual address of buffer */
167 struct work_struct b_iodone_work; 167 struct work_struct b_ioend_work;
168 struct workqueue_struct *b_ioend_wq; /* I/O completion wq */
168 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 169 xfs_buf_iodone_t b_iodone; /* I/O completion function */
169 struct completion b_iowait; /* queue for I/O waiters */ 170 struct completion b_iowait; /* queue for I/O waiters */
170 void *b_fspriv; 171 void *b_fspriv;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f15969543326..3f9bd58edec7 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -17,11 +17,11 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log_format.h" 21#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
22#include "xfs_bit.h" 23#include "xfs_bit.h"
23#include "xfs_sb.h" 24#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_trans.h" 26#include "xfs_trans.h"
27#include "xfs_buf_item.h" 27#include "xfs_buf_item.h"
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index f1b69edcdf31..098cd78fe708 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
@@ -34,7 +32,6 @@
34#include "xfs_trace.h" 32#include "xfs_trace.h"
35#include "xfs_bmap.h" 33#include "xfs_bmap.h"
36#include "xfs_trans.h" 34#include "xfs_trans.h"
37#include "xfs_dinode.h"
38 35
39/* 36/*
40 * Directory file type support functions 37 * Directory file type support functions
@@ -44,7 +41,7 @@ static unsigned char xfs_dir3_filetype_table[] = {
44 DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, 41 DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
45}; 42};
46 43
47unsigned char 44static unsigned char
48xfs_dir3_get_dtype( 45xfs_dir3_get_dtype(
49 struct xfs_mount *mp, 46 struct xfs_mount *mp,
50 __uint8_t filetype) 47 __uint8_t filetype)
@@ -57,22 +54,6 @@ xfs_dir3_get_dtype(
57 54
58 return xfs_dir3_filetype_table[filetype]; 55 return xfs_dir3_filetype_table[filetype];
59} 56}
60/*
61 * @mode, if set, indicates that the type field needs to be set up.
62 * This uses the transformation from file mode to DT_* as defined in linux/fs.h
63 * for file type specification. This will be propagated into the directory
64 * structure if appropriate for the given operation and filesystem config.
65 */
66const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
67 [0] = XFS_DIR3_FT_UNKNOWN,
68 [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
69 [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
70 [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
71 [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
72 [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
73 [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
74 [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
75};
76 57
77STATIC int 58STATIC int
78xfs_dir2_sf_getdents( 59xfs_dir2_sf_getdents(
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 13d08a1b390e..799e5a2d334d 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -20,7 +20,6 @@
20#include "xfs_log_format.h" 20#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 21#include "xfs_trans_resv.h"
22#include "xfs_sb.h" 22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h" 23#include "xfs_mount.h"
25#include "xfs_quota.h" 24#include "xfs_quota.h"
26#include "xfs_inode.h" 25#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 63c2de49f61d..02c01bbbc789 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -22,8 +22,6 @@
22#include "xfs_shared.h" 22#include "xfs_shared.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_bmap.h" 27#include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index f33fbaaa4d8a..814cff94e78f 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_inode.h" 24#include "xfs_inode.h"
27#include "xfs_quota.h" 25#include "xfs_quota.h"
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index b92fd7bc49e3..3ee186ac1093 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -20,8 +20,6 @@
20#include "xfs_fs.h" 20#include "xfs_fs.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_error.h" 24#include "xfs_error.h"
27 25
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5a6bd5d8779a..5eb4a14e0a0f 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -19,10 +19,9 @@
19#include "xfs_format.h" 19#include "xfs_format.h"
20#include "xfs_log_format.h" 20#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 21#include "xfs_trans_resv.h"
22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h" 22#include "xfs_mount.h"
25#include "xfs_da_format.h" 23#include "xfs_da_format.h"
24#include "xfs_da_btree.h"
26#include "xfs_dir2.h" 25#include "xfs_dir2.h"
27#include "xfs_export.h" 26#include "xfs_export.h"
28#include "xfs_inode.h" 27#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index fd22f69049d4..c263e079273e 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -24,7 +24,6 @@
24#include "xfs_shared.h" 24#include "xfs_shared.h"
25#include "xfs_trans_resv.h" 25#include "xfs_trans_resv.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_alloc.h" 28#include "xfs_alloc.h"
30#include "xfs_extent_busy.h" 29#include "xfs_extent_busy.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index c4327419dc5c..cb7fe64cdbfa 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -17,10 +17,9 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log_format.h" 21#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h" 23#include "xfs_mount.h"
25#include "xfs_trans.h" 24#include "xfs_trans.h"
26#include "xfs_trans_priv.h" 25#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index eb596b419942..13e974e6a889 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 26#include "xfs_da_btree.h"
@@ -37,7 +35,6 @@
37#include "xfs_ioctl.h" 35#include "xfs_ioctl.h"
38#include "xfs_trace.h" 36#include "xfs_trace.h"
39#include "xfs_log.h" 37#include "xfs_log.h"
40#include "xfs_dinode.h"
41#include "xfs_icache.h" 38#include "xfs_icache.h"
42 39
43#include <linux/aio.h> 40#include <linux/aio.h>
@@ -933,7 +930,6 @@ xfs_file_readdir(
933{ 930{
934 struct inode *inode = file_inode(file); 931 struct inode *inode = file_inode(file);
935 xfs_inode_t *ip = XFS_I(inode); 932 xfs_inode_t *ip = XFS_I(inode);
936 int error;
937 size_t bufsize; 933 size_t bufsize;
938 934
939 /* 935 /*
@@ -950,10 +946,7 @@ xfs_file_readdir(
950 */ 946 */
951 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); 947 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
952 948
953 error = xfs_readdir(ip, ctx, bufsize); 949 return xfs_readdir(ip, ctx, bufsize);
954 if (error)
955 return error;
956 return 0;
957} 950}
958 951
959STATIC int 952STATIC int
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index e92730c1d3ca..a2e86e8a0fea 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -20,16 +20,13 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_ag.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_mount.h" 24#include "xfs_mount.h"
26#include "xfs_inum.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_bmap.h" 26#include "xfs_bmap.h"
29#include "xfs_bmap_util.h" 27#include "xfs_bmap_util.h"
30#include "xfs_alloc.h" 28#include "xfs_alloc.h"
31#include "xfs_mru_cache.h" 29#include "xfs_mru_cache.h"
32#include "xfs_dinode.h"
33#include "xfs_filestream.h" 30#include "xfs_filestream.h"
34#include "xfs_trace.h" 31#include "xfs_trace.h"
35 32
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c05ac8b70fa9..fdc64220fcb0 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -22,7 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 25#include "xfs_mount.h"
27#include "xfs_da_format.h" 26#include "xfs_da_format.h"
28#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
@@ -40,7 +39,6 @@
40#include "xfs_rtalloc.h" 39#include "xfs_rtalloc.h"
41#include "xfs_trace.h" 40#include "xfs_trace.h"
42#include "xfs_log.h" 41#include "xfs_log.h"
43#include "xfs_dinode.h"
44#include "xfs_filestream.h" 42#include "xfs_filestream.h"
45 43
46/* 44/*
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index b45f7b27b5df..9771b7ef62ed 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -20,9 +20,7 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_inum.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_error.h" 26#include "xfs_error.h"
@@ -65,6 +63,7 @@ xfs_inode_alloc(
65 return NULL; 63 return NULL;
66 } 64 }
67 65
66 XFS_STATS_INC(vn_active);
68 ASSERT(atomic_read(&ip->i_pincount) == 0); 67 ASSERT(atomic_read(&ip->i_pincount) == 0);
69 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 68 ASSERT(!spin_is_locked(&ip->i_flags_lock));
70 ASSERT(!xfs_isiflocked(ip)); 69 ASSERT(!xfs_isiflocked(ip));
@@ -130,6 +129,7 @@ xfs_inode_free(
130 /* asserts to verify all state is correct here */ 129 /* asserts to verify all state is correct here */
131 ASSERT(atomic_read(&ip->i_pincount) == 0); 130 ASSERT(atomic_read(&ip->i_pincount) == 0);
132 ASSERT(!xfs_isiflocked(ip)); 131 ASSERT(!xfs_isiflocked(ip));
132 XFS_STATS_DEC(vn_active);
133 133
134 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 134 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
135} 135}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 46748b86b12f..62f1f91c32cb 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -34,6 +34,14 @@ struct xfs_eofblocks {
34#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 34#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
35 35
36/* 36/*
37 * tags for inode radix tree
38 */
39#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
40 in xfs_inode_ag_iterator */
41#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
42#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
43
44/*
37 * Flags for xfs_iget() 45 * Flags for xfs_iget()
38 */ 46 */
39#define XFS_IGET_CREATE 0x1 47#define XFS_IGET_CREATE 0x1
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 7e4549233251..d45ca72af6fb 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -18,11 +18,10 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_shared.h" 20#include "xfs_shared.h"
21#include "xfs_format.h"
21#include "xfs_log_format.h" 22#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
23#include "xfs_bit.h" 24#include "xfs_bit.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 25#include "xfs_mount.h"
27#include "xfs_trans.h" 26#include "xfs_trans.h"
28#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8ed049d1e332..41f804e740d7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -23,9 +23,7 @@
23#include "xfs_format.h" 23#include "xfs_format.h"
24#include "xfs_log_format.h" 24#include "xfs_log_format.h"
25#include "xfs_trans_resv.h" 25#include "xfs_trans_resv.h"
26#include "xfs_inum.h"
27#include "xfs_sb.h" 26#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_inode.h" 28#include "xfs_inode.h"
31#include "xfs_da_format.h" 29#include "xfs_da_format.h"
@@ -1082,7 +1080,7 @@ xfs_create(
1082 struct xfs_dquot *udqp = NULL; 1080 struct xfs_dquot *udqp = NULL;
1083 struct xfs_dquot *gdqp = NULL; 1081 struct xfs_dquot *gdqp = NULL;
1084 struct xfs_dquot *pdqp = NULL; 1082 struct xfs_dquot *pdqp = NULL;
1085 struct xfs_trans_res tres; 1083 struct xfs_trans_res *tres;
1086 uint resblks; 1084 uint resblks;
1087 1085
1088 trace_xfs_create(dp, name); 1086 trace_xfs_create(dp, name);
@@ -1105,13 +1103,11 @@ xfs_create(
1105 if (is_dir) { 1103 if (is_dir) {
1106 rdev = 0; 1104 rdev = 0;
1107 resblks = XFS_MKDIR_SPACE_RES(mp, name->len); 1105 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
1108 tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres; 1106 tres = &M_RES(mp)->tr_mkdir;
1109 tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
1110 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); 1107 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
1111 } else { 1108 } else {
1112 resblks = XFS_CREATE_SPACE_RES(mp, name->len); 1109 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1113 tres.tr_logres = M_RES(mp)->tr_create.tr_logres; 1110 tres = &M_RES(mp)->tr_create;
1114 tres.tr_logcount = XFS_CREATE_LOG_COUNT;
1115 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); 1111 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1116 } 1112 }
1117 1113
@@ -1123,17 +1119,16 @@ xfs_create(
1123 * the case we'll drop the one we have and get a more 1119 * the case we'll drop the one we have and get a more
1124 * appropriate transaction later. 1120 * appropriate transaction later.
1125 */ 1121 */
1126 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; 1122 error = xfs_trans_reserve(tp, tres, resblks, 0);
1127 error = xfs_trans_reserve(tp, &tres, resblks, 0);
1128 if (error == -ENOSPC) { 1123 if (error == -ENOSPC) {
1129 /* flush outstanding delalloc blocks and retry */ 1124 /* flush outstanding delalloc blocks and retry */
1130 xfs_flush_inodes(mp); 1125 xfs_flush_inodes(mp);
1131 error = xfs_trans_reserve(tp, &tres, resblks, 0); 1126 error = xfs_trans_reserve(tp, tres, resblks, 0);
1132 } 1127 }
1133 if (error == -ENOSPC) { 1128 if (error == -ENOSPC) {
1134 /* No space at all so try a "no-allocation" reservation */ 1129 /* No space at all so try a "no-allocation" reservation */
1135 resblks = 0; 1130 resblks = 0;
1136 error = xfs_trans_reserve(tp, &tres, 0, 0); 1131 error = xfs_trans_reserve(tp, tres, 0, 0);
1137 } 1132 }
1138 if (error) { 1133 if (error) {
1139 cancel_flags = 0; 1134 cancel_flags = 0;
@@ -2488,9 +2483,7 @@ xfs_remove(
2488 xfs_fsblock_t first_block; 2483 xfs_fsblock_t first_block;
2489 int cancel_flags; 2484 int cancel_flags;
2490 int committed; 2485 int committed;
2491 int link_zero;
2492 uint resblks; 2486 uint resblks;
2493 uint log_count;
2494 2487
2495 trace_xfs_remove(dp, name); 2488 trace_xfs_remove(dp, name);
2496 2489
@@ -2505,13 +2498,10 @@ xfs_remove(
2505 if (error) 2498 if (error)
2506 goto std_return; 2499 goto std_return;
2507 2500
2508 if (is_dir) { 2501 if (is_dir)
2509 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); 2502 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2510 log_count = XFS_DEFAULT_LOG_COUNT; 2503 else
2511 } else {
2512 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); 2504 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2513 log_count = XFS_REMOVE_LOG_COUNT;
2514 }
2515 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2505 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2516 2506
2517 /* 2507 /*
@@ -2579,9 +2569,6 @@ xfs_remove(
2579 if (error) 2569 if (error)
2580 goto out_trans_cancel; 2570 goto out_trans_cancel;
2581 2571
2582 /* Determine if this is the last link while the inode is locked */
2583 link_zero = (ip->i_d.di_nlink == 0);
2584
2585 xfs_bmap_init(&free_list, &first_block); 2572 xfs_bmap_init(&free_list, &first_block);
2586 error = xfs_dir_removename(tp, dp, name, ip->i_ino, 2573 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2587 &first_block, &free_list, resblks); 2574 &first_block, &free_list, resblks);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 9af2882e1f4c..4ed2ba9342dc 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -20,7 +20,6 @@
20 20
21#include "xfs_inode_buf.h" 21#include "xfs_inode_buf.h"
22#include "xfs_inode_fork.h" 22#include "xfs_inode_fork.h"
23#include "xfs_dinode.h"
24 23
25/* 24/*
26 * Kernel only inode definitions 25 * Kernel only inode definitions
@@ -324,7 +323,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
324 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ 323 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
325 ((pip)->i_d.di_mode & S_ISGID)) 324 ((pip)->i_d.di_mode & S_ISGID))
326 325
327
328int xfs_release(struct xfs_inode *ip); 326int xfs_release(struct xfs_inode *ip);
329void xfs_inactive(struct xfs_inode *ip); 327void xfs_inactive(struct xfs_inode *ip);
330int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 328int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 63de0b0acc32..bf13a5a7e2f4 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_inode.h" 24#include "xfs_inode.h"
27#include "xfs_trans.h" 25#include "xfs_trans.h"
@@ -29,7 +27,6 @@
29#include "xfs_error.h" 27#include "xfs_error.h"
30#include "xfs_trace.h" 28#include "xfs_trace.h"
31#include "xfs_trans_priv.h" 29#include "xfs_trans_priv.h"
32#include "xfs_dinode.h"
33#include "xfs_log.h" 30#include "xfs_log.h"
34 31
35 32
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 24c926b6fe85..a1831980a68e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_ioctl.h" 26#include "xfs_ioctl.h"
@@ -40,7 +38,6 @@
40#include "xfs_trace.h" 38#include "xfs_trace.h"
41#include "xfs_icache.h" 39#include "xfs_icache.h"
42#include "xfs_symlink.h" 40#include "xfs_symlink.h"
43#include "xfs_dinode.h"
44#include "xfs_trans.h" 41#include "xfs_trans.h"
45 42
46#include <linux/capability.h> 43#include <linux/capability.h>
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 94ce027e28e3..ec6772866f3d 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -25,8 +25,6 @@
25#include "xfs_format.h" 25#include "xfs_format.h"
26#include "xfs_log_format.h" 26#include "xfs_log_format.h"
27#include "xfs_trans_resv.h" 27#include "xfs_trans_resv.h"
28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_inode.h" 29#include "xfs_inode.h"
32#include "xfs_itable.h" 30#include "xfs_itable.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index afcf3c926565..c980e2a5086b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_btree.h" 26#include "xfs_btree.h"
@@ -38,7 +36,6 @@
38#include "xfs_quota.h" 36#include "xfs_quota.h"
39#include "xfs_dquot_item.h" 37#include "xfs_dquot_item.h"
40#include "xfs_dquot.h" 38#include "xfs_dquot.h"
41#include "xfs_dinode.h"
42 39
43 40
44#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 41#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -52,7 +49,6 @@ xfs_iomap_eof_align_last_fsb(
52 xfs_extlen_t extsize, 49 xfs_extlen_t extsize,
53 xfs_fileoff_t *last_fsb) 50 xfs_fileoff_t *last_fsb)
54{ 51{
55 xfs_fileoff_t new_last_fsb = 0;
56 xfs_extlen_t align = 0; 52 xfs_extlen_t align = 0;
57 int eof, error; 53 int eof, error;
58 54
@@ -70,8 +66,8 @@ xfs_iomap_eof_align_last_fsb(
70 else if (mp->m_dalign) 66 else if (mp->m_dalign)
71 align = mp->m_dalign; 67 align = mp->m_dalign;
72 68
73 if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) 69 if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
74 new_last_fsb = roundup_64(*last_fsb, align); 70 align = 0;
75 } 71 }
76 72
77 /* 73 /*
@@ -79,14 +75,14 @@ xfs_iomap_eof_align_last_fsb(
79 * (when file on a real-time subvolume or has di_extsize hint). 75 * (when file on a real-time subvolume or has di_extsize hint).
80 */ 76 */
81 if (extsize) { 77 if (extsize) {
82 if (new_last_fsb) 78 if (align)
83 align = roundup_64(new_last_fsb, extsize); 79 align = roundup_64(align, extsize);
84 else 80 else
85 align = extsize; 81 align = extsize;
86 new_last_fsb = roundup_64(*last_fsb, align);
87 } 82 }
88 83
89 if (new_last_fsb) { 84 if (align) {
85 xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align);
90 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); 86 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
91 if (error) 87 if (error)
92 return error; 88 return error;
@@ -264,7 +260,6 @@ xfs_iomap_eof_want_preallocate(
264{ 260{
265 xfs_fileoff_t start_fsb; 261 xfs_fileoff_t start_fsb;
266 xfs_filblks_t count_fsb; 262 xfs_filblks_t count_fsb;
267 xfs_fsblock_t firstblock;
268 int n, error, imaps; 263 int n, error, imaps;
269 int found_delalloc = 0; 264 int found_delalloc = 0;
270 265
@@ -289,7 +284,6 @@ xfs_iomap_eof_want_preallocate(
289 count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); 284 count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
290 while (count_fsb > 0) { 285 while (count_fsb > 0) {
291 imaps = nimaps; 286 imaps = nimaps;
292 firstblock = NULLFSBLOCK;
293 error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, 287 error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
294 0); 288 0);
295 if (error) 289 if (error)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ec6dcdc181ee..c50311cae1b1 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
@@ -37,8 +35,7 @@
37#include "xfs_icache.h" 35#include "xfs_icache.h"
38#include "xfs_symlink.h" 36#include "xfs_symlink.h"
39#include "xfs_da_btree.h" 37#include "xfs_da_btree.h"
40#include "xfs_dir2_priv.h" 38#include "xfs_dir2.h"
41#include "xfs_dinode.h"
42#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
43 40
44#include <linux/capability.h> 41#include <linux/capability.h>
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 894924a5129b..82e314258f73 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -21,9 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_inum.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 24#include "xfs_mount.h"
28#include "xfs_inode.h" 25#include "xfs_inode.h"
29#include "xfs_btree.h" 26#include "xfs_btree.h"
@@ -33,7 +30,6 @@
33#include "xfs_error.h" 30#include "xfs_error.h"
34#include "xfs_trace.h" 31#include "xfs_trace.h"
35#include "xfs_icache.h" 32#include "xfs_icache.h"
36#include "xfs_dinode.h"
37 33
38STATIC int 34STATIC int
39xfs_internal_inum( 35xfs_internal_inum(
@@ -352,7 +348,6 @@ xfs_bulkstat(
352 int *done) /* 1 if there are more stats to get */ 348 int *done) /* 1 if there are more stats to get */
353{ 349{
354 xfs_buf_t *agbp; /* agi header buffer */ 350 xfs_buf_t *agbp; /* agi header buffer */
355 xfs_agi_t *agi; /* agi header data */
356 xfs_agino_t agino; /* inode # in allocation group */ 351 xfs_agino_t agino; /* inode # in allocation group */
357 xfs_agnumber_t agno; /* allocation group number */ 352 xfs_agnumber_t agno; /* allocation group number */
358 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ 353 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
@@ -403,7 +398,6 @@ xfs_bulkstat(
403 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 398 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
404 if (error) 399 if (error)
405 break; 400 break;
406 agi = XFS_BUF_TO_AGI(agbp);
407 /* 401 /*
408 * Allocate and initialize a btree cursor for ialloc btree. 402 * Allocate and initialize a btree cursor for ialloc btree.
409 */ 403 */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 6a51619d8690..c31d2c2eadc4 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -384,4 +384,10 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
384#endif /* XFS_WARN */ 384#endif /* XFS_WARN */
385#endif /* DEBUG */ 385#endif /* DEBUG */
386 386
387#ifdef CONFIG_XFS_RT
388#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
389#else
390#define XFS_IS_REALTIME_INODE(ip) (0)
391#endif
392
387#endif /* __XFS_LINUX__ */ 393#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index fe88ef67f93a..e408bf5a3ff7 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_error.h" 25#include "xfs_error.h"
28#include "xfs_trans.h" 26#include "xfs_trans.h"
@@ -1031,7 +1029,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
1031 struct xlog *log = mp->m_log; 1029 struct xlog *log = mp->m_log;
1032 int needed = 0; 1030 int needed = 0;
1033 1031
1034 if (!xfs_fs_writable(mp)) 1032 if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
1035 return 0; 1033 return 0;
1036 1034
1037 if (!xlog_cil_empty(log)) 1035 if (!xlog_cil_empty(log))
@@ -1808,6 +1806,8 @@ xlog_sync(
1808 XFS_BUF_ZEROFLAGS(bp); 1806 XFS_BUF_ZEROFLAGS(bp);
1809 XFS_BUF_ASYNC(bp); 1807 XFS_BUF_ASYNC(bp);
1810 bp->b_flags |= XBF_SYNCIO; 1808 bp->b_flags |= XBF_SYNCIO;
1809 /* use high priority completion wq */
1810 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1811 1811
1812 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { 1812 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
1813 bp->b_flags |= XBF_FUA; 1813 bp->b_flags |= XBF_FUA;
@@ -1856,6 +1856,8 @@ xlog_sync(
1856 bp->b_flags |= XBF_SYNCIO; 1856 bp->b_flags |= XBF_SYNCIO;
1857 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1857 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1858 bp->b_flags |= XBF_FUA; 1858 bp->b_flags |= XBF_FUA;
1859 /* use high priority completion wq */
1860 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1859 1861
1860 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1862 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1861 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1863 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index f506c457011e..45cc0ce18adf 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -17,11 +17,10 @@
17 17
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log_format.h" 21#include "xfs_log_format.h"
21#include "xfs_shared.h" 22#include "xfs_shared.h"
22#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 24#include "xfs_mount.h"
26#include "xfs_error.h" 25#include "xfs_error.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 00cd7f3a8f59..a5a945fc3bdc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -22,11 +22,10 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
30#include "xfs_inode.h" 29#include "xfs_inode.h"
31#include "xfs_trans.h" 30#include "xfs_trans.h"
32#include "xfs_log.h" 31#include "xfs_log.h"
@@ -42,7 +41,6 @@
42#include "xfs_trace.h" 41#include "xfs_trace.h"
43#include "xfs_icache.h" 42#include "xfs_icache.h"
44#include "xfs_bmap_btree.h" 43#include "xfs_bmap_btree.h"
45#include "xfs_dinode.h"
46#include "xfs_error.h" 44#include "xfs_error.h"
47#include "xfs_dir2.h" 45#include "xfs_dir2.h"
48 46
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 63ca2f0420b1..d8b67547ab34 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,10 +17,9 @@
17 17
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log_format.h" 21#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h" 23#include "xfs_mount.h"
25 24
26/* 25/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 51435dbce9c4..d3d38836f87f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -22,11 +22,10 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_inum.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
30#include "xfs_inode.h" 29#include "xfs_inode.h"
31#include "xfs_dir2.h" 30#include "xfs_dir2.h"
32#include "xfs_ialloc.h" 31#include "xfs_ialloc.h"
@@ -41,7 +40,6 @@
41#include "xfs_fsops.h" 40#include "xfs_fsops.h"
42#include "xfs_trace.h" 41#include "xfs_trace.h"
43#include "xfs_icache.h" 42#include "xfs_icache.h"
44#include "xfs_dinode.h"
45#include "xfs_sysfs.h" 43#include "xfs_sysfs.h"
46 44
47 45
@@ -1074,11 +1072,23 @@ xfs_unmountfs(
1074 xfs_sysfs_del(&mp->m_kobj); 1072 xfs_sysfs_del(&mp->m_kobj);
1075} 1073}
1076 1074
1077int 1075/*
1078xfs_fs_writable(xfs_mount_t *mp) 1076 * Determine whether modifications can proceed. The caller specifies the minimum
1077 * freeze level for which modifications should not be allowed. This allows
1078 * certain operations to proceed while the freeze sequence is in progress, if
1079 * necessary.
1080 */
1081bool
1082xfs_fs_writable(
1083 struct xfs_mount *mp,
1084 int level)
1079{ 1085{
1080 return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || 1086 ASSERT(level > SB_UNFROZEN);
1081 (mp->m_flags & XFS_MOUNT_RDONLY)); 1087 if ((mp->m_super->s_writers.frozen >= level) ||
1088 XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
1089 return false;
1090
1091 return true;
1082} 1092}
1083 1093
1084/* 1094/*
@@ -1086,9 +1096,9 @@ xfs_fs_writable(xfs_mount_t *mp)
1086 * 1096 *
1087 * Sync the superblock counters to disk. 1097 * Sync the superblock counters to disk.
1088 * 1098 *
1089 * Note this code can be called during the process of freezing, so 1099 * Note this code can be called during the process of freezing, so we use the
1090 * we may need to use the transaction allocator which does not 1100 * transaction allocator that does not block when the transaction subsystem is
1091 * block when the transaction subsystem is in its frozen state. 1101 * in its frozen state.
1092 */ 1102 */
1093int 1103int
1094xfs_log_sbcount(xfs_mount_t *mp) 1104xfs_log_sbcount(xfs_mount_t *mp)
@@ -1096,7 +1106,8 @@ xfs_log_sbcount(xfs_mount_t *mp)
1096 xfs_trans_t *tp; 1106 xfs_trans_t *tp;
1097 int error; 1107 int error;
1098 1108
1099 if (!xfs_fs_writable(mp)) 1109 /* allow this to proceed during the freeze sequence... */
1110 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
1100 return 0; 1111 return 0;
1101 1112
1102 xfs_icsb_sync_counters(mp, 0); 1113 xfs_icsb_sync_counters(mp, 0);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b0447c86e7e2..22ccf69d4d3c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -168,6 +168,7 @@ typedef struct xfs_mount {
168 /* low free space thresholds */ 168 /* low free space thresholds */
169 struct xfs_kobj m_kobj; 169 struct xfs_kobj m_kobj;
170 170
171 struct workqueue_struct *m_buf_workqueue;
171 struct workqueue_struct *m_data_workqueue; 172 struct workqueue_struct *m_data_workqueue;
172 struct workqueue_struct *m_unwritten_workqueue; 173 struct workqueue_struct *m_unwritten_workqueue;
173 struct workqueue_struct *m_cil_workqueue; 174 struct workqueue_struct *m_cil_workqueue;
@@ -320,10 +321,7 @@ typedef struct xfs_mod_sb {
320 321
321/* 322/*
322 * Per-ag incore structure, copies of information in agf and agi, to improve the 323 * Per-ag incore structure, copies of information in agf and agi, to improve the
323 * performance of allocation group selection. This is defined for the kernel 324 * performance of allocation group selection.
324 * only, and hence is defined here instead of in xfs_ag.h. You need the struct
325 * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree),
326 * so this doesn't introduce any strange header file dependencies.
327 */ 325 */
328typedef struct xfs_perag { 326typedef struct xfs_perag {
329 struct xfs_mount *pag_mount; /* owner filesystem */ 327 struct xfs_mount *pag_mount; /* owner filesystem */
@@ -384,7 +382,7 @@ extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
384extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 382extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
385extern int xfs_readsb(xfs_mount_t *, int); 383extern int xfs_readsb(xfs_mount_t *, int);
386extern void xfs_freesb(xfs_mount_t *); 384extern void xfs_freesb(xfs_mount_t *);
387extern int xfs_fs_writable(xfs_mount_t *); 385extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
388extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); 386extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
389 387
390extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 388extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index d68f23021af3..79fb19dd9c83 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -23,7 +23,6 @@
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_inode.h" 27#include "xfs_inode.h"
29#include "xfs_ialloc.h" 28#include "xfs_ialloc.h"
@@ -38,7 +37,6 @@
38#include "xfs_trace.h" 37#include "xfs_trace.h"
39#include "xfs_icache.h" 38#include "xfs_icache.h"
40#include "xfs_cksum.h" 39#include "xfs_cksum.h"
41#include "xfs_dinode.h"
42 40
43/* 41/*
44 * The global quota manager. There is only one of these for the entire 42 * The global quota manager. There is only one of these for the entire
@@ -1749,23 +1747,21 @@ xfs_qm_vop_dqalloc(
1749 xfs_iunlock(ip, lockflags); 1747 xfs_iunlock(ip, lockflags);
1750 if (O_udqpp) 1748 if (O_udqpp)
1751 *O_udqpp = uq; 1749 *O_udqpp = uq;
1752 else if (uq) 1750 else
1753 xfs_qm_dqrele(uq); 1751 xfs_qm_dqrele(uq);
1754 if (O_gdqpp) 1752 if (O_gdqpp)
1755 *O_gdqpp = gq; 1753 *O_gdqpp = gq;
1756 else if (gq) 1754 else
1757 xfs_qm_dqrele(gq); 1755 xfs_qm_dqrele(gq);
1758 if (O_pdqpp) 1756 if (O_pdqpp)
1759 *O_pdqpp = pq; 1757 *O_pdqpp = pq;
1760 else if (pq) 1758 else
1761 xfs_qm_dqrele(pq); 1759 xfs_qm_dqrele(pq);
1762 return 0; 1760 return 0;
1763 1761
1764error_rele: 1762error_rele:
1765 if (gq) 1763 xfs_qm_dqrele(gq);
1766 xfs_qm_dqrele(gq); 1764 xfs_qm_dqrele(uq);
1767 if (uq)
1768 xfs_qm_dqrele(uq);
1769 return error; 1765 return error;
1770} 1766}
1771 1767
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2c61e61b0205..3e52d5de7ae1 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_quota.h" 23#include "xfs_quota.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 80f2d77d929a..74fca68e43b6 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -26,7 +26,6 @@
26#include "xfs_trans_resv.h" 26#include "xfs_trans_resv.h"
27#include "xfs_bit.h" 27#include "xfs_bit.h"
28#include "xfs_sb.h" 28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_mount.h" 29#include "xfs_mount.h"
31#include "xfs_inode.h" 30#include "xfs_inode.h"
32#include "xfs_trans.h" 31#include "xfs_trans.h"
@@ -784,19 +783,21 @@ xfs_qm_log_quotaoff(
784{ 783{
785 xfs_trans_t *tp; 784 xfs_trans_t *tp;
786 int error; 785 int error;
787 xfs_qoff_logitem_t *qoffi=NULL; 786 xfs_qoff_logitem_t *qoffi;
788 uint oldsbqflag=0; 787
788 *qoffstartp = NULL;
789 789
790 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); 790 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
791 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); 791 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
792 if (error) 792 if (error) {
793 goto error0; 793 xfs_trans_cancel(tp, 0);
794 goto out;
795 }
794 796
795 qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); 797 qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
796 xfs_trans_log_quotaoff_item(tp, qoffi); 798 xfs_trans_log_quotaoff_item(tp, qoffi);
797 799
798 spin_lock(&mp->m_sb_lock); 800 spin_lock(&mp->m_sb_lock);
799 oldsbqflag = mp->m_sb.sb_qflags;
800 mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; 801 mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
801 spin_unlock(&mp->m_sb_lock); 802 spin_unlock(&mp->m_sb_lock);
802 803
@@ -809,19 +810,11 @@ xfs_qm_log_quotaoff(
809 */ 810 */
810 xfs_trans_set_sync(tp); 811 xfs_trans_set_sync(tp);
811 error = xfs_trans_commit(tp, 0); 812 error = xfs_trans_commit(tp, 0);
813 if (error)
814 goto out;
812 815
813error0:
814 if (error) {
815 xfs_trans_cancel(tp, 0);
816 /*
817 * No one else is modifying sb_qflags, so this is OK.
818 * We still hold the quotaofflock.
819 */
820 spin_lock(&mp->m_sb_lock);
821 mp->m_sb.sb_qflags = oldsbqflag;
822 spin_unlock(&mp->m_sb_lock);
823 }
824 *qoffstartp = qoffi; 816 *qoffstartp = qoffi;
817out:
825 return error; 818 return error;
826} 819}
827 820
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index b238027df987..7542bbeca6a1 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -19,8 +19,6 @@
19#include "xfs_format.h" 19#include "xfs_format.h"
20#include "xfs_log_format.h" 20#include "xfs_log_format.h"
21#include "xfs_trans_resv.h" 21#include "xfs_trans_resv.h"
22#include "xfs_sb.h"
23#include "xfs_ag.h"
24#include "xfs_mount.h" 22#include "xfs_mount.h"
25#include "xfs_inode.h" 23#include "xfs_inode.h"
26#include "xfs_quota.h" 24#include "xfs_quota.h"
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e1175ea9b551..f2079b6911cc 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -22,8 +22,6 @@
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_bit.h" 24#include "xfs_bit.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_bmap.h" 27#include "xfs_bmap.h"
@@ -36,7 +34,6 @@
36#include "xfs_trace.h" 34#include "xfs_trace.h"
37#include "xfs_buf.h" 35#include "xfs_buf.h"
38#include "xfs_icache.h" 36#include "xfs_icache.h"
39#include "xfs_dinode.h"
40#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
41 38
42 39
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 206b97fd1d8a..19cbda196369 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -21,9 +21,7 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_inum.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_da_format.h" 26#include "xfs_da_format.h"
29#include "xfs_inode.h" 27#include "xfs_inode.h"
@@ -44,7 +42,6 @@
44#include "xfs_icache.h" 42#include "xfs_icache.h"
45#include "xfs_trace.h" 43#include "xfs_trace.h"
46#include "xfs_icreate_item.h" 44#include "xfs_icreate_item.h"
47#include "xfs_dinode.h"
48#include "xfs_filestream.h" 45#include "xfs_filestream.h"
49#include "xfs_quota.h" 46#include "xfs_quota.h"
50#include "xfs_sysfs.h" 47#include "xfs_sysfs.h"
@@ -796,8 +793,7 @@ xfs_open_devices(
796 out_free_ddev_targ: 793 out_free_ddev_targ:
797 xfs_free_buftarg(mp, mp->m_ddev_targp); 794 xfs_free_buftarg(mp, mp->m_ddev_targp);
798 out_close_rtdev: 795 out_close_rtdev:
799 if (rtdev) 796 xfs_blkdev_put(rtdev);
800 xfs_blkdev_put(rtdev);
801 out_close_logdev: 797 out_close_logdev:
802 if (logdev && logdev != ddev) 798 if (logdev && logdev != ddev)
803 xfs_blkdev_put(logdev); 799 xfs_blkdev_put(logdev);
@@ -842,10 +838,15 @@ STATIC int
842xfs_init_mount_workqueues( 838xfs_init_mount_workqueues(
843 struct xfs_mount *mp) 839 struct xfs_mount *mp)
844{ 840{
841 mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
842 WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname);
843 if (!mp->m_buf_workqueue)
844 goto out;
845
845 mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", 846 mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
846 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 847 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
847 if (!mp->m_data_workqueue) 848 if (!mp->m_data_workqueue)
848 goto out; 849 goto out_destroy_buf;
849 850
850 mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", 851 mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
851 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); 852 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
@@ -863,7 +864,7 @@ xfs_init_mount_workqueues(
863 goto out_destroy_cil; 864 goto out_destroy_cil;
864 865
865 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", 866 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
866 WQ_FREEZABLE, 0, mp->m_fsname); 867 WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
867 if (!mp->m_log_workqueue) 868 if (!mp->m_log_workqueue)
868 goto out_destroy_reclaim; 869 goto out_destroy_reclaim;
869 870
@@ -884,6 +885,8 @@ out_destroy_unwritten:
884 destroy_workqueue(mp->m_unwritten_workqueue); 885 destroy_workqueue(mp->m_unwritten_workqueue);
885out_destroy_data_iodone_queue: 886out_destroy_data_iodone_queue:
886 destroy_workqueue(mp->m_data_workqueue); 887 destroy_workqueue(mp->m_data_workqueue);
888out_destroy_buf:
889 destroy_workqueue(mp->m_buf_workqueue);
887out: 890out:
888 return -ENOMEM; 891 return -ENOMEM;
889} 892}
@@ -898,6 +901,7 @@ xfs_destroy_mount_workqueues(
898 destroy_workqueue(mp->m_cil_workqueue); 901 destroy_workqueue(mp->m_cil_workqueue);
899 destroy_workqueue(mp->m_data_workqueue); 902 destroy_workqueue(mp->m_data_workqueue);
900 destroy_workqueue(mp->m_unwritten_workqueue); 903 destroy_workqueue(mp->m_unwritten_workqueue);
904 destroy_workqueue(mp->m_buf_workqueue);
901} 905}
902 906
903/* 907/*
@@ -1000,7 +1004,6 @@ xfs_fs_evict_inode(
1000 clear_inode(inode); 1004 clear_inode(inode);
1001 XFS_STATS_INC(vn_rele); 1005 XFS_STATS_INC(vn_rele);
1002 XFS_STATS_INC(vn_remove); 1006 XFS_STATS_INC(vn_remove);
1003 XFS_STATS_DEC(vn_active);
1004 1007
1005 xfs_inactive(ip); 1008 xfs_inactive(ip);
1006} 1009}
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 02ae62a998e0..25791df6f638 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -23,8 +23,6 @@
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_bit.h" 25#include "xfs_bit.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_da_format.h" 27#include "xfs_da_format.h"
30#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
@@ -42,7 +40,6 @@
42#include "xfs_symlink.h" 40#include "xfs_symlink.h"
43#include "xfs_trans.h" 41#include "xfs_trans.h"
44#include "xfs_log.h" 42#include "xfs_log.h"
45#include "xfs_dinode.h"
46 43
47/* ----- Kernel only functions below ----- */ 44/* ----- Kernel only functions below ----- */
48STATIC int 45STATIC int
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 1e85bcd0e418..13a029806805 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_da_format.h" 25#include "xfs_da_format.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 30e8e3410955..fa3135b9bf04 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -22,8 +22,6 @@
22#include "xfs_format.h" 22#include "xfs_format.h"
23#include "xfs_log_format.h" 23#include "xfs_log_format.h"
24#include "xfs_trans_resv.h" 24#include "xfs_trans_resv.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_inode.h" 26#include "xfs_inode.h"
29#include "xfs_extent_busy.h" 27#include "xfs_extent_busy.h"
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 859482f53b5a..573aefb5a573 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -18,10 +18,9 @@
18 */ 18 */
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_fs.h" 20#include "xfs_fs.h"
21#include "xfs_format.h"
21#include "xfs_log_format.h" 22#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 24#include "xfs_mount.h"
26#include "xfs_trans.h" 25#include "xfs_trans.h"
27#include "xfs_trans_priv.h" 26#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index e2b2216b1635..0a4d4ab6d9a9 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_trans.h" 26#include "xfs_trans.h"
@@ -229,13 +227,6 @@ xfs_trans_getsb(xfs_trans_t *tp,
229 return bp; 227 return bp;
230} 228}
231 229
232#ifdef DEBUG
233xfs_buftarg_t *xfs_error_target;
234int xfs_do_error;
235int xfs_req_num;
236int xfs_error_mod = 33;
237#endif
238
239/* 230/*
240 * Get and lock the buffer for the caller if it is not already 231 * Get and lock the buffer for the caller if it is not already
241 * locked within the given transaction. If it has not yet been 232 * locked within the given transaction. If it has not yet been
@@ -257,46 +248,11 @@ xfs_trans_read_buf_map(
257 struct xfs_buf **bpp, 248 struct xfs_buf **bpp,
258 const struct xfs_buf_ops *ops) 249 const struct xfs_buf_ops *ops)
259{ 250{
260 xfs_buf_t *bp; 251 struct xfs_buf *bp = NULL;
261 xfs_buf_log_item_t *bip; 252 struct xfs_buf_log_item *bip;
262 int error; 253 int error;
263 254
264 *bpp = NULL; 255 *bpp = NULL;
265 if (!tp) {
266 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
267 if (!bp)
268 return (flags & XBF_TRYLOCK) ?
269 -EAGAIN : -ENOMEM;
270
271 if (bp->b_error) {
272 error = bp->b_error;
273 xfs_buf_ioerror_alert(bp, __func__);
274 XFS_BUF_UNDONE(bp);
275 xfs_buf_stale(bp);
276 xfs_buf_relse(bp);
277
278 /* bad CRC means corrupted metadata */
279 if (error == -EFSBADCRC)
280 error = -EFSCORRUPTED;
281 return error;
282 }
283#ifdef DEBUG
284 if (xfs_do_error) {
285 if (xfs_error_target == target) {
286 if (((xfs_req_num++) % xfs_error_mod) == 0) {
287 xfs_buf_relse(bp);
288 xfs_debug(mp, "Returning error!");
289 return -EIO;
290 }
291 }
292 }
293#endif
294 if (XFS_FORCED_SHUTDOWN(mp))
295 goto shutdown_abort;
296 *bpp = bp;
297 return 0;
298 }
299
300 /* 256 /*
301 * If we find the buffer in the cache with this transaction 257 * If we find the buffer in the cache with this transaction
302 * pointer in its b_fsprivate2 field, then we know we already 258 * pointer in its b_fsprivate2 field, then we know we already
@@ -305,49 +261,24 @@ xfs_trans_read_buf_map(
305 * If the buffer is not yet read in, then we read it in, increment 261 * If the buffer is not yet read in, then we read it in, increment
306 * the lock recursion count, and return it to the caller. 262 * the lock recursion count, and return it to the caller.
307 */ 263 */
308 bp = xfs_trans_buf_item_match(tp, target, map, nmaps); 264 if (tp)
309 if (bp != NULL) { 265 bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
266 if (bp) {
310 ASSERT(xfs_buf_islocked(bp)); 267 ASSERT(xfs_buf_islocked(bp));
311 ASSERT(bp->b_transp == tp); 268 ASSERT(bp->b_transp == tp);
312 ASSERT(bp->b_fspriv != NULL); 269 ASSERT(bp->b_fspriv != NULL);
313 ASSERT(!bp->b_error); 270 ASSERT(!bp->b_error);
314 if (!(XFS_BUF_ISDONE(bp))) { 271 ASSERT(bp->b_flags & XBF_DONE);
315 trace_xfs_trans_read_buf_io(bp, _RET_IP_); 272
316 ASSERT(!XFS_BUF_ISASYNC(bp));
317 ASSERT(bp->b_iodone == NULL);
318 XFS_BUF_READ(bp);
319 bp->b_ops = ops;
320
321 error = xfs_buf_submit_wait(bp);
322 if (error) {
323 if (!XFS_FORCED_SHUTDOWN(mp))
324 xfs_buf_ioerror_alert(bp, __func__);
325 xfs_buf_relse(bp);
326 /*
327 * We can gracefully recover from most read
328 * errors. Ones we can't are those that happen
329 * after the transaction's already dirty.
330 */
331 if (tp->t_flags & XFS_TRANS_DIRTY)
332 xfs_force_shutdown(tp->t_mountp,
333 SHUTDOWN_META_IO_ERROR);
334 /* bad CRC means corrupted metadata */
335 if (error == -EFSBADCRC)
336 error = -EFSCORRUPTED;
337 return error;
338 }
339 }
340 /* 273 /*
341 * We never locked this buf ourselves, so we shouldn't 274 * We never locked this buf ourselves, so we shouldn't
342 * brelse it either. Just get out. 275 * brelse it either. Just get out.
343 */ 276 */
344 if (XFS_FORCED_SHUTDOWN(mp)) { 277 if (XFS_FORCED_SHUTDOWN(mp)) {
345 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 278 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
346 *bpp = NULL;
347 return -EIO; 279 return -EIO;
348 } 280 }
349 281
350
351 bip = bp->b_fspriv; 282 bip = bp->b_fspriv;
352 bip->bli_recur++; 283 bip->bli_recur++;
353 284
@@ -358,17 +289,29 @@ xfs_trans_read_buf_map(
358 } 289 }
359 290
360 bp = xfs_buf_read_map(target, map, nmaps, flags, ops); 291 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
361 if (bp == NULL) { 292 if (!bp) {
362 *bpp = NULL; 293 if (!(flags & XBF_TRYLOCK))
363 return (flags & XBF_TRYLOCK) ? 294 return -ENOMEM;
364 0 : -ENOMEM; 295 return tp ? 0 : -EAGAIN;
365 } 296 }
297
298 /*
299 * If we've had a read error, then the contents of the buffer are
300 * invalid and should not be used. To ensure that a followup read tries
301 * to pull the buffer from disk again, we clear the XBF_DONE flag and
302 * mark the buffer stale. This ensures that anyone who has a current
303 * reference to the buffer will interpret it's contents correctly and
304 * future cache lookups will also treat it as an empty, uninitialised
305 * buffer.
306 */
366 if (bp->b_error) { 307 if (bp->b_error) {
367 error = bp->b_error; 308 error = bp->b_error;
309 if (!XFS_FORCED_SHUTDOWN(mp))
310 xfs_buf_ioerror_alert(bp, __func__);
311 bp->b_flags &= ~XBF_DONE;
368 xfs_buf_stale(bp); 312 xfs_buf_stale(bp);
369 XFS_BUF_DONE(bp); 313
370 xfs_buf_ioerror_alert(bp, __func__); 314 if (tp && (tp->t_flags & XFS_TRANS_DIRTY))
371 if (tp->t_flags & XFS_TRANS_DIRTY)
372 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); 315 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
373 xfs_buf_relse(bp); 316 xfs_buf_relse(bp);
374 317
@@ -377,33 +320,19 @@ xfs_trans_read_buf_map(
377 error = -EFSCORRUPTED; 320 error = -EFSCORRUPTED;
378 return error; 321 return error;
379 } 322 }
380#ifdef DEBUG 323
381 if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) { 324 if (XFS_FORCED_SHUTDOWN(mp)) {
382 if (xfs_error_target == target) { 325 xfs_buf_relse(bp);
383 if (((xfs_req_num++) % xfs_error_mod) == 0) { 326 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
384 xfs_force_shutdown(tp->t_mountp, 327 return -EIO;
385 SHUTDOWN_META_IO_ERROR);
386 xfs_buf_relse(bp);
387 xfs_debug(mp, "Returning trans error!");
388 return -EIO;
389 }
390 }
391 } 328 }
392#endif
393 if (XFS_FORCED_SHUTDOWN(mp))
394 goto shutdown_abort;
395 329
396 _xfs_trans_bjoin(tp, bp, 1); 330 if (tp)
331 _xfs_trans_bjoin(tp, bp, 1);
397 trace_xfs_trans_read_buf(bp->b_fspriv); 332 trace_xfs_trans_read_buf(bp->b_fspriv);
398
399 *bpp = bp; 333 *bpp = bp;
400 return 0; 334 return 0;
401 335
402shutdown_abort:
403 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
404 xfs_buf_relse(bp);
405 *bpp = NULL;
406 return -EIO;
407} 336}
408 337
409/* 338/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 846e061c2e98..76a16df55ef7 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_error.h" 26#include "xfs_error.h"
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 47978ba89dae..284397dd7990 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -18,10 +18,9 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_shared.h" 20#include "xfs_shared.h"
21#include "xfs_format.h"
21#include "xfs_log_format.h" 22#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 24#include "xfs_mount.h"
26#include "xfs_trans.h" 25#include "xfs_trans.h"
27#include "xfs_trans_priv.h" 26#include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index cdb4d86520e1..17280cd71934 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -21,8 +21,6 @@
21#include "xfs_format.h" 21#include "xfs_format.h"
22#include "xfs_log_format.h" 22#include "xfs_log_format.h"
23#include "xfs_trans_resv.h" 23#include "xfs_trans_resv.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h" 24#include "xfs_mount.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"
28#include "xfs_trans.h" 26#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 93455b998041..69f6e475de97 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -20,8 +20,6 @@
20#include "xfs_format.h" 20#include "xfs_format.h"
21#include "xfs_log_format.h" 21#include "xfs_log_format.h"
22#include "xfs_trans_resv.h" 22#include "xfs_trans_resv.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h" 23#include "xfs_mount.h"
26#include "xfs_da_format.h" 24#include "xfs_da_format.h"
27#include "xfs_inode.h" 25#include "xfs_inode.h"