Diffstat (limited to 'fs')
-rw-r--r--  fs/binfmt_aout.c | 5
-rw-r--r--  fs/binfmt_elf.c | 5
-rw-r--r--  fs/binfmt_elf_fdpic.c | 6
-rw-r--r--  fs/binfmt_em86.c | 4
-rw-r--r--  fs/binfmt_flat.c | 5
-rw-r--r--  fs/binfmt_misc.c | 4
-rw-r--r--  fs/binfmt_script.c | 4
-rw-r--r--  fs/binfmt_som.c | 5
-rw-r--r--  fs/bio.c | 6
-rw-r--r--  fs/block_dev.c | 166
-rw-r--r--  fs/btrfs/ctree.h | 2
-rw-r--r--  fs/btrfs/disk-io.c | 8
-rw-r--r--  fs/btrfs/extent-tree.c | 2
-rw-r--r--  fs/btrfs/extent_map.c | 3
-rw-r--r--  fs/btrfs/file.c | 3
-rw-r--r--  fs/btrfs/ioctl.c | 2
-rw-r--r--  fs/btrfs/ordered-data.h | 2
-rw-r--r--  fs/btrfs/volumes.c | 2
-rw-r--r--  fs/buffer.c | 163
-rw-r--r--  fs/cifs/Kconfig | 10
-rw-r--r--  fs/cifs/README | 2
-rw-r--r--  fs/cifs/cifs_debug.h | 72
-rw-r--r--  fs/cifs/cifsacl.c | 765
-rw-r--r--  fs/cifs/cifsacl.h | 66
-rw-r--r--  fs/cifs/cifsfs.c | 17
-rw-r--r--  fs/cifs/cifsglob.h | 36
-rw-r--r--  fs/cifs/cifsproto.h | 10
-rw-r--r--  fs/cifs/connect.c | 310
-rw-r--r--  fs/cifs/dir.c | 43
-rw-r--r--  fs/cifs/file.c | 206
-rw-r--r--  fs/cifs/inode.c | 7
-rw-r--r--  fs/cifs/netmisc.c | 14
-rw-r--r--  fs/cifs/readdir.c | 55
-rw-r--r--  fs/cifs/smb1ops.c | 35
-rw-r--r--  fs/cifs/smb2file.c | 12
-rw-r--r--  fs/cifs/smb2ops.c | 103
-rw-r--r--  fs/cifs/smb2pdu.c | 5
-rw-r--r--  fs/cifs/smb2proto.h | 4
-rw-r--r--  fs/cifs/smb2transport.c | 13
-rw-r--r--  fs/compat_ioctl.c | 3
-rw-r--r--  fs/coredump.c | 4
-rw-r--r--  fs/debugfs/inode.c | 1
-rw-r--r--  fs/devpts/inode.c | 61
-rw-r--r--  fs/direct-io.c | 8
-rw-r--r--  fs/dlm/Kconfig | 2
-rw-r--r--  fs/dlm/dlm_internal.h | 1
-rw-r--r--  fs/dlm/lock.c | 16
-rw-r--r--  fs/dlm/lowcomms.c | 5
-rw-r--r--  fs/dlm/recover.c | 37
-rw-r--r--  fs/eventpoll.c | 38
-rw-r--r--  fs/exec.c | 37
-rw-r--r--  fs/ext3/balloc.c | 5
-rw-r--r--  fs/ext4/ext4.h | 2
-rw-r--r--  fs/ext4/ialloc.c | 19
-rw-r--r--  fs/fhandle.c | 4
-rw-r--r--  fs/file.c | 25
-rw-r--r--  fs/fs-writeback.c | 4
-rw-r--r--  fs/fs_struct.c | 24
-rw-r--r--  fs/gfs2/file.c | 14
-rw-r--r--  fs/gfs2/glock.c | 2
-rw-r--r--  fs/gfs2/lops.c | 16
-rw-r--r--  fs/gfs2/quota.c | 7
-rw-r--r--  fs/gfs2/rgrp.c | 33
-rw-r--r--  fs/gfs2/super.c | 3
-rw-r--r--  fs/gfs2/trans.c | 8
-rw-r--r--  fs/hugetlbfs/inode.c | 111
-rw-r--r--  fs/inode.c | 18
-rw-r--r--  fs/internal.h | 1
-rw-r--r--  fs/jbd/transaction.c | 4
-rw-r--r--  fs/jbd2/transaction.c | 2
-rw-r--r--  fs/jffs2/file.c | 39
-rw-r--r--  fs/logfs/inode.c | 2
-rw-r--r--  fs/namei.c | 5
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/nfs/dir.c | 7
-rw-r--r--  fs/nfs/dns_resolve.c | 5
-rw-r--r--  fs/nfs/inode.c | 5
-rw-r--r--  fs/nfs/internal.h | 6
-rw-r--r--  fs/nfs/mount_clnt.c | 2
-rw-r--r--  fs/nfs/namespace.c | 19
-rw-r--r--  fs/nfs/nfs4namespace.c | 3
-rw-r--r--  fs/nfs/nfs4proc.c | 46
-rw-r--r--  fs/nfs/pnfs.c | 4
-rw-r--r--  fs/nfs/super.c | 51
-rw-r--r--  fs/nfs/unlink.c | 2
-rw-r--r--  fs/nilfs2/page.c | 2
-rw-r--r--  fs/notify/fanotify/Kconfig | 2
-rw-r--r--  fs/notify/fanotify/fanotify.c | 1
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 3
-rw-r--r--  fs/notify/notification.c | 2
-rw-r--r--  fs/ocfs2/file.c | 5
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  fs/proc/base.c | 124
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/proc_sysctl.c | 9
-rw-r--r--  fs/proc/task_mmu.c | 6
-rw-r--r--  fs/pstore/inode.c | 7
-rw-r--r--  fs/pstore/internal.h | 2
-rw-r--r--  fs/pstore/platform.c | 14
-rw-r--r--  fs/pstore/ram.c | 9
-rw-r--r--  fs/reiserfs/inode.c | 10
-rw-r--r--  fs/reiserfs/stree.c | 4
-rw-r--r--  fs/reiserfs/super.c | 60
-rw-r--r--  fs/splice.c | 5
-rw-r--r--  fs/sysfs/file.c | 4
-rw-r--r--  fs/ubifs/find.c | 12
-rw-r--r--  fs/ubifs/lprops.c | 6
-rw-r--r--  fs/ubifs/ubifs.h | 3
-rw-r--r--  fs/xfs/Kconfig | 1
-rw-r--r--  fs/xfs/Makefile | 4
-rw-r--r--  fs/xfs/uuid.h | 6
-rw-r--r--  fs/xfs/xfs_ag.h | 5
-rw-r--r--  fs/xfs/xfs_alloc.c | 183
-rw-r--r--  fs/xfs/xfs_alloc.h | 6
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 79
-rw-r--r--  fs/xfs/xfs_alloc_btree.h | 2
-rw-r--r--  fs/xfs/xfs_aops.c | 137
-rw-r--r--  fs/xfs/xfs_attr.c | 103
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 163
-rw-r--r--  fs/xfs/xfs_attr_leaf.h | 6
-rw-r--r--  fs/xfs/xfs_bmap.c | 127
-rw-r--r--  fs/xfs/xfs_bmap.h | 9
-rw-r--r--  fs/xfs/xfs_bmap_btree.c | 63
-rw-r--r--  fs/xfs/xfs_bmap_btree.h | 1
-rw-r--r--  fs/xfs/xfs_btree.c | 111
-rw-r--r--  fs/xfs/xfs_btree.h | 22
-rw-r--r--  fs/xfs/xfs_buf.c | 73
-rw-r--r--  fs/xfs/xfs_buf.h | 27
-rw-r--r--  fs/xfs/xfs_buf_item.c | 18
-rw-r--r--  fs/xfs/xfs_cksum.h | 63
-rw-r--r--  fs/xfs/xfs_da_btree.c | 141
-rw-r--r--  fs/xfs/xfs_da_btree.h | 10
-rw-r--r--  fs/xfs/xfs_dfrag.c | 13
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 436
-rw-r--r--  fs/xfs/xfs_dir2_data.c | 170
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 172
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 288
-rw-r--r--  fs/xfs/xfs_dir2_priv.h | 19
-rw-r--r--  fs/xfs/xfs_dquot.c | 134
-rw-r--r--  fs/xfs/xfs_dquot.h | 2
-rw-r--r--  fs/xfs/xfs_export.c | 1
-rw-r--r--  fs/xfs/xfs_file.c | 42
-rw-r--r--  fs/xfs/xfs_fs.h | 33
-rw-r--r--  fs/xfs/xfs_fs_subr.c | 96
-rw-r--r--  fs/xfs/xfs_fsops.c | 158
-rw-r--r--  fs/xfs/xfs_globals.c | 4
-rw-r--r--  fs/xfs/xfs_ialloc.c | 84
-rw-r--r--  fs/xfs/xfs_ialloc.h | 4
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 55
-rw-r--r--  fs/xfs/xfs_ialloc_btree.h | 2
-rw-r--r--  fs/xfs/xfs_icache.c (renamed from fs/xfs/xfs_sync.c) | 914
-rw-r--r--  fs/xfs/xfs_icache.h (renamed from fs/xfs/xfs_sync.h) | 28
-rw-r--r--  fs/xfs/xfs_iget.c | 705
-rw-r--r--  fs/xfs/xfs_inode.c | 440
-rw-r--r--  fs/xfs/xfs_inode.h | 12
-rw-r--r--  fs/xfs/xfs_ioctl.c | 23
-rw-r--r--  fs/xfs/xfs_iomap.c | 35
-rw-r--r--  fs/xfs/xfs_iops.c | 8
-rw-r--r--  fs/xfs/xfs_itable.c | 4
-rw-r--r--  fs/xfs/xfs_linux.h | 2
-rw-r--r--  fs/xfs/xfs_log.c | 260
-rw-r--r--  fs/xfs/xfs_log.h | 4
-rw-r--r--  fs/xfs/xfs_log_priv.h | 12
-rw-r--r--  fs/xfs/xfs_log_recover.c | 148
-rw-r--r--  fs/xfs/xfs_mount.c | 163
-rw-r--r--  fs/xfs/xfs_mount.h | 13
-rw-r--r--  fs/xfs/xfs_qm.c | 22
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 6
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 16
-rw-r--r--  fs/xfs/xfs_sb.h | 7
-rw-r--r--  fs/xfs/xfs_super.c | 148
-rw-r--r--  fs/xfs/xfs_super.h | 1
-rw-r--r--  fs/xfs/xfs_sysctl.c | 9
-rw-r--r--  fs/xfs/xfs_sysctl.h | 1
-rw-r--r--  fs/xfs/xfs_trace.h | 60
-rw-r--r--  fs/xfs/xfs_trans.h | 19
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 9
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 168
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 9
179 files changed, 5550 insertions, 3960 deletions
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 0e7a6f81ae36..6043567b95c2 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -30,7 +30,7 @@
 #include <asm/cacheflush.h>
 #include <asm/a.out-core.h>
 
-static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_aout_binary(struct linux_binprm *);
 static int load_aout_library(struct file*);
 
 #ifdef CONFIG_COREDUMP
@@ -201,8 +201,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
  * libraries.  There is no binary dependent code anywhere else.
  */
 
-static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_aout_binary(struct linux_binprm * bprm)
 {
+	struct pt_regs *regs = current_pt_regs();
 	struct exec ex;
 	unsigned long error;
 	unsigned long fd_offset;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fbd9f60bd763..6d7d1647a68c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -44,7 +44,7 @@
 #define user_siginfo_t siginfo_t
 #endif
 
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
+static int load_elf_binary(struct linux_binprm *bprm);
 static int load_elf_library(struct file *);
 static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
 				int, int, unsigned long);
@@ -558,7 +558,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 #endif
 }
 
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_elf_binary(struct linux_binprm *bprm)
 {
 	struct file *interpreter = NULL; /* to shut gcc up */
 	unsigned long load_addr = 0, load_bias = 0;
@@ -575,6 +575,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long reloc_func_desc __maybe_unused = 0;
 	int executable_stack = EXSTACK_DEFAULT;
 	unsigned long def_flags = 0;
+	struct pt_regs *regs = current_pt_regs();
 	struct {
 		struct elfhdr elf_ex;
 		struct elfhdr interp_elf_ex;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a46049154107..dc84732e554f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -56,7 +56,7 @@ typedef char *elf_caddr_t;
 
 MODULE_LICENSE("GPL");
 
-static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *);
+static int load_elf_fdpic_binary(struct linux_binprm *);
 static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *);
 static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *,
 			      struct mm_struct *, const char *);
@@ -164,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
 /*
  * load an fdpic binary into various bits of memory
  */
-static int load_elf_fdpic_binary(struct linux_binprm *bprm,
-				 struct pt_regs *regs)
+static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 {
 	struct elf_fdpic_params exec_params, interp_params;
+	struct pt_regs *regs = current_pt_regs();
 	struct elf_phdr *phdr;
 	unsigned long stack_size, entryaddr;
 #ifdef ELF_FDPIC_PLAT_INIT
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 2790c7e1912e..4e6cce57d113 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -22,7 +22,7 @@
 #define EM86_INTERP	"/usr/bin/em86"
 #define EM86_I_NAME	"em86"
 
-static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_em86(struct linux_binprm *bprm)
 {
 	char *interp, *i_name, *i_arg;
 	struct file * file;
@@ -90,7 +90,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
 	if (retval < 0)
 		return retval;
 
-	return search_binary_handler(bprm, regs);
+	return search_binary_handler(bprm);
 }
 
 static struct linux_binfmt em86_format = {
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e280352b28f9..b56371981d16 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -88,7 +88,7 @@ struct lib_info {
 static int load_flat_shared_library(int id, struct lib_info *p);
 #endif
 
-static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_flat_binary(struct linux_binprm *);
 static int flat_core_dump(struct coredump_params *cprm);
 
 static struct linux_binfmt flat_format = {
@@ -858,9 +858,10 @@ out:
  * libraries.  There is no binary dependent code anywhere else.
  */
 
-static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_flat_binary(struct linux_binprm * bprm)
 {
 	struct lib_info libinfo;
+	struct pt_regs *regs = current_pt_regs();
 	unsigned long p = bprm->p;
 	unsigned long stack_len;
 	unsigned long start_addr;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 790b3cddca67..b0b70fbea06c 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -104,7 +104,7 @@ static Node *check_file(struct linux_binprm *bprm)
 /*
  * the loader itself
  */
-static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_misc_binary(struct linux_binprm *bprm)
 {
 	Node *fmt;
 	struct file * interp_file = NULL;
@@ -199,7 +199,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	bprm->recursion_depth++;
 
-	retval = search_binary_handler (bprm, regs);
+	retval = search_binary_handler(bprm);
 	if (retval < 0)
 		goto _error;
 
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d3b8c1f63155..8c954997e7f7 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,7 +14,7 @@
 #include <linux/err.h>
 #include <linux/fs.h>
 
-static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_script(struct linux_binprm *bprm)
 {
 	const char *i_arg, *i_name;
 	char *cp;
@@ -95,7 +95,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 	retval = prepare_binprm(bprm);
 	if (retval < 0)
 		return retval;
-	return search_binary_handler(bprm,regs);
+	return search_binary_handler(bprm);
 }
 
 static struct linux_binfmt script_format = {
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 4517aaff61b4..4e00ed68d4a6 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -35,7 +35,7 @@
 
 #include <linux/elf.h>
 
-static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs);
+static int load_som_binary(struct linux_binprm * bprm);
 static int load_som_library(struct file *);
 
 /*
@@ -180,13 +180,14 @@ out:
  */
 
 static int
-load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+load_som_binary(struct linux_binprm * bprm)
 {
 	int retval;
 	unsigned int size;
 	unsigned long som_entry;
 	struct som_hdr *som_ex;
 	struct som_exec_auxhdr *hpuxhdr;
+	struct pt_regs *regs = current_pt_regs();
 
 	/* Get the exec-header */
 	som_ex = (struct som_hdr *) bprm->buf;
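Every binfmt loader above receives the same mechanical conversion: the ->load_binary() entry point and search_binary_handler() lose their struct pt_regs * parameter, and each loader that still needs the register frame derives it from the current task instead. A minimal sketch of the pattern, using a hypothetical loader name (current_pt_regs(), flush_old_exec() and start_thread() are the interfaces the hunks themselves use):

/*
 * Hypothetical loader sketching the conversion above: the entry point
 * no longer receives pt_regs from its caller; it fetches the current
 * task's register frame itself, at the point where it is needed.
 */
static int load_example_binary(struct linux_binprm *bprm)
{
	struct pt_regs *regs = current_pt_regs();	/* was a parameter */
	int retval;

	retval = flush_old_exec(bprm);
	if (retval)
		return retval;

	/* ... map segments and set up the new stack ... */

	start_thread(regs, 0 /* entry point */, bprm->p);
	return 0;
}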
diff --git a/fs/bio.c b/fs/bio.c
index 9298c65ad9c7..b96fc6ce4855 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -75,6 +75,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 	unsigned int sz = sizeof(struct bio) + extra_size;
 	struct kmem_cache *slab = NULL;
 	struct bio_slab *bslab, *new_bio_slabs;
+	unsigned int new_bio_slab_max;
 	unsigned int i, entry = -1;
 
 	mutex_lock(&bio_slab_lock);
@@ -97,12 +98,13 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 		goto out_unlock;
 
 	if (bio_slab_nr == bio_slab_max && entry == -1) {
-		bio_slab_max <<= 1;
+		new_bio_slab_max = bio_slab_max << 1;
 		new_bio_slabs = krealloc(bio_slabs,
-					 bio_slab_max * sizeof(struct bio_slab),
+					 new_bio_slab_max * sizeof(struct bio_slab),
 					 GFP_KERNEL);
 		if (!new_bio_slabs)
 			goto out_unlock;
+		bio_slab_max = new_bio_slab_max;
 		bio_slabs = new_bio_slabs;
 	}
 	if (entry == -1)
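The fs/bio.c hunk is a failure-path fix: previously bio_slab_max was doubled before calling krealloc(), so a failed reallocation left the recorded capacity out of step with the actual array. The new code computes the size in a temporary and commits it only on success. A generic sketch of that grow-then-commit pattern (hypothetical helper, not from the patch):

/*
 * Sketch: compute the new capacity in a local, commit both the pointer
 * and the size only after krealloc() succeeds, so a failure leaves the
 * caller's array and its recorded capacity consistent.
 */
static int grow_slab_array(struct bio_slab **slabs, unsigned int *max)
{
	unsigned int new_max = *max << 1;
	struct bio_slab *new_slabs;

	new_slabs = krealloc(*slabs, new_max * sizeof(**slabs), GFP_KERNEL);
	if (!new_slabs)
		return -ENOMEM;		/* *slabs and *max still valid */
	*max = new_max;
	*slabs = new_slabs;
	return 0;
}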
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1a1e5e3b1eaf..ab3a456f6650 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode,
 	spin_unlock(&dst->wb.list_lock);
 }
 
-sector_t blkdev_max_block(struct block_device *bdev)
-{
-	sector_t retval = ~((sector_t)0);
-	loff_t sz = i_size_read(bdev->bd_inode);
-
-	if (sz) {
-		unsigned int size = block_size(bdev);
-		unsigned int sizebits = blksize_bits(size);
-		retval = (sz >> sizebits);
-	}
-	return retval;
-}
-
 /* Kill _all_ buffers and pagecache , dirty or not.. */
 void kill_bdev(struct block_device *bdev)
 {
@@ -116,8 +103,6 @@ EXPORT_SYMBOL(invalidate_bdev);
 
 int set_blocksize(struct block_device *bdev, int size)
 {
-	struct address_space *mapping;
-
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
 	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
 		return -EINVAL;
@@ -126,19 +111,6 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (size < bdev_logical_block_size(bdev))
 		return -EINVAL;
 
-	/* Prevent starting I/O or mapping the device */
-	percpu_down_write(&bdev->bd_block_size_semaphore);
-
-	/* Check that the block device is not memory mapped */
-	mapping = bdev->bd_inode->i_mapping;
-	mutex_lock(&mapping->i_mmap_mutex);
-	if (mapping_mapped(mapping)) {
-		mutex_unlock(&mapping->i_mmap_mutex);
-		percpu_up_write(&bdev->bd_block_size_semaphore);
-		return -EBUSY;
-	}
-	mutex_unlock(&mapping->i_mmap_mutex);
-
 	/* Don't change the size if it is same as current */
 	if (bdev->bd_block_size != size) {
 		sync_blockdev(bdev);
@@ -146,9 +118,6 @@ int set_blocksize(struct block_device *bdev, int size)
 		bdev->bd_inode->i_blkbits = blksize_bits(size);
 		kill_bdev(bdev);
 	}
-
-	percpu_up_write(&bdev->bd_block_size_semaphore);
-
 	return 0;
 }
 
@@ -181,52 +150,12 @@ static int
 blkdev_get_block(struct inode *inode, sector_t iblock,
 		struct buffer_head *bh, int create)
 {
-	if (iblock >= blkdev_max_block(I_BDEV(inode))) {
-		if (create)
-			return -EIO;
-
-		/*
-		 * for reads, we're just trying to fill a partial page.
-		 * return a hole, they will have to call get_block again
-		 * before they can fill it, and they will get -EIO at that
-		 * time
-		 */
-		return 0;
-	}
 	bh->b_bdev = I_BDEV(inode);
 	bh->b_blocknr = iblock;
 	set_buffer_mapped(bh);
 	return 0;
 }
 
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
-		struct buffer_head *bh, int create)
-{
-	sector_t end_block = blkdev_max_block(I_BDEV(inode));
-	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
-
-	if ((iblock + max_blocks) > end_block) {
-		max_blocks = end_block - iblock;
-		if ((long)max_blocks <= 0) {
-			if (create)
-				return -EIO;	/* write fully beyond EOF */
-			/*
-			 * It is a read which is fully beyond EOF. We return
-			 * a !buffer_mapped buffer
-			 */
-			max_blocks = 0;
-		}
-	}
-
-	bh->b_bdev = I_BDEV(inode);
-	bh->b_blocknr = iblock;
-	bh->b_size = max_blocks << inode->i_blkbits;
-	if (max_blocks)
-		set_buffer_mapped(bh);
-	return 0;
-}
-
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs)
@@ -235,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct inode *inode = file->f_mapping->host;
 
 	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
-				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
+				    nr_segs, blkdev_get_block, NULL, NULL, 0);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
@@ -459,12 +388,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-
-	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
-		kmem_cache_free(bdev_cachep, ei);
-		return NULL;
-	}
-
 	return &ei->vfs_inode;
 }
 
@@ -473,8 +396,6 @@ static void bdev_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
-	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
-
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -1593,22 +1514,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
-ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blkdev_aio_read);
-
 /*
  * Write data to the block device.  Only intended for the block device itself
  * and the raw driver which basically is a fake block device.
@@ -1620,16 +1525,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	blk_start_plug(&plug);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1638,62 +1539,27 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
 	blk_finish_plug(&plug);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
 
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	int ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_mmap(file, vma);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos,
-				  struct pipe_inode_info *pipe, size_t len,
-				  unsigned int flags)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_splice_read(file, ppos, pipe, len, flags);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe,
-				   struct file *file, loff_t *ppos, size_t len,
-				   unsigned int flags)
+static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			unsigned long nr_segs, loff_t pos)
 {
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+	struct file *file = iocb->ki_filp;
+	struct inode *bd_inode = file->f_mapping->host;
+	loff_t size = i_size_read(bd_inode);
 
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_splice_write(pipe, file, ppos, len, flags);
+	if (pos >= size)
+		return 0;
 
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
+	size -= pos;
+	if (size < INT_MAX)
+		nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
+	return generic_file_aio_read(iocb, iov, nr_segs, pos);
 }
 
-
 /*
  * Try to release a page associated with block device when the system
  * is under memory pressure.
@@ -1724,16 +1590,16 @@ const struct file_operations def_blk_fops = {
 	.llseek		= block_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= blkdev_aio_read,
 	.aio_write	= blkdev_aio_write,
-	.mmap		= blkdev_mmap,
+	.mmap		= generic_file_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
-	.splice_read	= blkdev_splice_read,
-	.splice_write	= blkdev_splice_write,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 };
 
 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
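The fs/block_dev.c changes drop the bd_block_size_semaphore locking around reads, writes, mmap and splice; instead, the new static blkdev_aio_read() bounds each read by the current device size before handing off to generic_file_aio_read(). An illustrative helper (not part of the patch) showing the bound it computes:

/*
 * Illustrative only: a read starting at pos on a block device may cover
 * at most i_size_read(bd_inode) - pos bytes; blkdev_aio_read() above
 * applies this bound and then shortens the iovec with iov_shorten().
 */
static loff_t blkdev_read_limit(struct inode *bd_inode, loff_t pos)
{
	loff_t size = i_size_read(bd_inode);

	return pos >= size ? 0 : size - pos;
}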
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c72ead869507..596617ecd329 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -413,7 +413,7 @@ struct btrfs_root_backup {
 	__le64 bytes_used;
 	__le64 num_devices;
 	/* future */
-	__le64 unsed_64[4];
+	__le64 unused_64[4];
 
 	u8 tree_root_level;
 	u8 chunk_root_level;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7cda51995c1e..22a0439e5a86 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3416,8 +3416,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	num_dirty = root->fs_info->dirty_metadata_bytes;
 
 	if (num_dirty > thresh) {
-		balance_dirty_pages_ratelimited_nr(
-				   root->fs_info->btree_inode->i_mapping, 1);
+		balance_dirty_pages_ratelimited(
+				   root->fs_info->btree_inode->i_mapping);
 	}
 	return;
 }
@@ -3437,8 +3437,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	num_dirty = root->fs_info->dirty_metadata_bytes;
 
 	if (num_dirty > thresh) {
-		balance_dirty_pages_ratelimited_nr(
-				   root->fs_info->btree_inode->i_mapping, 1);
+		balance_dirty_pages_ratelimited(
+				   root->fs_info->btree_inode->i_mapping);
 	}
 	return;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d3e2c17d8d1..06b2635073f3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3888,7 +3888,7 @@ static int flush_space(struct btrfs_root *root,
  * @root - the root we're allocating for
  * @block_rsv - the block_rsv we're allocating for
  * @orig_bytes - the number of bytes we want
- * @flush - wether or not we can flush to make our reservation
+ * @flush - whether or not we can flush to make our reservation
  *
  * This will reserve orgi_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8cbc8d5c7f7..ce9f79216723 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,12 +234,11 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 }
 
 /**
- * unpint_extent_cache - unpin an extent from the cache
+ * unpin_extent_cache - unpin an extent from the cache
  * @tree:	tree to unpin the extent in
  * @start:	logical offset in the file
  * @len:	length of the extent
  * @gen:	generation that this extent has been modified in
- * @prealloc:	if this is set we need to clear the prealloc flag
  *
  * Called after an extent has been written to disk properly.  Set the generation
  * to the generation that actually added the file item to the inode so we know
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9ab1bed88116..a8ee75cb96ee 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1346,8 +1346,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
 		cond_resched();
 
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-						   dirty_pages);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
 		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
 			btrfs_btree_balance_dirty(root, 1);
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8fcf9a59c28d..5b3429ab8ec1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1225,7 +1225,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		}
 
 		defrag_count += ret;
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
 		mutex_unlock(&inode->i_mutex);
 
 		if (newer_than) {
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index dd27a0b46a37..853fc7beedfa 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -76,7 +76,7 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
 
-#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
 				       * has done its due diligence in updating
 				       * the isize. */
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0f5ebb72a5ea..e3c6ee3cc2ba 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4294,7 +4294,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
 		rcu_read_lock();
 		name = rcu_dereference(dev->name);
-		pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
+		pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
 			 "(%s id %llu), size=%u\n", rw,
 			 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
 			 name->str, dev->devid, bio->bi_size);
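The btrfs hunks above track an mm API change: balance_dirty_pages_ratelimited_nr(), which took an explicit count of pages just dirtied, is replaced by balance_dirty_pages_ratelimited(), which relies on the core's own per-task dirty accounting. A hedged sketch of an updated call site (hypothetical function, shown only to illustrate the new signature):

/* Hypothetical caller: dirty some pages, then throttle without a count. */
static void example_after_dirtying(struct address_space *mapping,
				   struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		set_page_dirty(pages[i]);

	balance_dirty_pages_ratelimited(mapping);	/* no nr argument */
}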
diff --git a/fs/buffer.c b/fs/buffer.c
index b5f044283edb..c017a2dfb909 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
-inline void
-init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 {
 	bh->b_end_io = handler;
 	bh->b_private = private;
@@ -555,7 +554,7 @@ void emergency_thaw_all(void)
  */
 int sync_mapping_buffers(struct address_space *mapping)
 {
-	struct address_space *buffer_mapping = mapping->assoc_mapping;
+	struct address_space *buffer_mapping = mapping->private_data;
 
 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 		return 0;
@@ -588,10 +587,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 		struct address_space *buffer_mapping = bh->b_page->mapping;
 
 		mark_buffer_dirty(bh);
-		if (!mapping->assoc_mapping) {
-			mapping->assoc_mapping = buffer_mapping;
+		if (!mapping->private_data) {
+			mapping->private_data = buffer_mapping;
 		} else {
-			BUG_ON(mapping->assoc_mapping != buffer_mapping);
+			BUG_ON(mapping->private_data != buffer_mapping);
 		}
 		if (!bh->b_assoc_map) {
 			spin_lock(&buffer_mapping->private_lock);
@@ -788,7 +787,7 @@ void invalidate_inode_buffers(struct inode *inode)
 	if (inode_has_buffers(inode)) {
 		struct address_space *mapping = &inode->i_data;
 		struct list_head *list = &mapping->private_list;
-		struct address_space *buffer_mapping = mapping->assoc_mapping;
+		struct address_space *buffer_mapping = mapping->private_data;
 
 		spin_lock(&buffer_mapping->private_lock);
 		while (!list_empty(list))
@@ -811,7 +810,7 @@ int remove_inode_buffers(struct inode *inode)
 	if (inode_has_buffers(inode)) {
 		struct address_space *mapping = &inode->i_data;
 		struct list_head *list = &mapping->private_list;
-		struct address_space *buffer_mapping = mapping->assoc_mapping;
+		struct address_space *buffer_mapping = mapping->private_data;
 
 		spin_lock(&buffer_mapping->private_lock);
 		while (!list_empty(list)) {
@@ -850,13 +849,10 @@ try_again:
 		if (!bh)
 			goto no_grow;
 
-		bh->b_bdev = NULL;
 		bh->b_this_page = head;
 		bh->b_blocknr = -1;
 		head = bh;
 
-		bh->b_state = 0;
-		atomic_set(&bh->b_count, 0);
 		bh->b_size = size;
 
 		/* Link the buffer to its page */
@@ -911,6 +907,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 	attach_page_buffers(page, head);
 }
 
+static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
+{
+	sector_t retval = ~((sector_t)0);
+	loff_t sz = i_size_read(bdev->bd_inode);
+
+	if (sz) {
+		unsigned int sizebits = blksize_bits(size);
+		retval = (sz >> sizebits);
+	}
+	return retval;
+}
+
 /*
  * Initialise the state of a blockdev page's buffers.
  */
@@ -921,7 +929,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
 	struct buffer_head *head = page_buffers(page);
 	struct buffer_head *bh = head;
 	int uptodate = PageUptodate(page);
-	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode));
+	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 
 	do {
 		if (!buffer_mapped(bh)) {
@@ -1553,6 +1561,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
 EXPORT_SYMBOL(unmap_underlying_metadata);
 
 /*
+ * Size is a power-of-two in the range 512..PAGE_SIZE,
+ * and the case we care about most is PAGE_SIZE.
+ *
+ * So this *could* possibly be written with those
+ * constraints in mind (relevant mostly if some
+ * architecture has a slow bit-scan instruction)
+ */
+static inline int block_size_bits(unsigned int blocksize)
+{
+	return ilog2(blocksize);
+}
+
+static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+	return page_buffers(page);
+}
+
+/*
  * NOTE! All mapped/uptodate combinations are valid:
  *
  *	Mapped	Uptodate	Meaning
@@ -1589,19 +1619,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	sector_t block;
 	sector_t last_block;
 	struct buffer_head *bh, *head;
-	const unsigned blocksize = 1 << inode->i_blkbits;
+	unsigned int blocksize, bbits;
 	int nr_underway = 0;
 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
 			WRITE_SYNC : WRITE);
 
-	BUG_ON(!PageLocked(page));
-
-	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, blocksize,
-					(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
+	head = create_page_buffers(page, inode,
+					(1 << BH_Dirty)|(1 << BH_Uptodate));
 
 	/*
 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
@@ -1613,9 +1637,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	 * handle that here by just cleaning them.
 	 */
 
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	head = page_buffers(page);
 	bh = head;
+	blocksize = bh->b_size;
+	bbits = block_size_bits(blocksize);
+
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	last_block = (i_size_read(inode) - 1) >> bbits;
 
 	/*
 	 * Get all the dirty buffers mapped to disk addresses and
@@ -1806,12 +1833,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	BUG_ON(from > to);
 
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
 
-	bbits = inode->i_blkbits;
 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
 
 	for(bh = head, block_start = 0; bh != head || !block_start;
@@ -1881,11 +1906,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 	unsigned blocksize;
 	struct buffer_head *bh, *head;
 
-	blocksize = 1 << inode->i_blkbits;
+	bh = head = page_buffers(page);
+	blocksize = bh->b_size;
 
-	for(bh = head = page_buffers(page), block_start = 0;
-	    bh != head || !block_start;
-	    block_start=block_end, bh = bh->b_this_page) {
+	block_start = 0;
+	do {
 		block_end = block_start + blocksize;
 		if (block_end <= from || block_start >= to) {
 			if (!buffer_uptodate(bh))
@@ -1895,7 +1920,10 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 			mark_buffer_dirty(bh);
 		}
 		clear_buffer_new(bh);
-	}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
 
 	/*
 	 * If this is a partial write which happened to make all buffers
@@ -2020,7 +2048,6 @@ EXPORT_SYMBOL(generic_write_end);
 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 					unsigned long from)
 {
-	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end, blocksize;
 	unsigned to;
 	struct buffer_head *bh, *head;
@@ -2029,13 +2056,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 	if (!page_has_buffers(page))
 		return 0;
 
-	blocksize = 1 << inode->i_blkbits;
+	head = page_buffers(page);
+	blocksize = head->b_size;
 	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
 	to = from + to;
 	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
 		return 0;
 
-	head = page_buffers(page);
 	bh = head;
 	block_start = 0;
 	do {
@@ -2068,18 +2095,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 	struct inode *inode = page->mapping->host;
 	sector_t iblock, lblock;
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
-	unsigned int blocksize;
+	unsigned int blocksize, bbits;
 	int nr, i;
 	int fully_mapped = 1;
 
-	BUG_ON(!PageLocked(page));
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
 
-	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
 	bh = head;
 	nr = 0;
 	i = 0;
@@ -2864,6 +2889,55 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 	bio_put(bio);
 }
 
+/*
+ * This allows us to do IO even on the odd last sectors
+ * of a device, even if the bh block size is some multiple
+ * of the physical sector size.
+ *
+ * We'll just truncate the bio to the size of the device,
+ * and clear the end of the buffer head manually.
+ *
+ * Truly out-of-range accesses will turn into actual IO
+ * errors, this only handles the "we need to be able to
+ * do IO at the final sector" case.
+ */
+static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+{
+	sector_t maxsector;
+	unsigned bytes;
+
+	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+	if (!maxsector)
+		return;
+
+	/*
+	 * If the *whole* IO is past the end of the device,
+	 * let it through, and the IO layer will turn it into
+	 * an EIO.
+	 */
+	if (unlikely(bio->bi_sector >= maxsector))
+		return;
+
+	maxsector -= bio->bi_sector;
+	bytes = bio->bi_size;
+	if (likely((bytes >> 9) <= maxsector))
+		return;
+
+	/* Uhhuh. We've got a bh that straddles the device size! */
+	bytes = maxsector << 9;
+
+	/* Truncate the bio.. */
+	bio->bi_size = bytes;
+	bio->bi_io_vec[0].bv_len = bytes;
+
+	/* ..and clear the end of the buffer for reads */
+	if ((rw & RW_MASK) == READ) {
+		void *kaddr = kmap_atomic(bh->b_page);
+		memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
+		kunmap_atomic(kaddr);
+	}
+}
+
 int submit_bh(int rw, struct buffer_head * bh)
 {
 	struct bio *bio;
@@ -2900,6 +2974,9 @@ int submit_bh(int rw, struct buffer_head * bh)
 	bio->bi_end_io = end_bio_bh_io_sync;
 	bio->bi_private = bh;
 
+	/* Take care of bh's that straddle the end of the device */
+	guard_bh_eod(rw, bio, bh);
+
 	bio_get(bio);
 	submit_bio(rw, bio);
 
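The new guard_bh_eod() above lets a buffer head whose block size exceeds the space left on the device still do I/O on the final sectors, by truncating the bio and zeroing the unreadable tail. A worked example of the arithmetic, with assumed numbers:

/*
 * Assumed example: a 4096-byte bh submitted where only 7 512-byte
 * sectors (3584 bytes) remain before the end of the device.
 */
void guard_bh_eod_example(void)
{
	unsigned long long maxsector = 7;	/* sectors left on device */
	unsigned int bytes = 4096;		/* bio->bi_size */

	if ((bytes >> 9) > maxsector)		/* 8 > 7: straddles EOD */
		bytes = maxsector << 9;		/* truncate to 3584 bytes */
	/* for a READ, the remaining 512 bytes of the bh are zeroed */
}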
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2075ddfffa73..21ff76c22a17 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -122,9 +122,17 @@ config CIFS_ACL
 	  Allows fetching CIFS/NTFS ACL from the server.  The DACL blob
 	  is handed over to the application/caller.
 
+config CIFS_DEBUG
+	bool "Enable CIFS debugging routines"
+	default y
+	depends on CIFS
+	help
+	   Enabling this option adds helpful debugging messages to
+	   the cifs code which increases the size of the cifs module.
+	   If unsure, say Y.
 config CIFS_DEBUG2
 	bool "Enable additional CIFS debugging routines"
-	depends on CIFS
+	depends on CIFS_DEBUG
 	help
 	   Enabling this option adds a few more debugging routines
 	   to the cifs code which slightly increases the size of
diff --git a/fs/cifs/README b/fs/cifs/README
index 22ab7b5b8da7..2d5622f60e11 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -480,7 +480,7 @@ A partial list of the supported mount options follows:
 		Unicode on the wire.
  nomapchars     Do not translate any of these seven characters (default).
  nocase         Request case insensitive path name matching (case
-		sensitive is the default if the server suports it).
+		sensitive is the default if the server supports it).
 		(mount option "ignorecase" is identical to "nocase")
  posixpaths     If CIFS Unix extensions are supported, attempt to
 		negotiate posix path name support which allows certain
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index c0c68bb492d7..86e92ef2abc1 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -18,7 +18,6 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  *
 */
-#define CIFS_DEBUG		/* BB temporary */
 
 #ifndef _H_CIFS_DEBUG
 #define _H_CIFS_DEBUG
@@ -37,49 +36,43 @@ void dump_smb(void *, int);
 #define CIFS_RC		0x02
 #define CIFS_TIMER	0x04
 
+extern int cifsFYI;
+extern int cifsERROR;
+
 /*
  *	debug ON
  *	--------
  */
-#ifdef CIFS_DEBUG
+#ifdef CONFIG_CIFS_DEBUG
 
 /* information message: e.g., configuration, major event */
-extern int cifsFYI;
-#define cifsfyi(fmt, arg...) \
+#define cifsfyi(fmt, ...) \
 do { \
 	if (cifsFYI & CIFS_INFO) \
-		printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \
+		printk(KERN_DEBUG "%s: " fmt "\n", \
+		       __FILE__, ##__VA_ARGS__); \
 } while (0)
 
-#define cFYI(set, fmt, arg...) \
+#define cFYI(set, fmt, ...) \
 do { \
 	if (set) \
-		cifsfyi(fmt, ##arg); \
+		cifsfyi(fmt, ##__VA_ARGS__); \
 } while (0)
 
-#define cifswarn(fmt, arg...) \
-	printk(KERN_WARNING fmt "\n", ##arg)
+#define cifswarn(fmt, ...) \
+	printk(KERN_WARNING fmt "\n", ##__VA_ARGS__)
 
-/* debug event message: */
-extern int cifsERROR;
-
-#define cEVENT(fmt, arg...) \
+/* error event message: e.g., i/o error */
+#define cifserror(fmt, ...) \
 do { \
 	if (cifsERROR) \
-		printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \
-} while (0)
-
-/* error event message: e.g., i/o error */
-#define cifserror(fmt, arg...) \
-do { \
-	if (cifsERROR) \
-		printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
+		printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
 } while (0)
 
-#define cERROR(set, fmt, arg...) \
+#define cERROR(set, fmt, ...) \
 do { \
 	if (set) \
-		cifserror(fmt, ##arg); \
+		cifserror(fmt, ##__VA_ARGS__); \
 } while (0)
 
 /*
@@ -87,10 +80,27 @@ do { \
  *	---------
  */
 #else		/* _CIFS_DEBUG */
-#define cERROR(set, fmt, arg...)
-#define cEVENT(fmt, arg...)
-#define cFYI(set, fmt, arg...)
-#define cifserror(fmt, arg...)
+#define cifsfyi(fmt, ...) \
+do { \
+	if (0) \
+		printk(KERN_DEBUG "%s: " fmt "\n", \
+		       __FILE__, ##__VA_ARGS__); \
+} while (0)
+#define cFYI(set, fmt, ...) \
+do { \
+	if (0 && set) \
+		cifsfyi(fmt, ##__VA_ARGS__); \
+} while (0)
+#define cifserror(fmt, ...) \
+do { \
+	if (0) \
+		printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
+} while (0)
+#define cERROR(set, fmt, ...) \
+do { \
+	if (0 && set) \
+		cifserror(fmt, ##__VA_ARGS__); \
+} while (0)
 #endif /* _CIFS_DEBUG */
 
 #endif /* _H_CIFS_DEBUG */
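The rewritten debug-off branch above deliberately keeps each printk inside an "if (0)" rather than defining the macros empty: the dead branch is compiled out, but the format string and arguments remain visible to the compiler, so printf-style type checking still happens with CONFIG_CIFS_DEBUG disabled. A minimal sketch of the same idiom (hypothetical macro name):

/* Arguments stay type-checked even though the branch is dead code. */
#define example_dbg(fmt, ...)						\
do {									\
	if (0)								\
		printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__);		\
} while (0)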
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index fc783e264420..75c1ee699143 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -42,135 +42,27 @@ static const struct cifs_sid sid_authusers = {
42/* group users */ 42/* group users */
43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
44 44
45const struct cred *root_cred; 45static const struct cred *root_cred;
46
47static void
48shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
49 int *nr_del)
50{
51 struct rb_node *node;
52 struct rb_node *tmp;
53 struct cifs_sid_id *psidid;
54
55 node = rb_first(root);
56 while (node) {
57 tmp = node;
58 node = rb_next(tmp);
59 psidid = rb_entry(tmp, struct cifs_sid_id, rbnode);
60 if (nr_to_scan == 0 || *nr_del == nr_to_scan)
61 ++(*nr_rem);
62 else {
63 if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE)
64 && psidid->refcount == 0) {
65 rb_erase(tmp, root);
66 ++(*nr_del);
67 } else
68 ++(*nr_rem);
69 }
70 }
71}
72
73/*
74 * Run idmap cache shrinker.
75 */
76static int
77cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
78{
79 int nr_to_scan = sc->nr_to_scan;
80 int nr_del = 0;
81 int nr_rem = 0;
82 struct rb_root *root;
83
84 root = &uidtree;
85 spin_lock(&siduidlock);
86 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
87 spin_unlock(&siduidlock);
88
89 root = &gidtree;
90 spin_lock(&sidgidlock);
91 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
92 spin_unlock(&sidgidlock);
93
94 root = &siduidtree;
95 spin_lock(&uidsidlock);
96 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
97 spin_unlock(&uidsidlock);
98
99 root = &sidgidtree;
100 spin_lock(&gidsidlock);
101 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
102 spin_unlock(&gidsidlock);
103
104 return nr_rem;
105}
106
107static void
108sid_rb_insert(struct rb_root *root, unsigned long cid,
109 struct cifs_sid_id **psidid, char *typestr)
110{
111 char *strptr;
112 struct rb_node *node = root->rb_node;
113 struct rb_node *parent = NULL;
114 struct rb_node **linkto = &(root->rb_node);
115 struct cifs_sid_id *lsidid;
116
117 while (node) {
118 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
119 parent = node;
120 if (cid > lsidid->id) {
121 linkto = &(node->rb_left);
122 node = node->rb_left;
123 }
124 if (cid < lsidid->id) {
125 linkto = &(node->rb_right);
126 node = node->rb_right;
127 }
128 }
129
130 (*psidid)->id = cid;
131 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
132 (*psidid)->refcount = 0;
133
134 sprintf((*psidid)->sidstr, "%s", typestr);
135 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
136 sprintf(strptr, "%ld", cid);
137
138 clear_bit(SID_ID_PENDING, &(*psidid)->state);
139 clear_bit(SID_ID_MAPPED, &(*psidid)->state);
140
141 rb_link_node(&(*psidid)->rbnode, parent, linkto);
142 rb_insert_color(&(*psidid)->rbnode, root);
143}
144
145static struct cifs_sid_id *
146sid_rb_search(struct rb_root *root, unsigned long cid)
147{
148 struct rb_node *node = root->rb_node;
149 struct cifs_sid_id *lsidid;
150
151 while (node) {
152 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
153 if (cid > lsidid->id)
154 node = node->rb_left;
155 else if (cid < lsidid->id)
156 node = node->rb_right;
157 else /* node found */
158 return lsidid;
159 }
160
161 return NULL;
162}
163
164static struct shrinker cifs_shrinker = {
165 .shrink = cifs_idmap_shrinker,
166 .seeks = DEFAULT_SEEKS,
167};
168 46
169static int 47static int
170cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) 48cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
171{ 49{
172 char *payload; 50 char *payload;
173 51
52 /*
53 * If the payload is less than or equal to the size of a pointer, then
54 * an allocation here is wasteful. Just copy the data directly to the
55 * payload.value union member instead.
56 *
57 * With this however, you must check the datalen before trying to
58 * dereference payload.data!
59 */
60 if (prep->datalen <= sizeof(key->payload)) {
61 key->payload.value = 0;
62 memcpy(&key->payload.value, prep->data, prep->datalen);
63 key->datalen = prep->datalen;
64 return 0;
65 }
174 payload = kmalloc(prep->datalen, GFP_KERNEL); 66 payload = kmalloc(prep->datalen, GFP_KERNEL);
175 if (!payload) 67 if (!payload)
176 return -ENOMEM; 68 return -ENOMEM;
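The new fast path embeds payloads no larger than a pointer directly in the key's payload union and skips the kmalloc() entirely; the destroy side then only frees when the payload was actually heap-allocated. A self-contained userspace analogue of that decision (struct and function names here are illustrative, not the kernel's):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct small_payload {
	union {
		void		*data;	/* heap buffer when len > sizeof(value) */
		uintptr_t	value;	/* inline bytes otherwise */
	};
	size_t len;
};

int payload_set(struct small_payload *p, const void *src, size_t len)
{
	if (len <= sizeof(p->value)) {
		p->value = 0;			/* zero-fill the inline slot */
		memcpy(&p->value, src, len);	/* no allocation needed */
	} else {
		p->data = malloc(len);
		if (!p->data)
			return -1;
		memcpy(p->data, src, len);
	}
	p->len = len;		/* readers must check len before touching data */
	return 0;
}

void payload_destroy(struct small_payload *p)
{
	if (p->len > sizeof(p->value))	/* mirrors cifs_idmap_key_destroy() */
		free(p->data);
}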
@@ -184,10 +76,11 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
184static inline void 76static inline void
185cifs_idmap_key_destroy(struct key *key) 77cifs_idmap_key_destroy(struct key *key)
186{ 78{
187 kfree(key->payload.data); 79 if (key->datalen > sizeof(key->payload))
80 kfree(key->payload.data);
188} 81}
189 82
190struct key_type cifs_idmap_key_type = { 83static struct key_type cifs_idmap_key_type = {
191 .name = "cifs.idmap", 84 .name = "cifs.idmap",
192 .instantiate = cifs_idmap_key_instantiate, 85 .instantiate = cifs_idmap_key_instantiate,
193 .destroy = cifs_idmap_key_destroy, 86 .destroy = cifs_idmap_key_destroy,
@@ -195,214 +88,174 @@ struct key_type cifs_idmap_key_type = {
195 .match = user_match, 88 .match = user_match,
196}; 89};
197 90
198static void 91static char *
199sid_to_str(struct cifs_sid *sidptr, char *sidstr) 92sid_to_key_str(struct cifs_sid *sidptr, unsigned int type)
200{ 93{
201 int i; 94 int i, len;
202 unsigned long saval; 95 unsigned int saval;
203 char *strptr; 96 char *sidstr, *strptr;
97 unsigned long long id_auth_val;
98
99 /* 3 bytes for prefix */
100 sidstr = kmalloc(3 + SID_STRING_BASE_SIZE +
101 (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth),
102 GFP_KERNEL);
103 if (!sidstr)
104 return sidstr;
204 105
205 strptr = sidstr; 106 strptr = sidstr;
107 len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g',
108 sidptr->revision);
109 strptr += len;
110
111 /* The authority field is a single 48-bit number */
112 id_auth_val = (unsigned long long)sidptr->authority[5];
113 id_auth_val |= (unsigned long long)sidptr->authority[4] << 8;
114 id_auth_val |= (unsigned long long)sidptr->authority[3] << 16;
115 id_auth_val |= (unsigned long long)sidptr->authority[2] << 24;
116 id_auth_val |= (unsigned long long)sidptr->authority[1] << 32;
117 id_auth_val |= (unsigned long long)sidptr->authority[0] << 48;
206 118
207 sprintf(strptr, "%s", "S"); 119 /*
208 strptr = sidstr + strlen(sidstr); 120 * MS-DTYP states that if the authority is >= 2^32, then it should be
209 121 * expressed as a hex value.
210 sprintf(strptr, "-%d", sidptr->revision); 122 */
211 strptr = sidstr + strlen(sidstr); 123 if (id_auth_val <= UINT_MAX)
124 len = sprintf(strptr, "-%llu", id_auth_val);
125 else
126 len = sprintf(strptr, "-0x%llx", id_auth_val);
212 127
213 for (i = 0; i < 6; ++i) { 128 strptr += len;
214 if (sidptr->authority[i]) {
215 sprintf(strptr, "-%d", sidptr->authority[i]);
216 strptr = sidstr + strlen(sidstr);
217 }
218 }
219 129
220 for (i = 0; i < sidptr->num_subauth; ++i) { 130 for (i = 0; i < sidptr->num_subauth; ++i) {
221 saval = le32_to_cpu(sidptr->sub_auth[i]); 131 saval = le32_to_cpu(sidptr->sub_auth[i]);
222 sprintf(strptr, "-%ld", saval); 132 len = sprintf(strptr, "-%u", saval);
223 strptr = sidstr + strlen(sidstr); 133 strptr += len;
224 } 134 }
135
136 return sidstr;
225} 137}
226 138
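sid_to_key_str() renders a binary SID as the key description the idmap upcall sees, e.g. "os:S-1-5-32-545". A standalone sketch of the same formatting rules, using host-endian subauthorities for simplicity (unlike the on-the-wire __le32 fields of the packed kernel struct):

#include <stdint.h>
#include <stdio.h>

struct sid {
	uint8_t  revision;
	uint8_t  num_subauth;
	uint8_t  authority[6];
	uint32_t sub_auth[15];
};

static void sid_print(const struct sid *s, char prefix)
{
	unsigned long long auth = 0;
	int i;

	/* the six authority bytes form one big-endian 48-bit number */
	for (i = 0; i < 6; i++)
		auth = (auth << 8) | s->authority[i];

	printf("%cs:S-%u", prefix, (unsigned)s->revision);
	if (auth <= UINT32_MAX)		/* MS-DTYP: decimal below 2^32... */
		printf("-%llu", auth);
	else				/* ...hex at or above it */
		printf("-0x%llx", auth);
	for (i = 0; i < s->num_subauth; i++)
		printf("-%u", (unsigned)s->sub_auth[i]);
	putchar('\n');
}

int main(void)
{
	struct sid users = { .revision = 1, .num_subauth = 2,
			     .authority = {0, 0, 0, 0, 0, 5},
			     .sub_auth = {32, 545} };

	sid_print(&users, 'o');		/* prints "os:S-1-5-32-545" */
	return 0;
}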
227static void 139/*
228id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, 140 * if the two SIDs (roughly equivalent to a UUID for a user or group) are
229 struct cifs_sid_id **psidid, char *typestr) 141 * the same returns zero, if they do not match returns non-zero.
142 */
143static int
144compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
230{ 145{
231 int rc; 146 int i;
232 char *strptr; 147 int num_subauth, num_sat, num_saw;
233 struct rb_node *node = root->rb_node;
234 struct rb_node *parent = NULL;
235 struct rb_node **linkto = &(root->rb_node);
236 struct cifs_sid_id *lsidid;
237
238 while (node) {
239 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
240 parent = node;
241 rc = compare_sids(sidptr, &((lsidid)->sid));
242 if (rc > 0) {
243 linkto = &(node->rb_left);
244 node = node->rb_left;
245 } else if (rc < 0) {
246 linkto = &(node->rb_right);
247 node = node->rb_right;
248 }
249 }
250
251 memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid));
252 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
253 (*psidid)->refcount = 0;
254 148
255 sprintf((*psidid)->sidstr, "%s", typestr); 149 if ((!ctsid) || (!cwsid))
256 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); 150 return 1;
257 sid_to_str(&(*psidid)->sid, strptr);
258 151
259 clear_bit(SID_ID_PENDING, &(*psidid)->state); 152 /* compare the revision */
260 clear_bit(SID_ID_MAPPED, &(*psidid)->state); 153 if (ctsid->revision != cwsid->revision) {
154 if (ctsid->revision > cwsid->revision)
155 return 1;
156 else
157 return -1;
158 }
261 159
262 rb_link_node(&(*psidid)->rbnode, parent, linkto); 160 /* compare all of the six auth values */
263 rb_insert_color(&(*psidid)->rbnode, root); 161 for (i = 0; i < NUM_AUTHS; ++i) {
264} 162 if (ctsid->authority[i] != cwsid->authority[i]) {
163 if (ctsid->authority[i] > cwsid->authority[i])
164 return 1;
165 else
166 return -1;
167 }
168 }
265 169
266static struct cifs_sid_id * 170 /* compare all of the subauth values if any */
267id_rb_search(struct rb_root *root, struct cifs_sid *sidptr) 171 num_sat = ctsid->num_subauth;
268{ 172 num_saw = cwsid->num_subauth;
269 int rc; 173 num_subauth = num_sat < num_saw ? num_sat : num_saw;
270 struct rb_node *node = root->rb_node; 174 if (num_subauth) {
271 struct cifs_sid_id *lsidid; 175 for (i = 0; i < num_subauth; ++i) {
272 176 if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
273 while (node) { 177 if (le32_to_cpu(ctsid->sub_auth[i]) >
274 lsidid = rb_entry(node, struct cifs_sid_id, rbnode); 178 le32_to_cpu(cwsid->sub_auth[i]))
275 rc = compare_sids(sidptr, &((lsidid)->sid)); 179 return 1;
276 if (rc > 0) { 180 else
277 node = node->rb_left; 181 return -1;
278 } else if (rc < 0) { 182 }
279 node = node->rb_right; 183 }
280 } else /* node found */
281 return lsidid;
282 } 184 }
283 185
284 return NULL; 186 return 0; /* sids compare/match */
285} 187}
286 188
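The relocated compare_sids() is a three-way comparator: revision first, then the six authority bytes, then pairwise subauthorities. Note the quirk that a SID that is a strict prefix of another compares equal. A host-order reimplementation suitable for userspace tests (the le32_to_cpu handling of on-the-wire subauthorities is glossed over here):

#include <stdint.h>

struct sid {
	uint8_t  revision;
	uint8_t  num_subauth;
	uint8_t  authority[6];
	uint32_t sub_auth[15];
};

int sid_cmp(const struct sid *a, const struct sid *b)
{
	int i, n;

	if (a->revision != b->revision)
		return a->revision > b->revision ? 1 : -1;
	for (i = 0; i < 6; i++)		/* six authority bytes */
		if (a->authority[i] != b->authority[i])
			return a->authority[i] > b->authority[i] ? 1 : -1;
	n = a->num_subauth < b->num_subauth ? a->num_subauth : b->num_subauth;
	for (i = 0; i < n; i++)
		if (a->sub_auth[i] != b->sub_auth[i])
			return a->sub_auth[i] > b->sub_auth[i] ? 1 : -1;
	return 0;	/* equal, or one SID is a prefix of the other */
}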
287static int 189static void
288sidid_pending_wait(void *unused) 190cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
289{ 191{
290 schedule(); 192 int i;
291 return signal_pending(current) ? -ERESTARTSYS : 0; 193
194 dst->revision = src->revision;
195 dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES);
196 for (i = 0; i < NUM_AUTHS; ++i)
197 dst->authority[i] = src->authority[i];
198 for (i = 0; i < dst->num_subauth; ++i)
199 dst->sub_auth[i] = src->sub_auth[i];
292} 200}
293 201
294static int 202static int
295id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) 203id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
296{ 204{
297 int rc = 0; 205 int rc;
298 struct key *sidkey; 206 struct key *sidkey;
207 struct cifs_sid *ksid;
208 unsigned int ksid_size;
209 char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */
299 const struct cred *saved_cred; 210 const struct cred *saved_cred;
300 struct cifs_sid *lsid;
301 struct cifs_sid_id *psidid, *npsidid;
302 struct rb_root *cidtree;
303 spinlock_t *cidlock;
304
305 if (sidtype == SIDOWNER) {
306 cidlock = &siduidlock;
307 cidtree = &uidtree;
308 } else if (sidtype == SIDGROUP) {
309 cidlock = &sidgidlock;
310 cidtree = &gidtree;
311 } else
312 return -EINVAL;
313 211
314 spin_lock(cidlock); 212 rc = snprintf(desc, sizeof(desc), "%ci:%u",
315 psidid = sid_rb_search(cidtree, cid); 213 sidtype == SIDOWNER ? 'o' : 'g', cid);
316 214 if (rc >= sizeof(desc))
317 if (!psidid) { /* node does not exist, allocate one & attempt adding */ 215 return -EINVAL;
318 spin_unlock(cidlock);
319 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
320 if (!npsidid)
321 return -ENOMEM;
322
323 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
324 if (!npsidid->sidstr) {
325 kfree(npsidid);
326 return -ENOMEM;
327 }
328 216
329 spin_lock(cidlock); 217 rc = 0;
330 psidid = sid_rb_search(cidtree, cid); 218 saved_cred = override_creds(root_cred);
331 if (psidid) { /* node happened to get inserted meanwhile */ 219 sidkey = request_key(&cifs_idmap_key_type, desc, "");
332 ++psidid->refcount; 220 if (IS_ERR(sidkey)) {
333 spin_unlock(cidlock); 221 rc = -EINVAL;
334 kfree(npsidid->sidstr); 222 cFYI(1, "%s: Can't map %cid %u to a SID", __func__,
335 kfree(npsidid); 223 sidtype == SIDOWNER ? 'u' : 'g', cid);
336 } else { 224 goto out_revert_creds;
337 psidid = npsidid; 225 } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) {
338 sid_rb_insert(cidtree, cid, &psidid, 226 rc = -EIO;
339 sidtype == SIDOWNER ? "oi:" : "gi:"); 227 cFYI(1, "%s: Downcall contained malformed key "
340 ++psidid->refcount; 228 "(datalen=%hu)", __func__, sidkey->datalen);
341 spin_unlock(cidlock); 229 goto invalidate_key;
342 }
343 } else {
344 ++psidid->refcount;
345 spin_unlock(cidlock);
346 } 230 }
347 231
348 /* 232 /*
349 * If we are here, it is safe to access psidid and its fields 233 * A sid is usually too large to be embedded in payload.value, but if
350 * since a reference was taken earlier while holding the spinlock. 234 * there are no subauthorities and the host has 8-byte pointers, then
351 * A reference on the node is put without holding the spinlock 235 * it could be.
352 * and it is OK to do so in this case, shrinker will not erase
353 * this node until all references are put and we do not access
354 * any fields of the node after a reference is put .
355 */ 236 */
356 if (test_bit(SID_ID_MAPPED, &psidid->state)) { 237 ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
357 memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); 238 (struct cifs_sid *)&sidkey->payload.value :
358 psidid->time = jiffies; /* update ts for accessing */ 239 (struct cifs_sid *)sidkey->payload.data;
359 goto id_sid_out; 240
360 } 241 ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
361 242 if (ksid_size > sidkey->datalen) {
362 if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { 243 rc = -EIO;
363 rc = -EINVAL; 244 cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, "
364 goto id_sid_out; 245 "ksid_size=%u)", __func__, sidkey->datalen, ksid_size);
246 goto invalidate_key;
365 } 247 }
366 248
367 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { 249 cifs_copy_sid(ssid, ksid);
368 saved_cred = override_creds(root_cred); 250out_key_put:
369 sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); 251 key_put(sidkey);
370 if (IS_ERR(sidkey)) { 252out_revert_creds:
371 rc = -EINVAL; 253 revert_creds(saved_cred);
 372 cFYI(1, "%s: Can't map an id to a SID", __func__);
373 } else {
374 lsid = (struct cifs_sid *)sidkey->payload.data;
375 memcpy(&psidid->sid, lsid,
376 sidkey->datalen < sizeof(struct cifs_sid) ?
377 sidkey->datalen : sizeof(struct cifs_sid));
378 memcpy(ssid, &psidid->sid,
379 sidkey->datalen < sizeof(struct cifs_sid) ?
380 sidkey->datalen : sizeof(struct cifs_sid));
381 set_bit(SID_ID_MAPPED, &psidid->state);
382 key_put(sidkey);
383 kfree(psidid->sidstr);
384 }
385 psidid->time = jiffies; /* update ts for accessing */
386 revert_creds(saved_cred);
387 clear_bit(SID_ID_PENDING, &psidid->state);
388 wake_up_bit(&psidid->state, SID_ID_PENDING);
389 } else {
390 rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
391 sidid_pending_wait, TASK_INTERRUPTIBLE);
392 if (rc) {
393 cFYI(1, "%s: sidid_pending_wait interrupted %d",
394 __func__, rc);
395 --psidid->refcount;
396 return rc;
397 }
398 if (test_bit(SID_ID_MAPPED, &psidid->state))
399 memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
400 else
401 rc = -EINVAL;
402 }
403id_sid_out:
404 --psidid->refcount;
405 return rc; 254 return rc;
255
256invalidate_key:
257 key_invalidate(sidkey);
258 goto out_key_put;
406} 259}
407 260
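Replacing the rbtree cache with a direct request_key() call means each downcall result must be validated before use: the fixed SID header has to be present, and the subauthority array it advertises must fit inside the returned payload. A sketch of that check, with the CIFS_SID_BASE_SIZE value taken from the cifsacl.h change below:

#include <stddef.h>
#include <stdint.h>

#define CIFS_SID_BASE_SIZE 8	/* revision + num_subauth + 6 authority bytes */

int sid_payload_ok(const uint8_t *payload, size_t datalen)
{
	size_t ksid_size;

	if (datalen < CIFS_SID_BASE_SIZE)
		return 0;	/* too short to trust even num_subauth */
	/* byte 1 is num_subauth in the packed struct cifs_sid layout */
	ksid_size = CIFS_SID_BASE_SIZE + payload[1] * sizeof(uint32_t);
	return ksid_size <= datalen;	/* advertised subauths must fit */
}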
408static int 261static int
@@ -410,111 +263,67 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
410 struct cifs_fattr *fattr, uint sidtype) 263 struct cifs_fattr *fattr, uint sidtype)
411{ 264{
412 int rc; 265 int rc;
413 unsigned long cid; 266 struct key *sidkey;
414 struct key *idkey; 267 char *sidstr;
415 const struct cred *saved_cred; 268 const struct cred *saved_cred;
416 struct cifs_sid_id *psidid, *npsidid; 269 uid_t fuid = cifs_sb->mnt_uid;
417 struct rb_root *cidtree; 270 gid_t fgid = cifs_sb->mnt_gid;
418 spinlock_t *cidlock;
419
420 if (sidtype == SIDOWNER) {
421 cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */
422 cidlock = &siduidlock;
423 cidtree = &uidtree;
424 } else if (sidtype == SIDGROUP) {
425 cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */
426 cidlock = &sidgidlock;
427 cidtree = &gidtree;
428 } else
429 return -ENOENT;
430
431 spin_lock(cidlock);
432 psidid = id_rb_search(cidtree, psid);
433
434 if (!psidid) { /* node does not exist, allocate one & attempt adding */
435 spin_unlock(cidlock);
436 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
437 if (!npsidid)
438 return -ENOMEM;
439
440 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
441 if (!npsidid->sidstr) {
442 kfree(npsidid);
443 return -ENOMEM;
444 }
445
446 spin_lock(cidlock);
447 psidid = id_rb_search(cidtree, psid);
448 if (psidid) { /* node happened to get inserted meanwhile */
449 ++psidid->refcount;
450 spin_unlock(cidlock);
451 kfree(npsidid->sidstr);
452 kfree(npsidid);
453 } else {
454 psidid = npsidid;
455 id_rb_insert(cidtree, psid, &psidid,
456 sidtype == SIDOWNER ? "os:" : "gs:");
457 ++psidid->refcount;
458 spin_unlock(cidlock);
459 }
460 } else {
461 ++psidid->refcount;
462 spin_unlock(cidlock);
463 }
464 271
465 /* 272 /*
466 * If we are here, it is safe to access psidid and its fields 273 * If we have too many subauthorities, then something is really wrong.
467 * since a reference was taken earlier while holding the spinlock. 274 * Just return an error.
468 * A reference on the node is put without holding the spinlock
469 * and it is OK to do so in this case, shrinker will not erase
470 * this node until all references are put and we do not access
471 * any fields of the node after a reference is put .
472 */ 275 */
473 if (test_bit(SID_ID_MAPPED, &psidid->state)) { 276 if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) {
474 cid = psidid->id; 277 cFYI(1, "%s: %u subauthorities is too many!", __func__,
475 psidid->time = jiffies; /* update ts for accessing */ 278 psid->num_subauth);
476 goto sid_to_id_out; 279 return -EIO;
477 } 280 }
478 281
479 if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) 282 sidstr = sid_to_key_str(psid, sidtype);
480 goto sid_to_id_out; 283 if (!sidstr)
481 284 return -ENOMEM;
482 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { 285
483 saved_cred = override_creds(root_cred); 286 saved_cred = override_creds(root_cred);
484 idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); 287 sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
485 if (IS_ERR(idkey)) 288 if (IS_ERR(sidkey)) {
486 cFYI(1, "%s: Can't map SID to an id", __func__); 289 rc = -EINVAL;
487 else { 290 cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr,
488 cid = *(unsigned long *)idkey->payload.value; 291 sidtype == SIDOWNER ? 'u' : 'g');
489 psidid->id = cid; 292 goto out_revert_creds;
490 set_bit(SID_ID_MAPPED, &psidid->state); 293 }
491 key_put(idkey); 294
492 kfree(psidid->sidstr); 295 /*
493 } 296 * FIXME: Here we assume that uid_t and gid_t are same size. It's
494 revert_creds(saved_cred); 297 * probably a safe assumption but might be better to check based on
495 psidid->time = jiffies; /* update ts for accessing */ 298 * sidtype.
496 clear_bit(SID_ID_PENDING, &psidid->state); 299 */
497 wake_up_bit(&psidid->state, SID_ID_PENDING); 300 if (sidkey->datalen != sizeof(uid_t)) {
498 } else { 301 rc = -EIO;
499 rc = wait_on_bit(&psidid->state, SID_ID_PENDING, 302 cFYI(1, "%s: Downcall contained malformed key "
500 sidid_pending_wait, TASK_INTERRUPTIBLE); 303 "(datalen=%hu)", __func__, sidkey->datalen);
501 if (rc) { 304 key_invalidate(sidkey);
502 cFYI(1, "%s: sidid_pending_wait interrupted %d", 305 goto out_key_put;
503 __func__, rc);
504 --psidid->refcount; /* decremented without spinlock */
505 return rc;
506 }
507 if (test_bit(SID_ID_MAPPED, &psidid->state))
508 cid = psidid->id;
509 } 306 }
510 307
511sid_to_id_out:
512 --psidid->refcount; /* decremented without spinlock */
513 if (sidtype == SIDOWNER) 308 if (sidtype == SIDOWNER)
514 fattr->cf_uid = cid; 309 memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t));
515 else 310 else
516 fattr->cf_gid = cid; 311 memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t));
312
313out_key_put:
314 key_put(sidkey);
315out_revert_creds:
316 revert_creds(saved_cred);
317 kfree(sidstr);
517 318
319 /*
320 * Note that we return 0 here unconditionally. If the mapping
321 * fails then we just fall back to using the mnt_uid/mnt_gid.
322 */
323 if (sidtype == SIDOWNER)
324 fattr->cf_uid = fuid;
325 else
326 fattr->cf_gid = fgid;
518 return 0; 327 return 0;
519} 328}
520 329
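With the cache gone, sid_to_id() is a straight upcall through the "cifs.idmap" key type, falling back to mnt_uid/mnt_gid when the mapping fails (hence the unconditional return 0), and the payload is expected to be a host-endian uid_t/gid_t per the sizeof(uid_t) check. Roughly the same request can be issued from userspace with libkeyutils (link with -lkeyutils), assuming a cifs.idmap handler is configured in request-key.conf; the SID string below is just an example:

#include <keyutils.h>
#include <stdio.h>
#include <sys/types.h>

int main(void)
{
	key_serial_t key;
	uid_t uid;

	key = request_key("cifs.idmap", "os:S-1-5-21-1-2-3-1000", NULL,
			  KEY_SPEC_SESSION_KEYRING);
	if (key < 0) {
		perror("request_key");	/* the kernel falls back to mnt_uid here */
		return 1;
	}
	if (keyctl_read(key, (char *)&uid, sizeof(uid)) == sizeof(uid))
		printf("mapped to uid %u\n", (unsigned)uid);
	return 0;
}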
@@ -561,17 +370,6 @@ init_cifs_idmap(void)
561 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 370 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
562 root_cred = cred; 371 root_cred = cred;
563 372
564 spin_lock_init(&siduidlock);
565 uidtree = RB_ROOT;
566 spin_lock_init(&sidgidlock);
567 gidtree = RB_ROOT;
568
569 spin_lock_init(&uidsidlock);
570 siduidtree = RB_ROOT;
571 spin_lock_init(&gidsidlock);
572 sidgidtree = RB_ROOT;
573 register_shrinker(&cifs_shrinker);
574
575 cFYI(1, "cifs idmap keyring: %d", key_serial(keyring)); 373 cFYI(1, "cifs idmap keyring: %d", key_serial(keyring));
576 return 0; 374 return 0;
577 375
@@ -588,95 +386,13 @@ exit_cifs_idmap(void)
588 key_revoke(root_cred->thread_keyring); 386 key_revoke(root_cred->thread_keyring);
589 unregister_key_type(&cifs_idmap_key_type); 387 unregister_key_type(&cifs_idmap_key_type);
590 put_cred(root_cred); 388 put_cred(root_cred);
591 unregister_shrinker(&cifs_shrinker);
592 cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name); 389 cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name);
593} 390}
594 391
595void
596cifs_destroy_idmaptrees(void)
597{
598 struct rb_root *root;
599 struct rb_node *node;
600
601 root = &uidtree;
602 spin_lock(&siduidlock);
603 while ((node = rb_first(root)))
604 rb_erase(node, root);
605 spin_unlock(&siduidlock);
606
607 root = &gidtree;
608 spin_lock(&sidgidlock);
609 while ((node = rb_first(root)))
610 rb_erase(node, root);
611 spin_unlock(&sidgidlock);
612
613 root = &siduidtree;
614 spin_lock(&uidsidlock);
615 while ((node = rb_first(root)))
616 rb_erase(node, root);
617 spin_unlock(&uidsidlock);
618
619 root = &sidgidtree;
620 spin_lock(&gidsidlock);
621 while ((node = rb_first(root)))
622 rb_erase(node, root);
623 spin_unlock(&gidsidlock);
624}
625
626/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
627 the same returns 1, if they do not match returns 0 */
628int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
629{
630 int i;
631 int num_subauth, num_sat, num_saw;
632
633 if ((!ctsid) || (!cwsid))
634 return 1;
635
636 /* compare the revision */
637 if (ctsid->revision != cwsid->revision) {
638 if (ctsid->revision > cwsid->revision)
639 return 1;
640 else
641 return -1;
642 }
643
644 /* compare all of the six auth values */
645 for (i = 0; i < 6; ++i) {
646 if (ctsid->authority[i] != cwsid->authority[i]) {
647 if (ctsid->authority[i] > cwsid->authority[i])
648 return 1;
649 else
650 return -1;
651 }
652 }
653
654 /* compare all of the subauth values if any */
655 num_sat = ctsid->num_subauth;
656 num_saw = cwsid->num_subauth;
657 num_subauth = num_sat < num_saw ? num_sat : num_saw;
658 if (num_subauth) {
659 for (i = 0; i < num_subauth; ++i) {
660 if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
661 if (le32_to_cpu(ctsid->sub_auth[i]) >
662 le32_to_cpu(cwsid->sub_auth[i]))
663 return 1;
664 else
665 return -1;
666 }
667 }
668 }
669
670 return 0; /* sids compare/match */
671}
672
673
674/* copy ntsd, owner sid, and group sid from a security descriptor to another */ 392/* copy ntsd, owner sid, and group sid from a security descriptor to another */
675static void copy_sec_desc(const struct cifs_ntsd *pntsd, 393static void copy_sec_desc(const struct cifs_ntsd *pntsd,
676 struct cifs_ntsd *pnntsd, __u32 sidsoffset) 394 struct cifs_ntsd *pnntsd, __u32 sidsoffset)
677{ 395{
678 int i;
679
680 struct cifs_sid *owner_sid_ptr, *group_sid_ptr; 396 struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
681 struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; 397 struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
682 398
@@ -692,26 +408,14 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd,
692 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + 408 owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
693 le32_to_cpu(pntsd->osidoffset)); 409 le32_to_cpu(pntsd->osidoffset));
694 nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); 410 nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
695 411 cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr);
696 nowner_sid_ptr->revision = owner_sid_ptr->revision;
697 nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth;
698 for (i = 0; i < 6; i++)
699 nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i];
700 for (i = 0; i < 5; i++)
701 nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i];
702 412
703 /* copy group sid */ 413 /* copy group sid */
704 group_sid_ptr = (struct cifs_sid *)((char *)pntsd + 414 group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
705 le32_to_cpu(pntsd->gsidoffset)); 415 le32_to_cpu(pntsd->gsidoffset));
706 ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + 416 ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
707 sizeof(struct cifs_sid)); 417 sizeof(struct cifs_sid));
708 418 cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr);
709 ngroup_sid_ptr->revision = group_sid_ptr->revision;
710 ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth;
711 for (i = 0; i < 6; i++)
712 ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
713 for (i = 0; i < 5; i++)
714 ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i];
715 419
716 return; 420 return;
717} 421}
@@ -818,7 +522,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
818 522
819 pntace->sid.revision = psid->revision; 523 pntace->sid.revision = psid->revision;
820 pntace->sid.num_subauth = psid->num_subauth; 524 pntace->sid.num_subauth = psid->num_subauth;
821 for (i = 0; i < 6; i++) 525 for (i = 0; i < NUM_AUTHS; i++)
822 pntace->sid.authority[i] = psid->authority[i]; 526 pntace->sid.authority[i] = psid->authority[i];
823 for (i = 0; i < psid->num_subauth; i++) 527 for (i = 0; i < psid->num_subauth; i++)
824 pntace->sid.sub_auth[i] = psid->sub_auth[i]; 528 pntace->sid.sub_auth[i] = psid->sub_auth[i];
@@ -994,8 +698,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
994 return -EINVAL; 698 return -EINVAL;
995 } 699 }
996 700
997 if (psid->num_subauth) {
998#ifdef CONFIG_CIFS_DEBUG2 701#ifdef CONFIG_CIFS_DEBUG2
702 if (psid->num_subauth) {
999 int i; 703 int i;
1000 cFYI(1, "SID revision %d num_auth %d", 704 cFYI(1, "SID revision %d num_auth %d",
1001 psid->revision, psid->num_subauth); 705 psid->revision, psid->num_subauth);
@@ -1009,8 +713,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
1009 num auths and therefore go off the end */ 713 num auths and therefore go off the end */
1010 cFYI(1, "RID 0x%x", 714 cFYI(1, "RID 0x%x",
1011 le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); 715 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
1012#endif
1013 } 716 }
717#endif
1014 718
1015 return 0; 719 return 0;
1016} 720}
@@ -1120,8 +824,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
1120 kfree(nowner_sid_ptr); 824 kfree(nowner_sid_ptr);
1121 return rc; 825 return rc;
1122 } 826 }
1123 memcpy(owner_sid_ptr, nowner_sid_ptr, 827 cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr);
1124 sizeof(struct cifs_sid));
1125 kfree(nowner_sid_ptr); 828 kfree(nowner_sid_ptr);
1126 *aclflag = CIFS_ACL_OWNER; 829 *aclflag = CIFS_ACL_OWNER;
1127 } 830 }
@@ -1139,8 +842,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
1139 kfree(ngroup_sid_ptr); 842 kfree(ngroup_sid_ptr);
1140 return rc; 843 return rc;
1141 } 844 }
1142 memcpy(group_sid_ptr, ngroup_sid_ptr, 845 cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr);
1143 sizeof(struct cifs_sid));
1144 kfree(ngroup_sid_ptr); 846 kfree(ngroup_sid_ptr);
1145 *aclflag = CIFS_ACL_GROUP; 847 *aclflag = CIFS_ACL_GROUP;
1146 } 848 }
@@ -1316,42 +1018,39 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
1316 1018
1317 /* Get the security descriptor */ 1019 /* Get the security descriptor */
1318 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); 1020 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
1319
1320 /* Add three ACEs for owner, group, everyone getting rid of
1321 other ACEs as chmod disables ACEs and set the security descriptor */
1322
1323 if (IS_ERR(pntsd)) { 1021 if (IS_ERR(pntsd)) {
1324 rc = PTR_ERR(pntsd); 1022 rc = PTR_ERR(pntsd);
1325 cERROR(1, "%s: error %d getting sec desc", __func__, rc); 1023 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
1326 } else { 1024 goto out;
1327 /* allocate memory for the smb header, 1025 }
1328 set security descriptor request security descriptor
1329 parameters, and security descriptor itself */
1330
1331 secdesclen = secdesclen < DEFSECDESCLEN ?
1332 DEFSECDESCLEN : secdesclen;
1333 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
1334 if (!pnntsd) {
1335 cERROR(1, "Unable to allocate security descriptor");
1336 kfree(pntsd);
1337 return -ENOMEM;
1338 }
1339 1026
1340 rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, 1027 /*
1341 &aclflag); 1028 * Add three ACEs for owner, group, everyone getting rid of other ACEs
1029 * as chmod disables ACEs and set the security descriptor. Allocate
1030 * memory for the smb header, set security descriptor request security
1031 * descriptor parameters, and secuirty descriptor itself
1032 */
1033 secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN);
1034 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
1035 if (!pnntsd) {
1036 cERROR(1, "Unable to allocate security descriptor");
1037 kfree(pntsd);
1038 return -ENOMEM;
1039 }
1342 1040
1343 cFYI(DBG2, "build_sec_desc rc: %d", rc); 1041 rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
1042 &aclflag);
1344 1043
1345 if (!rc) { 1044 cFYI(DBG2, "build_sec_desc rc: %d", rc);
1346 /* Set the security descriptor */
1347 rc = set_cifs_acl(pnntsd, secdesclen, inode,
1348 path, aclflag);
1349 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
1350 }
1351 1045
1352 kfree(pnntsd); 1046 if (!rc) {
1353 kfree(pntsd); 1047 /* Set the security descriptor */
1048 rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
1049 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
1354 } 1050 }
1355 1051
1052 kfree(pnntsd);
1053 kfree(pntsd);
1054out:
1356 return rc; 1055 return rc;
1357} 1056}
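The rewrite flattens the old nested else block into early-exit style: fetch the descriptor, bail out through a label on error, and share the kfree() calls at the bottom. An illustrative skeleton of that cleanup pattern (not the actual CIFS function):

#include <stdlib.h>

int do_op(void)
{
	int rc = 0;
	char *a, *b;

	a = malloc(64);
	if (!a)
		return -1;		/* early return, nothing to undo yet */
	b = malloc(64);
	if (!b) {
		rc = -1;
		goto out_free_a;	/* single cleanup path for "a" */
	}
	/* ... work with a and b ... */
	free(b);
out_free_a:
	free(a);
	return rc;
}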
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 5c902c7ce524..4f3884835267 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -23,11 +23,8 @@
23#define _CIFSACL_H 23#define _CIFSACL_H
24 24
25 25
26#define NUM_AUTHS 6 /* number of authority fields */ 26#define NUM_AUTHS (6) /* number of authority fields */
27#define NUM_SUBAUTHS 5 /* number of sub authority fields */ 27#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
28#define NUM_WK_SIDS 7 /* number of well known sids */
29#define SIDNAMELENGTH 20 /* long enough for the ones we care about */
30#define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */
31 28
32#define READ_BIT 0x4 29#define READ_BIT 0x4
33#define WRITE_BIT 0x2 30#define WRITE_BIT 0x2
@@ -41,12 +38,32 @@
41 38
42#define SIDOWNER 1 39#define SIDOWNER 1
43#define SIDGROUP 2 40#define SIDGROUP 2
44#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */
45 41
46#define SID_ID_MAPPED 0 42/*
47#define SID_ID_PENDING 1 43 * Security Descriptor length containing DACL with 3 ACEs (one each for
48#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */ 44 * owner, group and world).
49#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */ 45 */
46#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \
47 sizeof(struct cifs_acl) + \
48 (sizeof(struct cifs_ace) * 3))
49
50/*
51 * Maximum size of a string representation of a SID:
52 *
53 * The fields are unsigned values in decimal. So:
54 *
55 * u8: max 3 bytes in decimal
56 * u32: max 10 bytes in decimal
57 *
58 * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
59 *
60 * For authority field, max is when all 6 values are non-zero and it must be
61 * represented in hex. So "-0x" + 12 hex digits.
62 *
63 * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
64 */
65#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
66#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
50 67
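For a worst-case SID (15 subauthorities, hex authority), these constants bound the string at SID_STRING_BASE_SIZE + 15 * SID_STRING_SUBAUTH_SIZE = 21 + 165 = 186 bytes, and sid_to_key_str() adds 3 more for the "os:"/"gs:" key prefix. A small snippet checking that arithmetic (SID_STRING_MAX is a name introduced here for illustration):

#define SID_STRING_BASE_SIZE	(2 + 3 + 15 + 1)	/* "S-" + rev + authority + NUL */
#define SID_STRING_SUBAUTH_SIZE	(11)			/* "-" + 10 decimal digits */
#define SID_MAX_SUB_AUTHORITIES	(15)

/* largest buffer sid_to_key_str() ever allocates: 3 + 21 + 165 = 189 */
#define SID_STRING_MAX (3 + SID_STRING_BASE_SIZE + \
			SID_STRING_SUBAUTH_SIZE * SID_MAX_SUB_AUTHORITIES)

_Static_assert(SID_STRING_MAX == 189, "worked example above");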
51struct cifs_ntsd { 68struct cifs_ntsd {
52 __le16 revision; /* revision level */ 69 __le16 revision; /* revision level */
@@ -60,10 +77,13 @@ struct cifs_ntsd {
60struct cifs_sid { 77struct cifs_sid {
61 __u8 revision; /* revision level */ 78 __u8 revision; /* revision level */
62 __u8 num_subauth; 79 __u8 num_subauth;
63 __u8 authority[6]; 80 __u8 authority[NUM_AUTHS];
64 __le32 sub_auth[5]; /* sub_auth[num_subauth] */ 81 __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
65} __attribute__((packed)); 82} __attribute__((packed));
66 83
84/* size of a struct cifs_sid, sans sub_auth array */
85#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
86
67struct cifs_acl { 87struct cifs_acl {
68 __le16 revision; /* revision level */ 88 __le16 revision; /* revision level */
69 __le16 size; 89 __le16 size;
@@ -78,26 +98,4 @@ struct cifs_ace {
78 struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ 98 struct cifs_sid sid; /* ie UUID of user or group who gets these perms */
79} __attribute__((packed)); 99} __attribute__((packed));
80 100
81struct cifs_wksid {
82 struct cifs_sid cifssid;
83 char sidname[SIDNAMELENGTH];
84} __attribute__((packed));
85
86struct cifs_sid_id {
87 unsigned int refcount; /* increment with spinlock, decrement without */
88 unsigned long id;
89 unsigned long time;
90 unsigned long state;
91 char *sidstr;
92 struct rb_node rbnode;
93 struct cifs_sid sid;
94};
95
96#ifdef __KERNEL__
97extern struct key_type cifs_idmap_key_type;
98extern const struct cred *root_cred;
99#endif /* KERNEL */
100
101extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
102
103#endif /* _CIFSACL_H */ 101#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e7931cc55d0c..210f0af83fc4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -64,24 +64,23 @@ unsigned int global_secflags = CIFSSEC_DEF;
64unsigned int sign_CIFS_PDUs = 1; 64unsigned int sign_CIFS_PDUs = 1;
65static const struct super_operations cifs_super_ops; 65static const struct super_operations cifs_super_ops;
66unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 66unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
67module_param(CIFSMaxBufSize, int, 0); 67module_param(CIFSMaxBufSize, uint, 0);
68MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " 68MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
69 "Default: 16384 Range: 8192 to 130048"); 69 "Default: 16384 Range: 8192 to 130048");
70unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; 70unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
71module_param(cifs_min_rcv, int, 0); 71module_param(cifs_min_rcv, uint, 0);
72MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " 72MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
73 "1 to 64"); 73 "1 to 64");
74unsigned int cifs_min_small = 30; 74unsigned int cifs_min_small = 30;
75module_param(cifs_min_small, int, 0); 75module_param(cifs_min_small, uint, 0);
76MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " 76MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
77 "Range: 2 to 256"); 77 "Range: 2 to 256");
78unsigned int cifs_max_pending = CIFS_MAX_REQ; 78unsigned int cifs_max_pending = CIFS_MAX_REQ;
79module_param(cifs_max_pending, int, 0444); 79module_param(cifs_max_pending, uint, 0444);
80MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 80MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
81 "Default: 32767 Range: 2 to 32767."); 81 "Default: 32767 Range: 2 to 32767.");
82module_param(enable_oplocks, bool, 0644); 82module_param(enable_oplocks, bool, 0644);
83MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" 83MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
84 "y/Y/1");
85 84
86extern mempool_t *cifs_sm_req_poolp; 85extern mempool_t *cifs_sm_req_poolp;
87extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
@@ -230,6 +229,7 @@ cifs_alloc_inode(struct super_block *sb)
230 cifs_set_oplock_level(cifs_inode, 0); 229 cifs_set_oplock_level(cifs_inode, 0);
231 cifs_inode->delete_pending = false; 230 cifs_inode->delete_pending = false;
232 cifs_inode->invalid_mapping = false; 231 cifs_inode->invalid_mapping = false;
232 cifs_inode->leave_pages_clean = false;
233 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 233 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
234 cifs_inode->server_eof = 0; 234 cifs_inode->server_eof = 0;
235 cifs_inode->uniqueid = 0; 235 cifs_inode->uniqueid = 0;
@@ -540,8 +540,8 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
540 char *s, *p; 540 char *s, *p;
541 char sep; 541 char sep;
542 542
543 full_path = build_path_to_root(vol, cifs_sb, 543 full_path = cifs_build_path_to_root(vol, cifs_sb,
544 cifs_sb_master_tcon(cifs_sb)); 544 cifs_sb_master_tcon(cifs_sb));
545 if (full_path == NULL) 545 if (full_path == NULL)
546 return ERR_PTR(-ENOMEM); 546 return ERR_PTR(-ENOMEM);
547 547
@@ -1205,7 +1205,6 @@ exit_cifs(void)
1205 unregister_filesystem(&cifs_fs_type); 1205 unregister_filesystem(&cifs_fs_type);
1206 cifs_dfs_release_automount_timer(); 1206 cifs_dfs_release_automount_timer();
1207#ifdef CONFIG_CIFS_ACL 1207#ifdef CONFIG_CIFS_ACL
1208 cifs_destroy_idmaptrees();
1209 exit_cifs_idmap(); 1208 exit_cifs_idmap();
1210#endif 1209#endif
1211#ifdef CONFIG_CIFS_UPCALL 1210#ifdef CONFIG_CIFS_UPCALL
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f5af2527fc69..aea1eec64911 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -178,6 +178,7 @@ struct smb_rqst {
178 178
179enum smb_version { 179enum smb_version {
180 Smb_1 = 1, 180 Smb_1 = 1,
181 Smb_20,
181 Smb_21, 182 Smb_21,
182 Smb_30, 183 Smb_30,
183}; 184};
@@ -280,9 +281,6 @@ struct smb_version_operations {
280 /* set attributes */ 281 /* set attributes */
281 int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, 282 int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
282 const unsigned int); 283 const unsigned int);
283 /* build a full path to the root of the mount */
284 char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *,
285 struct cifs_tcon *);
 286 /* check if we can send an echo or not */ 284 /* check if we can send an echo or not */
287 bool (*can_echo)(struct TCP_Server_Info *); 285 bool (*can_echo)(struct TCP_Server_Info *);
288 /* send echo request */ 286 /* send echo request */
@@ -369,6 +367,8 @@ struct smb_version_operations {
369 void (*set_lease_key)(struct inode *, struct cifs_fid *fid); 367 void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
370 /* generate new lease key */ 368 /* generate new lease key */
371 void (*new_lease_key)(struct cifs_fid *fid); 369 void (*new_lease_key)(struct cifs_fid *fid);
370 int (*calc_signature)(struct smb_rqst *rqst,
371 struct TCP_Server_Info *server);
372}; 372};
373 373
374struct smb_version_values { 374struct smb_version_values {
@@ -396,7 +396,6 @@ struct smb_vol {
396 char *password; 396 char *password;
397 char *domainname; 397 char *domainname;
398 char *UNC; 398 char *UNC;
399 char *UNCip;
400 char *iocharset; /* local code page for mapping to and from Unicode */ 399 char *iocharset; /* local code page for mapping to and from Unicode */
401 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ 400 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
402 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ 401 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
@@ -444,11 +443,11 @@ struct smb_vol {
444 unsigned int rsize; 443 unsigned int rsize;
445 unsigned int wsize; 444 unsigned int wsize;
446 bool sockopt_tcp_nodelay:1; 445 bool sockopt_tcp_nodelay:1;
447 unsigned short int port;
448 unsigned long actimeo; /* attribute cache timeout (jiffies) */ 446 unsigned long actimeo; /* attribute cache timeout (jiffies) */
449 struct smb_version_operations *ops; 447 struct smb_version_operations *ops;
450 struct smb_version_values *vals; 448 struct smb_version_values *vals;
451 char *prepath; 449 char *prepath;
450 struct sockaddr_storage dstaddr; /* destination address */
452 struct sockaddr_storage srcaddr; /* allow binding to a local IP */ 451 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
453 struct nls_table *local_nls; 452 struct nls_table *local_nls;
454}; 453};
@@ -1031,6 +1030,7 @@ struct cifsInodeInfo {
1031 bool clientCanCacheAll; /* read and writebehind oplock */ 1030 bool clientCanCacheAll; /* read and writebehind oplock */
1032 bool delete_pending; /* DELETE_ON_CLOSE is set */ 1031 bool delete_pending; /* DELETE_ON_CLOSE is set */
1033 bool invalid_mapping; /* pagecache is invalid */ 1032 bool invalid_mapping; /* pagecache is invalid */
1033 bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */
1034 unsigned long time; /* jiffies of last update of inode */ 1034 unsigned long time; /* jiffies of last update of inode */
1035 u64 server_eof; /* current file size on server -- protected by i_lock */ 1035 u64 server_eof; /* current file size on server -- protected by i_lock */
1036 u64 uniqueid; /* server inode number */ 1036 u64 uniqueid; /* server inode number */
@@ -1067,30 +1067,16 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
1067static inline void 1067static inline void
1068convert_delimiter(char *path, char delim) 1068convert_delimiter(char *path, char delim)
1069{ 1069{
1070 int i; 1070 char old_delim, *pos;
1071 char old_delim;
1072
1073 if (path == NULL)
1074 return;
1075 1071
1076 if (delim == '/') 1072 if (delim == '/')
1077 old_delim = '\\'; 1073 old_delim = '\\';
1078 else 1074 else
1079 old_delim = '/'; 1075 old_delim = '/';
1080 1076
1081 for (i = 0; path[i] != '\0'; i++) { 1077 pos = path;
1082 if (path[i] == old_delim) 1078 while ((pos = strchr(pos, old_delim)))
1083 path[i] = delim; 1079 *pos = delim;
1084 }
1085}
1086
1087static inline char *
1088build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
1089 struct cifs_tcon *tcon)
1090{
1091 if (!vol->ops->build_path_to_root)
1092 return NULL;
1093 return vol->ops->build_path_to_root(vol, cifs_sb, tcon);
1094} 1080}
1095 1081
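The strchr()-based rewrite is behavior-preserving apart from dropping the old NULL-path check, so callers must now pass a valid string. A standalone copy showing it in use:

#include <stdio.h>
#include <string.h>

static void convert_delimiter(char *path, char delim)
{
	char old_delim, *pos;

	old_delim = (delim == '/') ? '\\' : '/';
	pos = path;
	while ((pos = strchr(pos, old_delim)))	/* jump delimiter to delimiter */
		*pos = delim;
}

int main(void)
{
	char unc[] = "//server/share/dir";

	convert_delimiter(unc, '\\');
	printf("%s\n", unc);	/* prints \\server\share\dir */
	return 0;
}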
1096#ifdef CONFIG_CIFS_STATS 1082#ifdef CONFIG_CIFS_STATS
@@ -1362,7 +1348,7 @@ require use of the stronger protocol */
1362#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ 1348#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
1363#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ 1349#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
1364 1350
1365#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) 1351#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)
1366#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) 1352#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
1367#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) 1353#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
1368/* 1354/*
@@ -1506,6 +1492,6 @@ extern struct smb_version_values smb20_values;
1506extern struct smb_version_operations smb21_operations; 1492extern struct smb_version_operations smb21_operations;
1507extern struct smb_version_values smb21_values; 1493extern struct smb_version_values smb21_values;
1508#define SMB30_VERSION_STRING "3.0" 1494#define SMB30_VERSION_STRING "3.0"
1509/*extern struct smb_version_operations smb30_operations; */ /* not needed yet */ 1495extern struct smb_version_operations smb30_operations;
1510extern struct smb_version_values smb30_values; 1496extern struct smb_version_values smb30_values;
1511#endif /* _CIFS_GLOB_H */ 1497#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5144e9fbeb8c..1988c1baa224 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -58,8 +58,10 @@ do { \
58} while (0) 58} while (0)
59extern int init_cifs_idmap(void); 59extern int init_cifs_idmap(void);
60extern void exit_cifs_idmap(void); 60extern void exit_cifs_idmap(void);
61extern void cifs_destroy_idmaptrees(void);
62extern char *build_path_from_dentry(struct dentry *); 61extern char *build_path_from_dentry(struct dentry *);
62extern char *cifs_build_path_to_root(struct smb_vol *vol,
63 struct cifs_sb_info *cifs_sb,
64 struct cifs_tcon *tcon);
63extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 65extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
64extern char *cifs_compose_mount_options(const char *sb_mountdata, 66extern char *cifs_compose_mount_options(const char *sb_mountdata,
65 const char *fullpath, const struct dfs_info3_param *ref, 67 const char *fullpath, const struct dfs_info3_param *ref,
@@ -107,9 +109,7 @@ extern unsigned int smbCalcSize(void *buf);
107extern int decode_negTokenInit(unsigned char *security_blob, int length, 109extern int decode_negTokenInit(unsigned char *security_blob, int length,
108 struct TCP_Server_Info *server); 110 struct TCP_Server_Info *server);
109extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); 111extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
110extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); 112extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port);
111extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
112 const unsigned short int port);
113extern int map_smb_to_linux_error(char *buf, bool logErr); 113extern int map_smb_to_linux_error(char *buf, bool logErr);
114extern void header_assemble(struct smb_hdr *, char /* command */ , 114extern void header_assemble(struct smb_hdr *, char /* command */ ,
115 const struct cifs_tcon *, int /* length of 115 const struct cifs_tcon *, int /* length of
@@ -185,7 +185,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
185extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, 185extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
186 __u64 length, __u8 type, 186 __u64 length, __u8 type,
187 struct cifsLockInfo **conf_lock, 187 struct cifsLockInfo **conf_lock,
188 bool rw_check); 188 int rw_check);
189extern void cifs_add_pending_open(struct cifs_fid *fid, 189extern void cifs_add_pending_open(struct cifs_fid *fid,
190 struct tcon_link *tlink, 190 struct tcon_link *tlink,
191 struct cifs_pending_open *open); 191 struct cifs_pending_open *open);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5c670b998ffb..7635b5db26a7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -186,6 +186,7 @@ static const match_table_t cifs_mount_option_tokens = {
186 { Opt_user, "user=%s" }, 186 { Opt_user, "user=%s" },
187 { Opt_user, "username=%s" }, 187 { Opt_user, "username=%s" },
188 { Opt_blank_pass, "pass=" }, 188 { Opt_blank_pass, "pass=" },
189 { Opt_blank_pass, "password=" },
189 { Opt_pass, "pass=%s" }, 190 { Opt_pass, "pass=%s" },
190 { Opt_pass, "password=%s" }, 191 { Opt_pass, "password=%s" },
191 { Opt_blank_ip, "ip=" }, 192 { Opt_blank_ip, "ip=" },
@@ -274,6 +275,7 @@ static const match_table_t cifs_cacheflavor_tokens = {
274 275
275static const match_table_t cifs_smb_version_tokens = { 276static const match_table_t cifs_smb_version_tokens = {
276 { Smb_1, SMB1_VERSION_STRING }, 277 { Smb_1, SMB1_VERSION_STRING },
278 { Smb_20, SMB20_VERSION_STRING},
277 { Smb_21, SMB21_VERSION_STRING }, 279 { Smb_21, SMB21_VERSION_STRING },
278 { Smb_30, SMB30_VERSION_STRING }, 280 { Smb_30, SMB30_VERSION_STRING },
279}; 281};
@@ -1074,12 +1076,16 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
1074 vol->vals = &smb1_values; 1076 vol->vals = &smb1_values;
1075 break; 1077 break;
1076#ifdef CONFIG_CIFS_SMB2 1078#ifdef CONFIG_CIFS_SMB2
1079 case Smb_20:
1080 vol->ops = &smb21_operations; /* currently identical with 2.1 */
1081 vol->vals = &smb20_values;
1082 break;
1077 case Smb_21: 1083 case Smb_21:
1078 vol->ops = &smb21_operations; 1084 vol->ops = &smb21_operations;
1079 vol->vals = &smb21_values; 1085 vol->vals = &smb21_values;
1080 break; 1086 break;
1081 case Smb_30: 1087 case Smb_30:
1082 vol->ops = &smb21_operations; /* currently identical with 2.1 */ 1088 vol->ops = &smb30_operations;
1083 vol->vals = &smb30_values; 1089 vol->vals = &smb30_values;
1084 break; 1090 break;
1085#endif 1091#endif
@@ -1090,6 +1096,52 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
1090 return 0; 1096 return 0;
1091} 1097}
1092 1098
1099/*
1100 * Parse a devname into substrings and populate the vol->UNC and vol->prepath
1101 * fields with the result. Returns 0 on success and an error otherwise.
1102 */
1103static int
1104cifs_parse_devname(const char *devname, struct smb_vol *vol)
1105{
1106 char *pos;
1107 const char *delims = "/\\";
1108 size_t len;
1109
1110 /* make sure we have a valid UNC double delimiter prefix */
1111 len = strspn(devname, delims);
1112 if (len != 2)
1113 return -EINVAL;
1114
1115 /* find delimiter between host and sharename */
1116 pos = strpbrk(devname + 2, delims);
1117 if (!pos)
1118 return -EINVAL;
1119
1120 /* skip past delimiter */
1121 ++pos;
1122
1123 /* now go until next delimiter or end of string */
1124 len = strcspn(pos, delims);
1125
1126 /* move "pos" up to delimiter or NULL */
1127 pos += len;
1128 vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL);
1129 if (!vol->UNC)
1130 return -ENOMEM;
1131
1132 convert_delimiter(vol->UNC, '\\');
1133
1134 /* If pos is NULL, or is a bogus trailing delimiter then no prepath */
1135 if (!*pos++ || !*pos)
1136 return 0;
1137
1138 vol->prepath = kstrdup(pos, GFP_KERNEL);
1139 if (!vol->prepath)
1140 return -ENOMEM;
1141
1142 return 0;
1143}
1144
1093static int 1145static int
1094cifs_parse_mount_options(const char *mountdata, const char *devname, 1146cifs_parse_mount_options(const char *mountdata, const char *devname,
1095 struct smb_vol *vol) 1147 struct smb_vol *vol)
@@ -1108,11 +1160,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1108 char *string = NULL; 1160 char *string = NULL;
1109 char *tmp_end, *value; 1161 char *tmp_end, *value;
1110 char delim; 1162 char delim;
1163 bool got_ip = false;
1164 unsigned short port = 0;
1165 struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr;
1111 1166
1112 separator[0] = ','; 1167 separator[0] = ',';
1113 separator[1] = 0; 1168 separator[1] = 0;
1114 delim = separator[0]; 1169 delim = separator[0];
1115 1170
1171 /* ensure we always start with zeroed-out smb_vol */
1172 memset(vol, 0, sizeof(*vol));
1173
1116 /* 1174 /*
1117 * does not have to be perfect mapping since field is 1175 * does not have to be perfect mapping since field is
1118 * informational, only used for servers that do not support 1176 * informational, only used for servers that do not support
@@ -1169,6 +1227,16 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1169 vol->backupuid_specified = false; /* no backup intent for a user */ 1227 vol->backupuid_specified = false; /* no backup intent for a user */
1170 vol->backupgid_specified = false; /* no backup intent for a group */ 1228 vol->backupgid_specified = false; /* no backup intent for a group */
1171 1229
1230 /*
1231 * For now, we ignore -EINVAL errors under the assumption that the
1232 * unc= and prefixpath= options will be usable.
1233 */
1234 if (cifs_parse_devname(devname, vol) == -ENOMEM) {
1235 printk(KERN_ERR "CIFS: Unable to allocate memory to parse "
1236 "device string.\n");
1237 goto out_nomem;
1238 }
1239
1172 while ((data = strsep(&options, separator)) != NULL) { 1240 while ((data = strsep(&options, separator)) != NULL) {
1173 substring_t args[MAX_OPT_ARGS]; 1241 substring_t args[MAX_OPT_ARGS];
1174 unsigned long option; 1242 unsigned long option;
@@ -1416,12 +1484,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1416 vol->dir_mode = option; 1484 vol->dir_mode = option;
1417 break; 1485 break;
1418 case Opt_port: 1486 case Opt_port:
1419 if (get_option_ul(args, &option)) { 1487 if (get_option_ul(args, &option) ||
1420 cERROR(1, "%s: Invalid port value", 1488 option > USHRT_MAX) {
1421 __func__); 1489 cERROR(1, "%s: Invalid port value", __func__);
1422 goto cifs_parse_mount_err; 1490 goto cifs_parse_mount_err;
1423 } 1491 }
1424 vol->port = option; 1492 port = (unsigned short)option;
1425 break; 1493 break;
1426 case Opt_rsize: 1494 case Opt_rsize:
1427 if (get_option_ul(args, &option)) { 1495 if (get_option_ul(args, &option)) {
@@ -1537,53 +1605,48 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1537 vol->password[j] = '\0'; 1605 vol->password[j] = '\0';
1538 break; 1606 break;
1539 case Opt_blank_ip: 1607 case Opt_blank_ip:
1540 vol->UNCip = NULL; 1608 /* FIXME: should this be an error instead? */
1609 got_ip = false;
1541 break; 1610 break;
1542 case Opt_ip: 1611 case Opt_ip:
1543 string = match_strdup(args); 1612 string = match_strdup(args);
1544 if (string == NULL) 1613 if (string == NULL)
1545 goto out_nomem; 1614 goto out_nomem;
1546 1615
1547 if (strnlen(string, INET6_ADDRSTRLEN) > 1616 if (!cifs_convert_address(dstaddr, string,
1548 INET6_ADDRSTRLEN) { 1617 strlen(string))) {
1549 printk(KERN_WARNING "CIFS: ip address " 1618 printk(KERN_ERR "CIFS: bad ip= option (%s).\n",
1550 "too long\n"); 1619 string);
1551 goto cifs_parse_mount_err;
1552 }
1553 vol->UNCip = kstrdup(string, GFP_KERNEL);
1554 if (!vol->UNCip) {
1555 printk(KERN_WARNING "CIFS: no memory "
1556 "for UNC IP\n");
1557 goto cifs_parse_mount_err; 1620 goto cifs_parse_mount_err;
1558 } 1621 }
1622 got_ip = true;
1559 break; 1623 break;
1560 case Opt_unc: 1624 case Opt_unc:
1561 string = match_strdup(args); 1625 string = vol->UNC;
1562 if (string == NULL) 1626 vol->UNC = match_strdup(args);
1627 if (vol->UNC == NULL) {
1628 kfree(string);
1563 goto out_nomem; 1629 goto out_nomem;
1564
1565 temp_len = strnlen(string, 300);
1566 if (temp_len == 300) {
1567 printk(KERN_WARNING "CIFS: UNC name too long\n");
1568 goto cifs_parse_mount_err;
1569 } 1630 }
1570 1631
1571 vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); 1632 convert_delimiter(vol->UNC, '\\');
1572 if (vol->UNC == NULL) { 1633 if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') {
1573 printk(KERN_WARNING "CIFS: no memory for UNC\n"); 1634 kfree(string);
1574 goto cifs_parse_mount_err; 1635 printk(KERN_ERR "CIFS: UNC Path does not "
1575 } 1636 "begin with // or \\\\\n");
1576 strcpy(vol->UNC, string);
1577
1578 if (strncmp(string, "//", 2) == 0) {
1579 vol->UNC[0] = '\\';
1580 vol->UNC[1] = '\\';
1581 } else if (strncmp(string, "\\\\", 2) != 0) {
1582 printk(KERN_WARNING "CIFS: UNC Path does not "
1583 "begin with // or \\\\\n");
1584 goto cifs_parse_mount_err; 1637 goto cifs_parse_mount_err;
1585 } 1638 }
1586 1639
1640 /* Compare old unc= option to new one */
1641 if (!string || strcmp(string, vol->UNC))
1642 printk(KERN_WARNING "CIFS: the value of the "
1643 "unc= mount option does not match the "
1644 "device string. Using the unc= option "
1645 "for now. In 3.10, that option will "
1646 "be ignored and the contents of the "
1647 "device string will be used "
1648 "instead. (%s != %s)\n", string,
1649 vol->UNC);
1587 break; 1650 break;
1588 case Opt_domain: 1651 case Opt_domain:
1589 string = match_strdup(args); 1652 string = match_strdup(args);
@@ -1618,31 +1681,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1618 } 1681 }
1619 break; 1682 break;
1620 case Opt_prefixpath: 1683 case Opt_prefixpath:
1621 string = match_strdup(args); 1684 /* skip over any leading delimiter */
1622 if (string == NULL) 1685 if (*args[0].from == '/' || *args[0].from == '\\')
1623 goto out_nomem; 1686 args[0].from++;
1624
1625 temp_len = strnlen(string, 1024);
1626 if (string[0] != '/')
1627 temp_len++; /* missing leading slash */
1628 if (temp_len > 1024) {
1629 printk(KERN_WARNING "CIFS: prefix too long\n");
1630 goto cifs_parse_mount_err;
1631 }
1632 1687
1633 vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); 1688 string = vol->prepath;
1689 vol->prepath = match_strdup(args);
1634 if (vol->prepath == NULL) { 1690 if (vol->prepath == NULL) {
1635 printk(KERN_WARNING "CIFS: no memory " 1691 kfree(string);
1636 "for path prefix\n"); 1692 goto out_nomem;
1637 goto cifs_parse_mount_err;
1638 } 1693 }
1639 1694 /* Compare old prefixpath= option to new one */
1640 if (string[0] != '/') { 1695 if (!string || strcmp(string, vol->prepath))
1641 vol->prepath[0] = '/'; 1696 printk(KERN_WARNING "CIFS: the value of the "
1642 strcpy(vol->prepath+1, string); 1697 "prefixpath= mount option does not "
1643 } else 1698 "match the device string. Using the "
1644 strcpy(vol->prepath, string); 1699 "prefixpath= option for now. In 3.10, "
1645 1700 "that option will be ignored and the "
1701 "contents of the device string will be "
1702 "used instead.(%s != %s)\n", string,
1703 vol->prepath);
1646 break; 1704 break;
1647 case Opt_iocharset: 1705 case Opt_iocharset:
1648 string = match_strdup(args); 1706 string = match_strdup(args);
@@ -1799,9 +1857,30 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1799 goto cifs_parse_mount_err; 1857 goto cifs_parse_mount_err;
1800 } 1858 }
1801#endif 1859#endif
1860 if (!vol->UNC) {
1861 cERROR(1, "CIFS mount error: No usable UNC path provided in "
1862 "device string or in unc= option!");
1863 goto cifs_parse_mount_err;
1864 }
1802 1865
1803 if (vol->UNCip == NULL) 1866 /* make sure UNC has a share name */
1804 vol->UNCip = &vol->UNC[2]; 1867 if (!strchr(vol->UNC + 3, '\\')) {
1868 cERROR(1, "Malformed UNC. Unable to find share name.");
1869 goto cifs_parse_mount_err;
1870 }
1871
1872 if (!got_ip) {
1873 /* No ip= option specified? Try to get it from UNC */
1874 if (!cifs_convert_address(dstaddr, &vol->UNC[2],
1875 strlen(&vol->UNC[2]))) {
1876 printk(KERN_ERR "Unable to determine destination "
1877 "address.\n");
1878 goto cifs_parse_mount_err;
1879 }
1880 }
1881
1882 /* set the port that we got earlier */
1883 cifs_set_port(dstaddr, port);
1805 1884
1806 if (uid_specified) 1885 if (uid_specified)
1807 vol->override_uid = override_uid; 1886 vol->override_uid = override_uid;
@@ -1972,9 +2051,10 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
         return true;
 }
 
-static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
-                        struct smb_vol *vol)
+static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
+        struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
+
         if ((server->vals != vol->vals) || (server->ops != vol->ops))
                 return 0;
 
@@ -1995,13 +2075,13 @@ static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
 }
 
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
+cifs_find_tcp_session(struct smb_vol *vol)
 {
         struct TCP_Server_Info *server;
 
         spin_lock(&cifs_tcp_ses_lock);
         list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
-                if (!match_server(server, addr, vol))
+                if (!match_server(server, vol))
                         continue;
 
                 ++server->srv_count;
@@ -2051,40 +2131,12 @@ static struct TCP_Server_Info *
 cifs_get_tcp_session(struct smb_vol *volume_info)
 {
         struct TCP_Server_Info *tcp_ses = NULL;
-        struct sockaddr_storage addr;
-        struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
-        struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
         int rc;
 
-        memset(&addr, 0, sizeof(struct sockaddr_storage));
-
-        cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
-
-        if (volume_info->UNCip && volume_info->UNC) {
-                rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
-                                        volume_info->UNCip,
-                                        strlen(volume_info->UNCip),
-                                        volume_info->port);
-                if (!rc) {
-                        /* we failed translating address */
-                        rc = -EINVAL;
-                        goto out_err;
-                }
-        } else if (volume_info->UNCip) {
-                /* BB using ip addr as tcp_ses name to connect to the
-                   DFS root below */
-                cERROR(1, "Connecting to DFS root not implemented yet");
-                rc = -EINVAL;
-                goto out_err;
-        } else /* which tcp_sess DFS root would we conect to */ {
-                cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
-                          "unc=//192.168.1.100/public) specified");
-                rc = -EINVAL;
-                goto out_err;
-        }
+        cFYI(1, "UNC: %s", volume_info->UNC);
 
         /* see if we already have a matching tcp_ses */
-        tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info);
+        tcp_ses = cifs_find_tcp_session(volume_info);
         if (tcp_ses)
                 return tcp_ses;
 
@@ -2129,27 +2181,18 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
         INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
         INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
         INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
-
+        memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
+               sizeof(tcp_ses->srcaddr));
+        memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
+               sizeof(tcp_ses->dstaddr));
         /*
          * at this point we are the only ones with the pointer
          * to the struct since the kernel thread not created yet
          * no need to spinlock this init of tcpStatus or srv_count
          */
         tcp_ses->tcpStatus = CifsNew;
-        memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
-               sizeof(tcp_ses->srcaddr));
         ++tcp_ses->srv_count;
 
-        if (addr.ss_family == AF_INET6) {
-                cFYI(1, "attempting ipv6 connect");
-                /* BB should we allow ipv6 on port 139? */
-                /* other OS never observed in Wild doing 139 with v6 */
-                memcpy(&tcp_ses->dstaddr, sin_server6,
-                       sizeof(struct sockaddr_in6));
-        } else
-                memcpy(&tcp_ses->dstaddr, sin_server,
-                       sizeof(struct sockaddr_in));
-
         rc = ip_connect(tcp_ses);
         if (rc < 0) {
                 cERROR(1, "Error connecting to socket. Aborting operation");
@@ -2397,8 +2440,6 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
 }
 #endif /* CONFIG_KEYS */
 
-static bool warned_on_ntlm; /* globals init to false automatically */
-
 static struct cifs_ses *
 cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 {
@@ -2475,14 +2516,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
         ses->cred_uid = volume_info->cred_uid;
         ses->linux_uid = volume_info->linux_uid;
 
-        /* ntlmv2 is much stronger than ntlm security, and has been broadly
-           supported for many years, time to update default security mechanism */
-        if ((volume_info->secFlg == 0) && warned_on_ntlm == false) {
-                warned_on_ntlm = true;
-                cERROR(1, "default security mechanism requested. The default "
-                          "security mechanism will be upgraded from ntlm to "
-                          "ntlmv2 in kernel release 3.3");
-        }
         ses->overrideSecFlg = volume_info->secFlg;
 
         mutex_lock(&ses->session_mutex);
@@ -2598,13 +2631,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
                 }
         }
 
-        if (strchr(volume_info->UNC + 3, '\\') == NULL
-            && strchr(volume_info->UNC + 3, '/') == NULL) {
-                cERROR(1, "Missing share name");
-                rc = -ENODEV;
-                goto out_fail;
-        }
-
         /*
          * BB Do we need to wrap session_mutex around this TCon call and Unix
          * SetFS as we do on SessSetup and reconnect?
@@ -2718,11 +2744,8 @@ cifs_match_super(struct super_block *sb, void *data)
         struct cifs_ses *ses;
         struct cifs_tcon *tcon;
         struct tcon_link *tlink;
-        struct sockaddr_storage addr;
         int rc = 0;
 
-        memset(&addr, 0, sizeof(struct sockaddr_storage));
-
         spin_lock(&cifs_tcp_ses_lock);
         cifs_sb = CIFS_SB(sb);
         tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
@@ -2736,17 +2759,7 @@ cifs_match_super(struct super_block *sb, void *data)
 
         volume_info = mnt_data->vol;
 
-        if (!volume_info->UNCip || !volume_info->UNC)
-                goto out;
-
-        rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
-                                volume_info->UNCip,
-                                strlen(volume_info->UNCip),
-                                volume_info->port);
-        if (!rc)
-                goto out;
-
-        if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) ||
+        if (!match_server(tcp_srv, volume_info) ||
             !match_session(ses, volume_info) ||
             !match_tcon(tcon, volume_info->UNC)) {
                 rc = 0;
@@ -3261,8 +3274,6 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
 {
         kfree(volume_info->username);
         kzfree(volume_info->password);
-        if (volume_info->UNCip != volume_info->UNC + 2)
-                kfree(volume_info->UNCip);
         kfree(volume_info->UNC);
         kfree(volume_info->domainname);
         kfree(volume_info->iocharset);
@@ -3280,14 +3291,16 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info)
 
 
 #ifdef CONFIG_CIFS_DFS_UPCALL
-/* build_path_to_root returns full path to root when
- * we do not have an exiting connection (tcon) */
+/*
+ * cifs_build_path_to_root returns full path to root when we do not have an
+ * exiting connection (tcon)
+ */
 static char *
 build_unc_path_to_root(const struct smb_vol *vol,
                        const struct cifs_sb_info *cifs_sb)
 {
         char *full_path, *pos;
-        unsigned int pplen = vol->prepath ? strlen(vol->prepath) : 0;
+        unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
         unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1);
 
         full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
@@ -3298,6 +3311,7 @@ build_unc_path_to_root(const struct smb_vol *vol,
         pos = full_path + unc_len;
 
         if (pplen) {
+                *pos++ = CIFS_DIR_SEP(cifs_sb);
                 strncpy(pos, vol->prepath, pplen);
                 pos += pplen;
         }
@@ -3353,7 +3367,6 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
                 mdata = NULL;
         } else {
                 cleanup_volume_info_contents(volume_info);
-                memset(volume_info, '\0', sizeof(*volume_info));
                 rc = cifs_setup_volume_info(volume_info, mdata,
                                             fake_devname);
         }
@@ -3375,7 +3388,6 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
         if (cifs_parse_mount_options(mount_data, devname, volume_info))
                 return -EINVAL;
 
-
         if (volume_info->nullauth) {
                 cFYI(1, "Anonymous login");
                 kfree(volume_info->username);
@@ -3412,7 +3424,7 @@ cifs_get_volume_info(char *mount_data, const char *devname)
         int rc;
         struct smb_vol *volume_info;
 
-        volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL);
+        volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL);
         if (!volume_info)
                 return ERR_PTR(-ENOMEM);
 
@@ -3537,8 +3549,10 @@ remote_path_check:
                 rc = -ENOSYS;
                 goto mount_fail_check;
         }
-        /* build_path_to_root works only when we have a valid tcon */
-        full_path = build_path_to_root(volume_info, cifs_sb, tcon);
+        /*
+         * cifs_build_path_to_root works only when we have a valid tcon
+         */
+        full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
         if (full_path == NULL) {
                 rc = -ENOMEM;
                 goto mount_fail_check;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 7c0a81283645..8719bbe0dcc3 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -44,6 +44,38 @@ renew_parental_timestamps(struct dentry *direntry)
         } while (!IS_ROOT(direntry));
 }
 
+char *
+cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
+                        struct cifs_tcon *tcon)
+{
+        int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
+        int dfsplen;
+        char *full_path = NULL;
+
+        /* if no prefix path, simply set path to the root of share to "" */
+        if (pplen == 0) {
+                full_path = kzalloc(1, GFP_KERNEL);
+                return full_path;
+        }
+
+        if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+                dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
+        else
+                dfsplen = 0;
+
+        full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
+        if (full_path == NULL)
+                return full_path;
+
+        if (dfsplen)
+                strncpy(full_path, tcon->treeName, dfsplen);
+        full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb);
+        strncpy(full_path + dfsplen + 1, vol->prepath, pplen);
+        convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
+        full_path[dfsplen + pplen] = 0; /* add trailing null */
+        return full_path;
+}
+
 /* Note: caller must free return buffer */
 char *
 build_path_from_dentry(struct dentry *direntry)
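A quick sanity check on the buffer math above, with illustrative values only (assuming CIFS_DIR_SEP() evaluates to '\\' and convert_delimiter() rewrites '/' to it):

/*
 * vol->prepath == NULL                      -> ""            (share root)
 * vol->prepath == "a/b", not in DFS         -> "\a\b"
 * vol->prepath == "a/b", treeName
 *   "\\srv\share" with SMB_SHARE_IS_IN_DFS  -> "\\srv\share\a\b"
 *
 * pplen counts the prepath plus the separator slot in front of it,
 * which is why the allocation is dfsplen + pplen + 1.
 */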
@@ -398,7 +430,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
          * in network traffic in the other paths.
          */
         if (!(oflags & O_CREAT)) {
-                struct dentry *res = cifs_lookup(inode, direntry, 0);
+                struct dentry *res;
+
+                /*
+                 * Check for hashed negative dentry. We have already revalidated
+                 * the dentry and it is fine. No need to perform another lookup.
+                 */
+                if (!d_unhashed(direntry))
+                        return -ENOENT;
+
+                res = cifs_lookup(inode, direntry, 0);
                 if (IS_ERR(res))
                         return PTR_ERR(res);
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index edb25b4bbb95..0a6677ba212b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -505,16 +505,36 @@ out:
         return rc;
 }
 
+static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
+
 /*
  * Try to reacquire byte range locks that were released when session
- * to server was lost
+ * to server was lost.
  */
-static int cifs_relock_file(struct cifsFileInfo *cifsFile)
+static int
+cifs_relock_file(struct cifsFileInfo *cfile)
 {
+        struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+        struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+        struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
         int rc = 0;
 
-        /* BB list all locks open on this file and relock */
+        /* we are going to update can_cache_brlcks here - need a write access */
+        down_write(&cinode->lock_sem);
+        if (cinode->can_cache_brlcks) {
+                /* can cache locks - no need to push them */
+                up_write(&cinode->lock_sem);
+                return rc;
+        }
 
+        if (cap_unix(tcon->ses) &&
+            (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+            ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+                rc = cifs_push_posix_locks(cfile);
+        else
+                rc = tcon->ses->server->ops->push_mand_locks(cfile);
+
+        up_write(&cinode->lock_sem);
         return rc;
 }
 
@@ -739,10 +759,15 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
         }
 }
 
+#define CIFS_LOCK_OP    0
+#define CIFS_READ_OP    1
+#define CIFS_WRITE_OP   2
+
+/* @rw_check : 0 - no op, 1 - read, 2 - write */
 static bool
 cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
                             __u64 length, __u8 type, struct cifsFileInfo *cfile,
-                            struct cifsLockInfo **conf_lock, bool rw_check)
+                            struct cifsLockInfo **conf_lock, int rw_check)
 {
         struct cifsLockInfo *li;
         struct cifsFileInfo *cur_cfile = fdlocks->cfile;
@@ -752,9 +777,13 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
                 if (offset + length <= li->offset ||
                     offset >= li->offset + li->length)
                         continue;
-                if (rw_check && server->ops->compare_fids(cfile, cur_cfile) &&
-                    current->tgid == li->pid)
-                        continue;
+                if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid &&
+                    server->ops->compare_fids(cfile, cur_cfile)) {
+                        /* shared lock prevents write op through the same fid */
+                        if (!(li->type & server->vals->shared_lock_type) ||
+                            rw_check != CIFS_WRITE_OP)
+                                continue;
+                }
                 if ((type & server->vals->shared_lock_type) &&
                     ((server->ops->compare_fids(cfile, cur_cfile) &&
                      current->tgid == li->pid) || type == li->type))
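The widening of rw_check from bool to a three-way op code is the heart of this hunk; restated as an editorial summary (not from the patch):

/*
 * CIFS_LOCK_OP  - setting a lock: any overlapping lock conflicts,
 *                 including our own.
 * CIFS_READ_OP  - reading: locks held by this tgid on the same fid
 *                 never conflict.
 * CIFS_WRITE_OP - writing: same as reading, except a *shared* lock on
 *                 the same fid still conflicts, since a read lock must
 *                 not be written through.
 */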
@@ -769,7 +798,7 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
 bool
 cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
                         __u8 type, struct cifsLockInfo **conf_lock,
-                        bool rw_check)
+                        int rw_check)
 {
         bool rc = false;
         struct cifs_fid_locks *cur;
@@ -805,7 +834,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
         down_read(&cinode->lock_sem);
 
         exist = cifs_find_lock_conflict(cfile, offset, length, type,
-                                        &conf_lock, false);
+                                        &conf_lock, CIFS_LOCK_OP);
         if (exist) {
                 flock->fl_start = conf_lock->offset;
                 flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -852,7 +881,7 @@ try_again:
         down_write(&cinode->lock_sem);
 
         exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
-                                        lock->type, &conf_lock, false);
+                                        lock->type, &conf_lock, CIFS_LOCK_OP);
         if (!exist && cinode->can_cache_brlcks) {
                 list_add_tail(&lock->llist, &cfile->llist->locks);
                 up_write(&cinode->lock_sem);
@@ -948,7 +977,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
         int rc = 0, stored_rc;
         struct cifsLockInfo *li, *tmp;
         struct cifs_tcon *tcon;
-        struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
         unsigned int num, max_num, max_buf;
         LOCKING_ANDX_RANGE *buf, *cur;
         int types[] = {LOCKING_ANDX_LARGE_FILES,
@@ -958,21 +986,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
         xid = get_xid();
         tcon = tlink_tcon(cfile->tlink);
 
-        /* we are going to update can_cache_brlcks here - need a write access */
-        down_write(&cinode->lock_sem);
-        if (!cinode->can_cache_brlcks) {
-                up_write(&cinode->lock_sem);
-                free_xid(xid);
-                return rc;
-        }
-
         /*
          * Accessing maxBuf is racy with cifs_reconnect - need to store value
          * and check it for zero before using.
          */
         max_buf = tcon->ses->server->maxBuf;
         if (!max_buf) {
-                up_write(&cinode->lock_sem);
                 free_xid(xid);
                 return -EINVAL;
         }
@@ -981,7 +1000,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
                 sizeof(LOCKING_ANDX_RANGE);
         buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
         if (!buf) {
-                up_write(&cinode->lock_sem);
                 free_xid(xid);
                 return -ENOMEM;
         }
@@ -1018,9 +1036,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
                 }
         }
 
-        cinode->can_cache_brlcks = false;
-        up_write(&cinode->lock_sem);
-
         kfree(buf);
         free_xid(xid);
         return rc;
@@ -1043,7 +1058,6 @@ struct lock_to_push {
 static int
 cifs_push_posix_locks(struct cifsFileInfo *cfile)
 {
-        struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
         struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
         struct file_lock *flock, **before;
         unsigned int count = 0, i = 0;
@@ -1054,14 +1068,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 
         xid = get_xid();
 
-        /* we are going to update can_cache_brlcks here - need a write access */
-        down_write(&cinode->lock_sem);
-        if (!cinode->can_cache_brlcks) {
-                up_write(&cinode->lock_sem);
-                free_xid(xid);
-                return rc;
-        }
-
         lock_flocks();
         cifs_for_each_lock(cfile->dentry->d_inode, before) {
                 if ((*before)->fl_flags & FL_POSIX)
@@ -1127,9 +1133,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
         }
 
 out:
-        cinode->can_cache_brlcks = false;
-        up_write(&cinode->lock_sem);
-
         free_xid(xid);
         return rc;
 err_out:
@@ -1144,14 +1147,27 @@ static int
 cifs_push_locks(struct cifsFileInfo *cfile)
 {
         struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+        struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
         struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+        int rc = 0;
+
+        /* we are going to update can_cache_brlcks here - need a write access */
+        down_write(&cinode->lock_sem);
+        if (!cinode->can_cache_brlcks) {
+                up_write(&cinode->lock_sem);
+                return rc;
+        }
 
         if (cap_unix(tcon->ses) &&
             (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
             ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
-                return cifs_push_posix_locks(cfile);
+                rc = cifs_push_posix_locks(cfile);
+        else
+                rc = tcon->ses->server->ops->push_mand_locks(cfile);
 
-        return tcon->ses->server->ops->push_mand_locks(cfile);
+        cinode->can_cache_brlcks = false;
+        up_write(&cinode->lock_sem);
+        return rc;
 }
 
 static void
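Taken together with the hunks above, the serialization rules for cached byte-range locks now live in exactly two places. As a summary (editorial, not from the patch):

/*
 * can_cache_brlcks == true : locks exist only in the client's lists;
 *     cifs_push_locks() sends them (POSIX or mandatory, depending on
 *     capabilities) and clears the flag, all under lock_sem held for
 *     write.
 * can_cache_brlcks == false: the server already knows the locks; after
 *     a reconnect cifs_relock_file() re-sends them, again under
 *     lock_sem, so the push helpers themselves no longer touch the
 *     flag or the semaphore.
 */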
@@ -1436,16 +1452,18 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
                 return -ENOMEM;
 
         rc = cifs_lock_add_if(cfile, lock, wait_flag);
-        if (rc < 0)
+        if (rc < 0) {
                 kfree(lock);
-        if (rc <= 0)
+                return rc;
+        }
+        if (!rc)
                 goto out;
 
         rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
                                     type, 1, 0, wait_flag);
         if (rc) {
                 kfree(lock);
-                goto out;
+                return rc;
         }
 
         cifs_lock_add(cfile, lock);
@@ -1794,7 +1812,6 @@ static int cifs_writepages(struct address_space *mapping,
         struct TCP_Server_Info *server;
         struct page *page;
         int rc = 0;
-        loff_t isize = i_size_read(mapping->host);
 
         /*
          * If wsize is smaller than the page cache size, default to writing
@@ -1899,7 +1916,7 @@ retry:
                          */
                         set_page_writeback(page);
 
-                        if (page_offset(page) >= isize) {
+                        if (page_offset(page) >= i_size_read(mapping->host)) {
                                 done = true;
                                 unlock_page(page);
                                 end_page_writeback(page);
@@ -1932,7 +1949,8 @@ retry:
                 wdata->offset = page_offset(wdata->pages[0]);
                 wdata->pagesz = PAGE_CACHE_SIZE;
                 wdata->tailsz =
-                        min(isize - page_offset(wdata->pages[nr_pages - 1]),
+                        min(i_size_read(mapping->host) -
+                            page_offset(wdata->pages[nr_pages - 1]),
                             (loff_t)PAGE_CACHE_SIZE);
                 wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
                                 wdata->tailsz;
@@ -2085,7 +2103,15 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
         } else {
                 rc = copied;
                 pos += copied;
-                set_page_dirty(page);
+                /*
+                 * When we use strict cache mode and cifs_strict_writev was run
+                 * with level II oplock (indicated by leave_pages_clean field of
+                 * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev
+                 * sent the data to the server itself.
+                 */
+                if (!CIFS_I(inode)->leave_pages_clean ||
+                    !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO))
+                        set_page_dirty(page);
         }
 
         if (rc > 0) {
@@ -2436,8 +2462,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
 }
 
 static ssize_t
-cifs_writev(struct kiocb *iocb, const struct iovec *iov,
-            unsigned long nr_segs, loff_t pos)
+cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov,
+                      unsigned long nr_segs, loff_t pos, bool cache_ex)
 {
         struct file *file = iocb->ki_filp;
         struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
@@ -2457,10 +2483,14 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
         down_read(&cinode->lock_sem);
         if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
                                      server->vals->exclusive_lock_type, NULL,
-                                     true)) {
+                                     CIFS_WRITE_OP)) {
                 mutex_lock(&inode->i_mutex);
+                if (!cache_ex)
+                        cinode->leave_pages_clean = true;
                 rc = __generic_file_aio_write(iocb, iov, nr_segs,
                                               &iocb->ki_pos);
+                if (!cache_ex)
+                        cinode->leave_pages_clean = false;
                 mutex_unlock(&inode->i_mutex);
         }
 
@@ -2487,42 +2517,62 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
         struct cifsFileInfo *cfile = (struct cifsFileInfo *)
                                      iocb->ki_filp->private_data;
         struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
-
-#ifdef CONFIG_CIFS_SMB2
+        ssize_t written, written2;
         /*
-         * If we have an oplock for read and want to write a data to the file
-         * we need to store it in the page cache and then push it to the server
-         * to be sure the next read will get a valid data.
+         * We need to store clientCanCacheAll here to prevent race
+         * conditions - this value can be changed during an execution
+         * of generic_file_aio_write. For CIFS it can be changed from
+         * true to false only, but for SMB2 it can be changed both from
+         * true to false and vice versa. So, we can end up with a data
+         * stored in the cache, not marked dirty and not sent to the
+         * server if this value changes its state from false to true
+         * after cifs_write_end.
          */
-        if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) {
-                ssize_t written;
+        bool cache_ex = cinode->clientCanCacheAll;
+        bool cache_read = cinode->clientCanCacheRead;
         int rc;
-
-                written = generic_file_aio_write(iocb, iov, nr_segs, pos);
-                rc = filemap_fdatawrite(inode->i_mapping);
-                if (rc)
-                        return (ssize_t)rc;
+        loff_t saved_pos;
 
-                return written;
+        if (cache_ex) {
+                if (cap_unix(tcon->ses) &&
+                    ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) &&
+                    (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(
+                        tcon->fsUnixInfo.Capability)))
+                        return generic_file_aio_write(iocb, iov, nr_segs, pos);
+                return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex);
         }
-#endif
 
         /*
-         * For non-oplocked files in strict cache mode we need to write the data
-         * to the server exactly from the pos to pos+len-1 rather than flush all
-         * affected pages because it may cause a error with mandatory locks on
-         * these pages but not on the region from pos to ppos+len-1.
+         * For files without exclusive oplock in strict cache mode we need to
+         * write the data to the server exactly from the pos to pos+len-1 rather
+         * than flush all affected pages because it may cause a error with
+         * mandatory locks on these pages but not on the region from pos to
+         * ppos+len-1.
          */
+        written = cifs_user_writev(iocb, iov, nr_segs, pos);
+        if (!cache_read || written <= 0)
+                return written;
 
-        if (!cinode->clientCanCacheAll)
-                return cifs_user_writev(iocb, iov, nr_segs, pos);
-
+        saved_pos = iocb->ki_pos;
+        iocb->ki_pos = pos;
+        /* we have a read oplock - need to store a data in the page cache */
         if (cap_unix(tcon->ses) &&
-            (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
-            ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
-                return generic_file_aio_write(iocb, iov, nr_segs, pos);
-
-        return cifs_writev(iocb, iov, nr_segs, pos);
+            ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) &&
+            (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(
+                tcon->fsUnixInfo.Capability)))
+                written2 = generic_file_aio_write(iocb, iov, nr_segs, pos);
+        else
+                written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos,
+                                                 cache_ex);
+        /* errors occured during writing - invalidate the page cache */
+        if (written2 < 0) {
+                rc = cifs_invalidate_mapping(inode);
+                if (rc)
+                        written = (ssize_t)rc;
+                else
+                        iocb->ki_pos = saved_pos;
+        }
+        return written;
 }
 
 static struct cifs_readdata *
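A compressed restatement of the write paths above (editorial summary, not from the patch):

/*
 * cache_ex (exclusive oplock): write through the page cache and let
 *     writeback send the data later - generic_file_aio_write() when
 *     POSIX brlocks apply, cifs_pagecache_writev() otherwise.
 * no oplock: cifs_user_writev() goes straight to the server.
 * cache_read (level II oplock): after the wire write, replay the data
 *     into the page cache with leave_pages_clean set so the same bytes
 *     are not sent twice; if the replay fails, the mapping is
 *     invalidated rather than left stale.
 */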
@@ -2892,7 +2942,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
         down_read(&cinode->lock_sem);
         if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
                                      tcon->ses->server->vals->shared_lock_type,
-                                     NULL, true))
+                                     NULL, CIFS_READ_OP))
                 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
         up_read(&cinode->lock_sem);
         return rc;
@@ -3536,7 +3586,7 @@ void cifs_oplock_break(struct work_struct *work)
                 if (cinode->clientCanCacheRead == 0) {
                         rc = filemap_fdatawait(inode->i_mapping);
                         mapping_set_error(inode->i_mapping, rc);
-                        invalidate_remote_inode(inode);
+                        cifs_invalidate_mapping(inode);
                 }
                 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
         }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index afdff79651f1..ed6208ff85a7 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1791,11 +1791,12 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
         stat->ino = CIFS_I(inode)->uniqueid;
 
         /*
-         * If on a multiuser mount without unix extensions, and the admin hasn't
-         * overridden them, set the ownership to the fsuid/fsgid of the current
-         * process.
+         * If on a multiuser mount without unix extensions or cifsacl being
+         * enabled, and the admin hasn't overridden them, set the ownership
+         * to the fsuid/fsgid of the current process.
          */
         if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
+            !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
             !tcon->unix_ext) {
                 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
                         stat->uid = current_fsuid();
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d5ce9e26696c..a82bc51fdc82 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -204,7 +204,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
         return rc;
 }
 
-int
+void
 cifs_set_port(struct sockaddr *addr, const unsigned short int port)
 {
         switch (addr->sa_family) {
@@ -214,19 +214,7 @@ cifs_set_port(struct sockaddr *addr, const unsigned short int port)
         case AF_INET6:
                 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
                 break;
-        default:
-                return 0;
         }
-        return 1;
-}
-
-int
-cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
-                   const unsigned short int port)
-{
-        if (!cifs_convert_address(dst, src, len))
-                return 0;
-        return cifs_set_port(dst, port);
 }
 
 /*****************************************************************************
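With cifs_fill_sockaddr() gone, callers compose the two remaining helpers themselves. A minimal sketch of the surviving pattern (mirroring the connect.c hunk earlier in this patch):

        struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr;

        if (!cifs_convert_address(dstaddr, &vol->UNC[2],
                                  strlen(&vol->UNC[2])))
                goto cifs_parse_mount_err;      /* could not resolve */
        cifs_set_port(dstaddr, port);   /* now void: an unknown address
                                           family is silently left alone */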
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f9b5d3d6cf33..6002fdc920ae 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -66,18 +66,20 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
 #endif /* DEBUG2 */
 
 /*
+ * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
+ *
  * Find the dentry that matches "name". If there isn't one, create one. If it's
  * a negative dentry or the uniqueid changed, then drop it and recreate it.
  */
-static struct dentry *
-cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
-                    struct cifs_fattr *fattr)
+static void
+cifs_prime_dcache(struct dentry *parent, struct qstr *name,
+                  struct cifs_fattr *fattr)
 {
         struct dentry *dentry, *alias;
         struct inode *inode;
         struct super_block *sb = parent->d_inode->i_sb;
 
-        cFYI(1, "For %s", name->name);
+        cFYI(1, "%s: for %s", __func__, name->name);
 
         if (parent->d_op && parent->d_op->d_hash)
                 parent->d_op->d_hash(parent, parent->d_inode, name);
@@ -86,35 +88,33 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 
         dentry = d_lookup(parent, name);
         if (dentry) {
+                int err;
+
                 inode = dentry->d_inode;
                 /* update inode in place if i_ino didn't change */
                 if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
                         cifs_fattr_to_inode(inode, fattr);
-                        return dentry;
+                        goto out;
                 }
-                d_drop(dentry);
+                err = d_invalidate(dentry);
                 dput(dentry);
+                if (err)
+                        return;
         }
 
         dentry = d_alloc(parent, name);
-        if (dentry == NULL)
-                return NULL;
+        if (!dentry)
+                return;
 
         inode = cifs_iget(sb, fattr);
-        if (!inode) {
-                dput(dentry);
-                return NULL;
-        }
+        if (!inode)
+                goto out;
 
         alias = d_materialise_unique(dentry, inode);
-        if (alias != NULL) {
-                dput(dentry);
-                if (IS_ERR(alias))
-                        return NULL;
-                dentry = alias;
-        }
-
-        return dentry;
+        if (alias && !IS_ERR(alias))
+                dput(alias);
+out:
+        dput(dentry);
 }
 
 static void
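For readers skimming the rename: the helper now manages all dentry references internally. A sketch of the resulting control flow (editorial summary):

/*
 * d_lookup() hit, uniqueid unchanged -> refresh the inode in place
 * d_lookup() hit, stale or negative  -> d_invalidate() + dput(); give
 *                                       up if the dentry is still busy
 * miss                               -> d_alloc() + cifs_iget() +
 *                                       d_materialise_unique()
 *
 * Every path drops its references before returning, which is why
 * cifs_filldir() below no longer needs a dput() of its own.
 */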
@@ -134,6 +134,16 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
         if (fattr->cf_cifsattrs & ATTR_READONLY)
                 fattr->cf_mode &= ~S_IWUGO;
 
+        /*
+         * We of course don't get ACL info in FIND_FIRST/NEXT results, so
+         * mark it for revalidation so that "ls -l" will look right. It might
+         * be super-slow, but if we don't do this then the ownership of files
+         * may look wrong since the inodes may not have timed out by the time
+         * "ls" does a stat() call on them.
+         */
+        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+                fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+
         if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL &&
             fattr->cf_cifsattrs & ATTR_SYSTEM) {
                 if (fattr->cf_eof == 0) {
@@ -649,7 +659,6 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
         struct cifs_dirent de = { NULL, };
         struct cifs_fattr fattr;
-        struct dentry *dentry;
         struct qstr name;
         int rc = 0;
         ino_t ino;
@@ -720,13 +729,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
          */
         fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
 
-        ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
-        dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr);
+        cifs_prime_dcache(file->f_dentry, &name, &fattr);
 
+        ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
         rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
                      fattr.cf_dtype);
-
-        dput(dentry);
         return rc;
 }
 
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 56cc4be87807..a5d234c8d5d9 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -575,37 +575,6 @@ cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
         return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data);
 }
 
-static char *
-cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
-                        struct cifs_tcon *tcon)
-{
-        int pplen = vol->prepath ? strlen(vol->prepath) : 0;
-        int dfsplen;
-        char *full_path = NULL;
-
-        /* if no prefix path, simply set path to the root of share to "" */
-        if (pplen == 0) {
-                full_path = kzalloc(1, GFP_KERNEL);
-                return full_path;
-        }
-
-        if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
-                dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
-        else
-                dfsplen = 0;
-
-        full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
-        if (full_path == NULL)
-                return full_path;
-
-        if (dfsplen)
-                strncpy(full_path, tcon->treeName, dfsplen);
-        strncpy(full_path + dfsplen, vol->prepath, pplen);
-        convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
-        full_path[dfsplen + pplen] = 0; /* add trailing null */
-        return full_path;
-}
-
 static void
 cifs_clear_stats(struct cifs_tcon *tcon)
 {
611{ 580{
@@ -766,7 +735,6 @@ smb_set_file_info(struct inode *inode, const char *full_path,
766 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 735 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
767 struct tcon_link *tlink = NULL; 736 struct tcon_link *tlink = NULL;
768 struct cifs_tcon *tcon; 737 struct cifs_tcon *tcon;
769 FILE_BASIC_INFO info_buf;
770 738
771 /* if the file is already open for write, just use that fileid */ 739 /* if the file is already open for write, just use that fileid */
772 open_file = find_writable_file(cinode, true); 740 open_file = find_writable_file(cinode, true);
@@ -817,7 +785,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
817 netpid = current->tgid; 785 netpid = current->tgid;
818 786
819set_via_filehandle: 787set_via_filehandle:
820 rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid); 788 rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid);
821 if (!rc) 789 if (!rc)
822 cinode->cifsAttrs = le32_to_cpu(buf->Attributes); 790 cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
823 791
@@ -944,7 +912,6 @@ struct smb_version_operations smb1_operations = {
944 .set_path_size = CIFSSMBSetEOF, 912 .set_path_size = CIFSSMBSetEOF,
945 .set_file_size = CIFSSMBSetFileSize, 913 .set_file_size = CIFSSMBSetFileSize,
946 .set_file_info = smb_set_file_info, 914 .set_file_info = smb_set_file_info,
947 .build_path_to_root = cifs_build_path_to_root,
948 .echo = CIFSSMBEcho, 915 .echo = CIFSSMBEcho,
949 .mkdir = CIFSSMBMkDir, 916 .mkdir = CIFSSMBMkDir,
950 .mkdir_setinfo = cifs_mkdir_setinfo, 917 .mkdir_setinfo = cifs_mkdir_setinfo,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index a93eec30a50d..71e6aed4b382 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -260,13 +260,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
         struct cifs_fid_locks *fdlocks;
 
         xid = get_xid();
-        /* we are going to update can_cache_brlcks here - need a write access */
-        down_write(&cinode->lock_sem);
-        if (!cinode->can_cache_brlcks) {
-                up_write(&cinode->lock_sem);
-                free_xid(xid);
-                return rc;
-        }
 
         /*
          * Accessing maxBuf is racy with cifs_reconnect - need to store value
@@ -274,7 +267,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
          */
         max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf;
         if (!max_buf) {
-                up_write(&cinode->lock_sem);
                 free_xid(xid);
                 return -EINVAL;
         }
@@ -282,7 +274,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
         max_num = max_buf / sizeof(struct smb2_lock_element);
         buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
         if (!buf) {
-                up_write(&cinode->lock_sem);
                 free_xid(xid);
                 return -ENOMEM;
         }
@@ -293,10 +284,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
                         rc = stored_rc;
         }
 
-        cinode->can_cache_brlcks = false;
         kfree(buf);
-
-        up_write(&cinode->lock_sem);
         free_xid(xid);
         return rc;
 }
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 4d9dbe0b7385..d79de7bc4435 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -262,23 +262,6 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
         return rc;
 }
 
-static char *
-smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
-                        struct cifs_tcon *tcon)
-{
-        int pplen = vol->prepath ? strlen(vol->prepath) : 0;
-        char *full_path = NULL;
-
-        /* if no prefix path, simply set path to the root of share to "" */
-        if (pplen == 0) {
-                full_path = kzalloc(2, GFP_KERNEL);
-                return full_path;
-        }
-
-        cERROR(1, "prefixpath is not supported for SMB2 now");
-        return NULL;
-}
-
 static bool
 smb2_can_echo(struct TCP_Server_Info *server)
 {
@@ -613,7 +596,6 @@ struct smb_version_operations smb21_operations = {
         .set_path_size = smb2_set_path_size,
         .set_file_size = smb2_set_file_size,
         .set_file_info = smb2_set_file_info,
-        .build_path_to_root = smb2_build_path_to_root,
         .mkdir = smb2_mkdir,
         .mkdir_setinfo = smb2_mkdir_setinfo,
         .rmdir = smb2_rmdir,
@@ -641,6 +623,91 @@ struct smb_version_operations smb21_operations = {
         .get_lease_key = smb2_get_lease_key,
         .set_lease_key = smb2_set_lease_key,
         .new_lease_key = smb2_new_lease_key,
+        .calc_signature = smb2_calc_signature,
+};
+
+
+struct smb_version_operations smb30_operations = {
+        .compare_fids = smb2_compare_fids,
+        .setup_request = smb2_setup_request,
+        .setup_async_request = smb2_setup_async_request,
+        .check_receive = smb2_check_receive,
+        .add_credits = smb2_add_credits,
+        .set_credits = smb2_set_credits,
+        .get_credits_field = smb2_get_credits_field,
+        .get_credits = smb2_get_credits,
+        .get_next_mid = smb2_get_next_mid,
+        .read_data_offset = smb2_read_data_offset,
+        .read_data_length = smb2_read_data_length,
+        .map_error = map_smb2_to_linux_error,
+        .find_mid = smb2_find_mid,
+        .check_message = smb2_check_message,
+        .dump_detail = smb2_dump_detail,
+        .clear_stats = smb2_clear_stats,
+        .print_stats = smb2_print_stats,
+        .is_oplock_break = smb2_is_valid_oplock_break,
+        .need_neg = smb2_need_neg,
+        .negotiate = smb2_negotiate,
+        .negotiate_wsize = smb2_negotiate_wsize,
+        .negotiate_rsize = smb2_negotiate_rsize,
+        .sess_setup = SMB2_sess_setup,
+        .logoff = SMB2_logoff,
+        .tree_connect = SMB2_tcon,
+        .tree_disconnect = SMB2_tdis,
+        .is_path_accessible = smb2_is_path_accessible,
+        .can_echo = smb2_can_echo,
+        .echo = SMB2_echo,
+        .query_path_info = smb2_query_path_info,
+        .get_srv_inum = smb2_get_srv_inum,
+        .query_file_info = smb2_query_file_info,
+        .set_path_size = smb2_set_path_size,
+        .set_file_size = smb2_set_file_size,
+        .set_file_info = smb2_set_file_info,
+        .mkdir = smb2_mkdir,
+        .mkdir_setinfo = smb2_mkdir_setinfo,
+        .rmdir = smb2_rmdir,
+        .unlink = smb2_unlink,
+        .rename = smb2_rename_path,
+        .create_hardlink = smb2_create_hardlink,
+        .open = smb2_open_file,
+        .set_fid = smb2_set_fid,
+        .close = smb2_close_file,
+        .flush = smb2_flush_file,
+        .async_readv = smb2_async_readv,
+        .async_writev = smb2_async_writev,
+        .sync_read = smb2_sync_read,
+        .sync_write = smb2_sync_write,
+        .query_dir_first = smb2_query_dir_first,
+        .query_dir_next = smb2_query_dir_next,
+        .close_dir = smb2_close_dir,
+        .calc_smb_size = smb2_calc_size,
+        .is_status_pending = smb2_is_status_pending,
+        .oplock_response = smb2_oplock_response,
+        .queryfs = smb2_queryfs,
+        .mand_lock = smb2_mand_lock,
+        .mand_unlock_range = smb2_unlock_range,
+        .push_mand_locks = smb2_push_mandatory_locks,
+        .get_lease_key = smb2_get_lease_key,
+        .set_lease_key = smb2_set_lease_key,
+        .new_lease_key = smb2_new_lease_key,
+        .calc_signature = smb3_calc_signature,
+};
+
+struct smb_version_values smb20_values = {
+        .version_string = SMB20_VERSION_STRING,
+        .protocol_id = SMB20_PROT_ID,
+        .req_capabilities = 0, /* MBZ */
+        .large_lock_type = 0,
+        .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+        .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+        .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+        .header_size = sizeof(struct smb2_hdr),
+        .max_header_size = MAX_SMB2_HDR_SIZE,
+        .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+        .lock_cmd = SMB2_LOCK,
+        .cap_unix = 0,
+        .cap_nt_find = SMB2_NT_FIND,
+        .cap_large_files = SMB2_LARGE_FILES,
 };
 
 struct smb_version_values smb21_values = {
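The new smb30_operations table is almost a copy of smb21_operations; the load-bearing difference is the .calc_signature hook. Presumably (assumption: the corresponding values table and dialect wiring live in cifsglob.h and connect.c, outside this hunk) a vers=3.0 mount ends up with something like:

        vol->ops  = &smb30_operations;
        vol->vals = &smb30_values;      /* hypothetical name, not shown here */

after which match_server() earlier in this patch refuses to share a TCP session between mounts whose ops or vals tables differ.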
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index cf33622cdac8..41d9d0725f0f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -425,7 +425,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
         }
 
         cFYI(1, "sec_flags 0x%x", sec_flags);
-        if (sec_flags & CIFSSEC_MUST_SIGN) {
+        if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
                 cFYI(1, "Signing required");
                 if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
                                           SMB2_NEGOTIATE_SIGNING_ENABLED))) {
@@ -612,7 +612,8 @@ ssetup_ntlmssp_authenticate:
 
         /* BB add code to build os and lm fields */
 
-        rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, CIFS_LOG_ERROR);
+        rc = SendReceive2(xid, ses, iov, 2, &resp_buftype,
+                          CIFS_LOG_ERROR | CIFS_NEG_OP);
 
         kfree(security_blob);
         rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 7d25f8b14f93..2aa3535e38ce 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -47,6 +47,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
                                               struct smb_rqst *rqst);
 extern struct mid_q_entry *smb2_setup_async_request(
                         struct TCP_Server_Info *server, struct smb_rqst *rqst);
+extern int smb2_calc_signature(struct smb_rqst *rqst,
+                               struct TCP_Server_Info *server);
+extern int smb3_calc_signature(struct smb_rqst *rqst,
+                               struct TCP_Server_Info *server);
 extern void smb2_echo_request(struct work_struct *work);
 extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
 extern __u8 smb2_map_lease_to_oplock(__le32 lease_state);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 2a5fdf26f79f..8dd73e61d762 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -39,7 +39,7 @@
 #include "smb2status.h"
 #include "smb2glob.h"
 
-static int
+int
 smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
         int i, rc;
@@ -116,6 +116,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
         return rc;
 }
 
+int
+smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+{
+        cFYI(1, "smb3 signatures not supported yet");
+        return -EOPNOTSUPP;
+}
+
 /* must be called with server->srv_mutex held */
 static int
 smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
@@ -132,7 +139,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
                 return rc;
         }
 
-        rc = smb2_calc_signature(rqst, server);
+        rc = server->ops->calc_signature(rqst, server);
 
         return rc;
 }
@@ -168,7 +175,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
         memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE);
 
         mutex_lock(&server->srv_mutex);
-        rc = smb2_calc_signature(rqst, server);
+        rc = server->ops->calc_signature(rqst, server);
         mutex_unlock(&server->srv_mutex);
 
         if (rc)
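The net effect of the smb2transport.c hunks: signing is now one indirect call per request, so SMB3 can plug in its own algorithm later without touching the sign/verify paths. Sketch of the dispatch (restating the diff):

        rc = server->ops->calc_signature(rqst, server);
        /* smb21_operations wires this to smb2_calc_signature();
           smb30_operations to the smb3_calc_signature() stub above. */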
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 4c6285fff598..e2f57a007029 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -844,6 +844,9 @@ COMPATIBLE_IOCTL(TIOCGDEV)
 COMPATIBLE_IOCTL(TIOCCBRK)
 COMPATIBLE_IOCTL(TIOCGSID)
 COMPATIBLE_IOCTL(TIOCGICOUNT)
+COMPATIBLE_IOCTL(TIOCGPKT)
+COMPATIBLE_IOCTL(TIOCGPTLCK)
+COMPATIBLE_IOCTL(TIOCGEXCL)
 /* Little t */
 COMPATIBLE_IOCTL(TIOCGETD)
 COMPATIBLE_IOCTL(TIOCSETD)
diff --git a/fs/coredump.c b/fs/coredump.c
index ce47379bfa61..177493272a61 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -458,7 +458,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
         return err;
 }
 
-void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
+void do_coredump(siginfo_t *siginfo)
 {
         struct core_state core_state;
         struct core_name cn;
@@ -474,7 +474,7 @@ void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
         static atomic_t core_dump_count = ATOMIC_INIT(0);
         struct coredump_params cprm = {
                 .siginfo = siginfo,
-                .regs = regs,
+                .regs = signal_pt_regs(),
                 .limit = rlimit(RLIMIT_CORE),
                 /*
                  * We must use the same mm->flags while dumping core to avoid
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b607d92cdf24..153bb1e42e63 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -59,7 +59,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev
 	case S_IFDIR:
 		inode->i_op = &simple_dir_inode_operations;
 		inode->i_fop = &simple_dir_operations;
-		inode->i_private = NULL;
 
 		/* directory inodes start off with i_nlink == 2
 		 * (for "." entry) */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14afbabe6546..472e6befc54d 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -545,37 +545,38 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
 	mutex_unlock(&allocated_ptys_lock);
 }
 
-int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
+/**
+ * devpts_pty_new -- create a new inode in /dev/pts/
+ * @ptmx_inode: inode of the master
+ * @device: major+minor of the node to be created
+ * @index: used as a name of the node
+ * @priv: what's given back by devpts_get_priv
+ *
+ * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
+ */
+struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
+		void *priv)
 {
-	/* tty layer puts index from devpts_new_index() in here */
-	int number = tty->index;
-	struct tty_driver *driver = tty->driver;
-	dev_t device = MKDEV(driver->major, driver->minor_start+number);
 	struct dentry *dentry;
 	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-	struct inode *inode = new_inode(sb);
+	struct inode *inode;
 	struct dentry *root = sb->s_root;
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
-	int ret = 0;
 	char s[12];
 
-	/* We're supposed to be given the slave end of a pty */
-	BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY);
-	BUG_ON(driver->subtype != PTY_TYPE_SLAVE);
-
+	inode = new_inode(sb);
 	if (!inode)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
-	inode->i_ino = number + 3;
+	inode->i_ino = index + 3;
 	inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
 	inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	init_special_inode(inode, S_IFCHR|opts->mode, device);
-	inode->i_private = tty;
-	tty->driver_data = inode;
+	inode->i_private = priv;
 
-	sprintf(s, "%d", number);
+	sprintf(s, "%d", index);
 
 	mutex_lock(&root->d_inode->i_mutex);
 
@@ -585,18 +586,24 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 		fsnotify_create(root->d_inode, dentry);
 	} else {
 		iput(inode);
-		ret = -ENOMEM;
+		inode = ERR_PTR(-ENOMEM);
 	}
 
 	mutex_unlock(&root->d_inode->i_mutex);
 
-	return ret;
+	return inode;
 }
 
-struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
+/**
+ * devpts_get_priv -- get private data for a slave
+ * @pts_inode: inode of the slave
+ *
+ * Returns whatever was passed as priv in devpts_pty_new for a given inode.
+ */
+void *devpts_get_priv(struct inode *pts_inode)
 {
 	struct dentry *dentry;
-	struct tty_struct *tty;
+	void *priv = NULL;
 
 	BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
 
@@ -605,18 +612,22 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
 	if (!dentry)
 		return NULL;
 
-	tty = NULL;
 	if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
-		tty = (struct tty_struct *)pts_inode->i_private;
+		priv = pts_inode->i_private;
 
 	dput(dentry);
 
-	return tty;
+	return priv;
 }
 
-void devpts_pty_kill(struct tty_struct *tty)
+/**
+ * devpts_pty_kill -- remove inode from /dev/pts/
+ * @inode: inode of the slave to be removed
+ *
+ * This is an inverse operation of devpts_pty_new.
+ */
+void devpts_pty_kill(struct inode *inode)
 {
-	struct inode *inode = tty->driver_data;
 	struct super_block *sb = pts_sb_from_inode(inode);
 	struct dentry *root = sb->s_root;
 	struct dentry *dentry;
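
The reworked interface takes a dev_t, an index and an opaque priv pointer instead of a tty_struct, and hands the inode back to the caller, so all tty-specific knowledge moves out of devpts. A rough sketch of the call sequence this implies on the driver side (variable names illustrative, not part of this patch):

	/* pty driver side, sketch: the caller now computes the dev_t and
	 * keeps the inode it gets back, instead of devpts reaching into
	 * tty_struct itself. */
	struct inode *slave;

	slave = devpts_pty_new(ptmx_inode,
			       MKDEV(driver->major, driver->minor_start + index),
			       index, tty);
	if (IS_ERR(slave))
		return PTR_ERR(slave);
	tty->driver_data = slave;	/* this link is now owned by the caller */

	/* lookup on the slave side: */
	struct tty_struct *t = devpts_get_priv(pts_inode);

	/* teardown, mirroring devpts_pty_new: */
	devpts_pty_kill(tty->driver_data);
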
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f86c720dba0e..cf5b44b10c67 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -540,6 +540,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 	sector_t fs_endblk;	/* Into file, in filesystem-sized blocks */
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	int create;
+	unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
 
 	/*
 	 * If there was a memory error and we've overwritten all the
@@ -554,7 +555,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 	fs_count = fs_endblk - fs_startblk + 1;
 
 	map_bh->b_state = 0;
-	map_bh->b_size = fs_count << dio->inode->i_blkbits;
+	map_bh->b_size = fs_count << i_blkbits;
 
 	/*
 	 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
@@ -1053,7 +1054,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int seg;
 	size_t size;
 	unsigned long addr;
-	unsigned blkbits = inode->i_blkbits;
+	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
+	unsigned blkbits = i_blkbits;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
@@ -1149,7 +1151,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->inode = inode;
 	dio->rw = rw;
 	sdio.blkbits = blkbits;
-	sdio.blkfactor = inode->i_blkbits - blkbits;
+	sdio.blkfactor = i_blkbits - blkbits;
 	sdio.block_in_file = offset >> blkbits;
 
 	sdio.get_block = get_block;
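
Both hunks implement one rule: sample inode->i_blkbits exactly once, apparently because a block device's block size can change while direct I/O is being set up, and derive every later value from that snapshot. Compressed to its essentials (a sketch, not the patch itself):

	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);  /* one snapshot */
	unsigned blkbits = i_blkbits;     /* may be lowered to the bdev's blkbits */

	sdio.blkbits = blkbits;
	sdio.blkfactor = i_blkbits - blkbits;  /* both terms from the snapshot */
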
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 1897eb1b4b6a..e4242c3f8486 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,6 +1,6 @@
 menuconfig DLM
 	tristate "Distributed Lock Manager (DLM)"
-	depends on EXPERIMENTAL && INET
+	depends on INET
 	depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
 	select IP_SCTP
 	help
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 871c1abf6029..77c0f70f8fe8 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -337,6 +337,7 @@ enum rsb_flags {
 	RSB_NEW_MASTER2,
 	RSB_RECOVER_CONVERT,
 	RSB_RECOVER_GRANT,
+	RSB_RECOVER_LVB_INVAL,
 };
 
 static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index b56950758188..a579f30f237d 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5393,6 +5393,13 @@ static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r,
 		if ((lkb->lkb_nodeid == nodeid_gone) ||
 		    dlm_is_removed(ls, lkb->lkb_nodeid)) {
 
+			/* tell recover_lvb to invalidate the lvb
+			   because a node holding EX/PW failed */
+			if ((lkb->lkb_exflags & DLM_LKF_VALBLK) &&
+			    (lkb->lkb_grmode >= DLM_LOCK_PW)) {
+				rsb_set_flag(r, RSB_RECOVER_LVB_INVAL);
+			}
+
 			del_lkb(r, lkb);
 
 			/* this put should free the lkb */
@@ -6025,15 +6032,18 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
 	return error;
 }
 
-/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
-   Regardless of what rsb queue the lock is on, it's removed and freed. */
+/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't
+   granted.  Regardless of what rsb queue the lock is on, it's removed and
+   freed.  The IVVALBLK flag causes the lvb on the resource to be invalidated
+   if our lock is PW/EX (it's ignored if our granted mode is smaller.) */
 
 static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
 {
 	struct dlm_args args;
 	int error;
 
-	set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args);
+	set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK,
+			lkb->lkb_ua, &args);
 
 	error = unlock_lock(ls, lkb, &args);
 	if (error == -DLM_EUNLOCK)
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 331ea4f94efd..dd87a31bcc21 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1385,7 +1385,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
 	struct connection *con;
 	struct writequeue_entry *e;
 	int offset = 0;
-	int users = 0;
 
 	con = nodeid2con(nodeid, allocation);
 	if (!con)
@@ -1399,7 +1398,7 @@
 	} else {
 		offset = e->end;
 		e->end += len;
-		users = e->users++;
+		e->users++;
 	}
 	spin_unlock(&con->writequeue_lock);
 
@@ -1414,7 +1413,7 @@
 	spin_lock(&con->writequeue_lock);
 	offset = e->end;
 	e->end += len;
-	users = e->users++;
+	e->users++;
 	list_add_tail(&e->list, &con->writequeue);
 	spin_unlock(&con->writequeue_lock);
 	goto got_one;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 4a7a76e42fc3..aedea28a86a1 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -717,8 +717,14 @@ void dlm_recovered_lock(struct dlm_rsb *r)
  * the VALNOTVALID flag if necessary, and determining the correct lvb contents
  * based on the lvb's of the locks held on the rsb.
  *
- * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb.  If it
- * was already set prior to recovery, it's not cleared, regardless of locks.
+ * RSB_VALNOTVALID is set in two cases:
+ *
+ * 1. we are master, but not new, and we purged an EX/PW lock held by a
+ *    failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL)
+ *
+ * 2. we are a new master, and there are only NL/CR locks left.
+ *    (We could probably improve this by only invalidating in this way when
+ *    the previous master left uncleanly.  VMS docs mention that.)
 *
  * The LVB contents are only considered for changing when this is a new master
  * of the rsb (NEW_MASTER2).  Then, the rsb's lvb is taken from any lkb with
@@ -734,6 +740,19 @@ static void recover_lvb(struct dlm_rsb *r)
 	int big_lock_exists = 0;
 	int lvblen = r->res_ls->ls_lvblen;
 
+	if (!rsb_flag(r, RSB_NEW_MASTER2) &&
+	    rsb_flag(r, RSB_RECOVER_LVB_INVAL)) {
+		/* case 1 above */
+		rsb_set_flag(r, RSB_VALNOTVALID);
+		return;
+	}
+
+	if (!rsb_flag(r, RSB_NEW_MASTER2))
+		return;
+
+	/* we are the new master, so figure out if VALNOTVALID should
+	   be set, and set the rsb lvb from the best lkb available. */
+
 	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
 		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
 			continue;
@@ -772,13 +791,10 @@ static void recover_lvb(struct dlm_rsb *r)
 	if (!lock_lvb_exists)
 		goto out;
 
+	/* lvb is invalidated if only NL/CR locks remain */
 	if (!big_lock_exists)
 		rsb_set_flag(r, RSB_VALNOTVALID);
 
-	/* don't mess with the lvb unless we're the new master */
-	if (!rsb_flag(r, RSB_NEW_MASTER2))
-		goto out;
-
 	if (!r->res_lvbptr) {
 		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
 		if (!r->res_lvbptr)
@@ -852,12 +868,19 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
 		if (is_master(r)) {
 			if (rsb_flag(r, RSB_RECOVER_CONVERT))
 				recover_conversion(r);
+
+			/* recover lvb before granting locks so the updated
+			   lvb/VALNOTVALID is presented in the completion */
+			recover_lvb(r);
+
 			if (rsb_flag(r, RSB_NEW_MASTER2))
 				recover_grant(r);
-			recover_lvb(r);
 			count++;
+		} else {
+			rsb_clear_flag(r, RSB_VALNOTVALID);
 		}
 		rsb_clear_flag(r, RSB_RECOVER_CONVERT);
+		rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL);
 		rsb_clear_flag(r, RSB_NEW_MASTER2);
 		unlock_rsb(r);
 	}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index da72250ddc1c..cd96649bfe62 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
 static inline int ep_op_has_event(int op)
 {
-	return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
+	return op != EPOLL_CTL_DEL;
 }
 
 /* Initialize the poll safe wake up structure */
@@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	return 0;
 }
 
-/*
- * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
- * had no event flags set, indicating that another thread may be currently
- * handling that item's events (in the case that EPOLLONESHOT was being
- * used). Otherwise a zero result indicates that the item has been disabled
- * from receiving events. A disabled item may be re-enabled via
- * EPOLL_CTL_MOD. Must be called with "mtx" held.
- */
-static int ep_disable(struct eventpoll *ep, struct epitem *epi)
-{
-	int result = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ep->lock, flags);
-	if (epi->event.events & ~EP_PRIVATE_BITS) {
-		if (ep_is_linked(&epi->rdllink))
-			list_del_init(&epi->rdllink);
-		/* Ensure ep_poll_callback will not add epi back onto ready
-		   list: */
-		epi->event.events &= EP_PRIVATE_BITS;
-	}
-	else
-		result = -EBUSY;
-	spin_unlock_irqrestore(&ep->lock, flags);
-
-	return result;
-}
-
 static void ep_free(struct eventpoll *ep)
 {
 	struct rb_node *rbp;
@@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 	rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
+
+
 #define PATH_ARR_SIZE 5
 /*
  * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		} else
 			error = -ENOENT;
 		break;
-	case EPOLL_CTL_DISABLE:
-		if (epi)
-			error = ep_disable(ep, epi);
-		else
-			error = -ENOENT;
-		break;
 	}
 	mutex_unlock(&ep->mtx);
 
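
This backs out the EPOLL_CTL_DISABLE experiment entirely, so ep_op_has_event() can treat every op except EPOLL_CTL_DEL as carrying an event. Userspace that wanted to quiesce a oneshot descriptor is back to the standard re-arm idiom; a minimal sketch (epfd/fd setup and error handling elided):

	struct epoll_event ev = {
		.events = EPOLLIN | EPOLLONESHOT,
		.data.fd = fd,
	};

	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
	/* ... consume an event; EPOLLONESHOT has now disarmed the fd ... */
	epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);  /* re-arm, no CTL_DISABLE */
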
diff --git a/fs/exec.c b/fs/exec.c
index 0039055b1fc6..721a29929511 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1349,7 +1349,7 @@ EXPORT_SYMBOL(remove_arg_zero);
 /*
  * cycle the list of binary formats handler, until one recognizes the image
  */
-int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
+int search_binary_handler(struct linux_binprm *bprm)
 {
 	unsigned int depth = bprm->recursion_depth;
 	int try,retval;
@@ -1374,13 +1374,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	for (try=0; try<2; try++) {
 		read_lock(&binfmt_lock);
 		list_for_each_entry(fmt, &formats, lh) {
-			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
+			int (*fn)(struct linux_binprm *) = fmt->load_binary;
 			if (!fn)
 				continue;
 			if (!try_module_get(fmt->module))
 				continue;
 			read_unlock(&binfmt_lock);
-			retval = fn(bprm, regs);
+			retval = fn(bprm);
 			/*
 			 * Restore the depth counter to its starting value
 			 * in this call, so we don't have to rely on every
@@ -1439,8 +1439,7 @@ EXPORT_SYMBOL(search_binary_handler);
  */
 static int do_execve_common(const char *filename,
 				struct user_arg_ptr argv,
-				struct user_arg_ptr envp,
-				struct pt_regs *regs)
+				struct user_arg_ptr envp)
 {
 	struct linux_binprm *bprm;
 	struct file *file;
@@ -1524,7 +1523,7 @@ static int do_execve_common(const char *filename,
 	if (retval < 0)
 		goto out;
 
-	retval = search_binary_handler(bprm,regs);
+	retval = search_binary_handler(bprm);
 	if (retval < 0)
 		goto out;
 
@@ -1566,19 +1565,17 @@ out_ret:
 
 int do_execve(const char *filename,
 	const char __user *const __user *__argv,
-	const char __user *const __user *__envp,
-	struct pt_regs *regs)
+	const char __user *const __user *__envp)
 {
 	struct user_arg_ptr argv = { .ptr.native = __argv };
 	struct user_arg_ptr envp = { .ptr.native = __envp };
-	return do_execve_common(filename, argv, envp, regs);
+	return do_execve_common(filename, argv, envp);
 }
 
 #ifdef CONFIG_COMPAT
-int compat_do_execve(const char *filename,
+static int compat_do_execve(const char *filename,
 	const compat_uptr_t __user *__argv,
-	const compat_uptr_t __user *__envp,
-	struct pt_regs *regs)
+	const compat_uptr_t __user *__envp)
 {
 	struct user_arg_ptr argv = {
 		.is_compat = true,
@@ -1588,7 +1585,7 @@ int compat_do_execve(const char *filename,
 		.is_compat = true,
 		.ptr.compat = __envp,
 	};
-	return do_execve_common(filename, argv, envp, regs);
+	return do_execve_common(filename, argv, envp);
 }
 #endif
 
@@ -1669,7 +1666,7 @@ SYSCALL_DEFINE3(execve,
 	struct filename *path = getname(filename);
 	int error = PTR_ERR(path);
 	if (!IS_ERR(path)) {
-		error = do_execve(path->name, argv, envp, current_pt_regs());
+		error = do_execve(path->name, argv, envp);
 		putname(path);
 	}
 	return error;
@@ -1682,8 +1679,7 @@ asmlinkage long compat_sys_execve(const char __user * filename,
 	struct filename *path = getname(filename);
 	int error = PTR_ERR(path);
 	if (!IS_ERR(path)) {
-		error = compat_do_execve(path->name, argv, envp,
-					 current_pt_regs());
+		error = compat_do_execve(path->name, argv, envp);
 		putname(path);
 	}
 	return error;
@@ -1696,12 +1692,9 @@ int kernel_execve(const char *filename,
 			 const char *const argv[],
 			 const char *const envp[])
 {
-	struct pt_regs *p = current_pt_regs();
-	int ret;
-
-	ret = do_execve(filename,
+	int ret = do_execve(filename,
 		(const char __user *const __user *)argv,
-		(const char __user *const __user *)envp, p);
+		(const char __user *const __user *)envp);
 	if (ret < 0)
 		return ret;
 
@@ -1709,6 +1702,6 @@ int kernel_execve(const char *filename,
 	 * We were successful. We won't be returning to our caller, but
 	 * instead to user space by manipulating the kernel stack.
 	 */
-	ret_from_kernel_execve(p);
+	ret_from_kernel_execve(current_pt_regs());
 }
 #endif
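
After this series nothing in the exec path threads pt_regs through anymore; whoever needs the registers derives them with current_pt_regs() at the point of use. A binfmt handler under the new contract would look roughly like this (hypothetical handler, for illustration only):

	static int load_foo_binary(struct linux_binprm *bprm)
	{
		/* registers are no longer a parameter; fetch when needed */
		struct pt_regs *regs = current_pt_regs();

		/* ... parse bprm->buf, set up the mm, start_thread(regs, ...) */
		return -ENOEXEC;	/* not our format, try the next handler */
	}

	static struct linux_binfmt foo_format = {
		.module      = THIS_MODULE,
		.load_binary = load_foo_binary,	/* new one-argument signature */
	};
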
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 7320a66e958f..22548f56197b 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -2101,8 +2101,9 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	end = start + (range->len >> sb->s_blocksize_bits) - 1;
 	minlen = range->minlen >> sb->s_blocksize_bits;
 
-	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) ||
-	    unlikely(start >= max_blks))
+	if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
+	    start >= max_blks ||
+	    range->len < sb->s_blocksize)
 		return -EINVAL;
 	if (end >= max_blks)
 		end = max_blks - 1;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c20de1d59d0..df163da388c9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2455,7 +2455,7 @@ TAS_BUFFER_FNS(Uninit, uninit)
 BUFFER_FNS(Da_Mapped, da_mapped)
 
 /*
- * Add new method to test wether block and inode bitmaps are properly
+ * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
  * to mark the bitmap uptodate. We need to also zero-out the bitmap
  */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4facdd29a350..3a100e7a62a8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -725,6 +725,10 @@ repeat_in_this_group:
 				   "inode=%lu", ino + 1);
 			continue;
 		}
+		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+		if (err)
+			goto fail;
 		ext4_lock_group(sb, group);
 		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
 		ext4_unlock_group(sb, group);
@@ -738,6 +742,11 @@ repeat_in_this_group:
 		goto out;
 
 got:
+	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+	if (err)
+		goto fail;
+
 	/* We may have to initialize the block bitmap if it isn't already */
 	if (ext4_has_group_desc_csum(sb) &&
 	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -771,11 +780,6 @@ got:
 		goto fail;
 	}
 
-	BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
-	if (err)
-		goto fail;
-
 	BUFFER_TRACE(group_desc_bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, group_desc_bh);
 	if (err)
@@ -823,11 +827,6 @@ got:
 	}
 	ext4_unlock_group(sb, group);
 
-	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
-	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
-	if (err)
-		goto fail;
-
 	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
 	if (err)
diff --git a/fs/fhandle.c b/fs/fhandle.c
index f775bfdd6e4a..cccdc874bb55 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -22,7 +22,7 @@ static long do_sys_name_to_handle(struct path *path,
 	struct file_handle *handle = NULL;
 
 	/*
-	 * We need t make sure wether the file system
+	 * We need to make sure whether the file system
 	 * support decoding of the file handle
 	 */
 	if (!path->dentry->d_sb->s_export_op ||
@@ -40,7 +40,7 @@
 	if (!handle)
 		return -ENOMEM;
 
-	/* convert handle size to  multiple of sizeof(u32) */
+	/* convert handle size to multiple of sizeof(u32) */
 	handle_dwords = f_handle.handle_bytes >> 2;
 
 	/* we ask for a non connected handle */
diff --git a/fs/file.c b/fs/file.c
index d3b5fa80b71b..15cb8618e95d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -519,12 +519,6 @@ struct files_struct init_files = {
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_task.file_lock),
 };
 
-void daemonize_descriptors(void)
-{
-	atomic_inc(&init_files.count);
-	reset_files_struct(&init_files);
-}
-
 /*
  * allocate a file descriptor, mark it busy.
  */
@@ -685,7 +679,6 @@ void do_close_on_exec(struct files_struct *files)
 	struct fdtable *fdt;
 
 	/* exec unshares first */
-	BUG_ON(atomic_read(&files->count) != 1);
 	spin_lock(&files->file_lock);
 	for (i = 0; ; i++) {
 		unsigned long set;
@@ -900,7 +893,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
 		return __close_fd(files, fd);
 
 	if (fd >= rlimit(RLIMIT_NOFILE))
-		return -EMFILE;
+		return -EBADF;
 
 	spin_lock(&files->file_lock);
 	err = expand_files(files, fd);
@@ -926,7 +919,7 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
 		return -EINVAL;
 
 	if (newfd >= rlimit(RLIMIT_NOFILE))
-		return -EMFILE;
+		return -EBADF;
 
 	spin_lock(&files->file_lock);
 	err = expand_files(files, newfd);
@@ -995,16 +988,18 @@ int iterate_fd(struct files_struct *files, unsigned n,
 		const void *p)
 {
 	struct fdtable *fdt;
-	struct file *file;
 	int res = 0;
 	if (!files)
 		return 0;
 	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	while (!res && n < fdt->max_fds) {
-		file = rcu_dereference_check_fdtable(files, fdt->fd[n++]);
-		if (file)
-			res = f(p, file, n);
+	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
+		struct file *file;
+		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
+		if (!file)
+			continue;
+		res = f(p, file, n);
+		if (res)
+			break;
 	}
 	spin_unlock(&files->file_lock);
 	return res;
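
Beyond dropping daemonize_descriptors() and softening the rlimit check to -EBADF, the iterate_fd() rewrite changes the callback contract: the callback now receives the descriptor number itself (the old loop passed the already-incremented index) and the walk stops at the first non-zero return. A sketch of a conforming callback (names illustrative):

	static int match_file(const void *p, struct file *file, unsigned fd)
	{
		/* non-zero return ends the walk; encode "found fd" as fd + 1
		   so that fd 0 is distinguishable from "not found" */
		return (file == p) ? fd + 1 : 0;
	}

	/* usage: 0 if filp is not installed in files, otherwise fd + 1 */
	int ret = iterate_fd(files, 0, match_file, filp);
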
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 51ea267d444c..310972b72a66 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -228,6 +228,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 static void inode_sync_complete(struct inode *inode)
 {
 	inode->i_state &= ~I_SYNC;
+	/* If inode is clean and unused, put it into LRU now... */
+	inode_add_lru(inode);
 	/* Waiters must see I_SYNC cleared before being woken up */
 	smp_mb();
 	wake_up_bit(&inode->i_state, __I_SYNC);
@@ -1032,7 +1034,7 @@ int bdi_writeback_thread(void *data)
 	while (!kthread_freezable_should_stop(NULL)) {
 		/*
 		 * Remove own delayed wake-up timer, since we are already awake
-		 * and we'll take care of the preriodic write-back.
+		 * and we'll take care of the periodic write-back.
 		 */
 		del_timer(&wb->wakeup_timer);
 
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 5df4775fea03..fe6ca583bbc0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -164,27 +164,3 @@ struct fs_struct init_fs = {
 	.seq		= SEQCNT_ZERO,
 	.umask		= 0022,
 };
-
-void daemonize_fs_struct(void)
-{
-	struct fs_struct *fs = current->fs;
-
-	if (fs) {
-		int kill;
-
-		task_lock(current);
-
-		spin_lock(&init_fs.lock);
-		init_fs.users++;
-		spin_unlock(&init_fs.lock);
-
-		spin_lock(&fs->lock);
-		current->fs = &init_fs;
-		kill = !--fs->users;
-		spin_unlock(&fs->lock);
-
-		task_unlock(current);
-		if (kill)
-			free_fs_struct(fs);
-	}
-}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 0def0504afc1..e056b4ce4877 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -516,15 +516,13 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 		struct gfs2_holder i_gh;
 		int error;
 
-		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-		error = gfs2_glock_nq(&i_gh);
-		if (error == 0) {
-			file_accessed(file);
-			gfs2_glock_dq(&i_gh);
-		}
-		gfs2_holder_uninit(&i_gh);
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+					   &i_gh);
 		if (error)
 			return error;
+		/* grab lock to update inode */
+		gfs2_glock_dq_uninit(&i_gh);
+		file_accessed(file);
 	}
 	vma->vm_ops = &gfs2_vm_ops;
 
@@ -677,10 +675,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	size_t writesize = iov_length(iov, nr_segs);
 	struct dentry *dentry = file->f_dentry;
 	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	struct gfs2_sbd *sdp;
 	int ret;
 
-	sdp = GFS2_SB(file->f_mapping->host);
 	ret = gfs2_rs_alloc(ip);
 	if (ret)
 		return ret;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e6c2fd53cab2..0f22d09f358d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -768,7 +768,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		mapping->host = s->s_bdev->bd_inode;
 		mapping->flags = 0;
 		mapping_set_gfp_mask(mapping, GFP_NOFS);
-		mapping->assoc_mapping = NULL;
+		mapping->private_data = NULL;
 		mapping->backing_dev_info = s->s_bdi;
 		mapping->writeback_index = 0;
 	}
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 8ff95a2d54ee..9ceccb1595a3 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -393,12 +393,10 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	struct gfs2_meta_header *mh;
 	struct gfs2_trans *tr;
 
-	lock_buffer(bd->bd_bh);
-	gfs2_log_lock(sdp);
 	tr = current->journal_info;
 	tr->tr_touched = 1;
 	if (!list_empty(&bd->bd_list))
-		goto out;
+		return;
 	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
 	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
 	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
@@ -414,9 +412,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	sdp->sd_log_num_buf++;
 	list_add(&bd->bd_list, &sdp->sd_log_le_buf);
 	tr->tr_num_buf_new++;
-out:
-	gfs2_log_unlock(sdp);
-	unlock_buffer(bd->bd_bh);
 }
 
 static void gfs2_check_magic(struct buffer_head *bh)
@@ -621,7 +616,6 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 
 static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 {
-	struct gfs2_log_descriptor *ld;
 	struct gfs2_meta_header *mh;
 	unsigned int offset;
 	struct list_head *head = &sdp->sd_log_le_revoke;
@@ -634,7 +628,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 
 	length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64));
 	page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke);
-	ld = page_address(page);
 	offset = sizeof(struct gfs2_log_descriptor);
 
 	list_for_each_entry(bd, head, bd_list) {
@@ -777,12 +770,10 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	struct address_space *mapping = bd->bd_bh->b_page->mapping;
 	struct gfs2_inode *ip = GFS2_I(mapping->host);
 
-	lock_buffer(bd->bd_bh);
-	gfs2_log_lock(sdp);
 	if (tr)
 		tr->tr_touched = 1;
 	if (!list_empty(&bd->bd_list))
-		goto out;
+		return;
 	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
 	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
 	if (gfs2_is_jdata(ip)) {
@@ -793,9 +784,6 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	} else {
 		list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
 	}
-out:
-	gfs2_log_unlock(sdp);
-	unlock_buffer(bd->bd_bh);
 }
 
 /**
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 40c4b0d42fa8..c5af8e18f27a 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -497,8 +497,11 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
 	struct gfs2_quota_data **qd;
 	int error;
 
-	if (ip->i_res == NULL)
-		gfs2_rs_alloc(ip);
+	if (ip->i_res == NULL) {
+		error = gfs2_rs_alloc(ip);
+		if (error)
+			return error;
+	}
 
 	qd = ip->i_res->rs_qa_qd;
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3cc402ce6fea..38fe18f2f055 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -553,7 +553,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
  */
 int gfs2_rs_alloc(struct gfs2_inode *ip)
 {
-	int error = 0;
 	struct gfs2_blkreserv *res;
 
 	if (ip->i_res)
@@ -561,7 +560,7 @@
 
 	res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
 	if (!res)
-		error = -ENOMEM;
+		return -ENOMEM;
 
 	RB_CLEAR_NODE(&res->rs_node);
 
@@ -571,7 +570,7 @@
 	else
 		ip->i_res = res;
 	up_write(&ip->i_rw_mutex);
-	return error;
+	return 0;
 }
 
 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
@@ -1263,7 +1262,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 	int ret = 0;
 	u64 amt;
 	u64 trimmed = 0;
+	u64 start, end, minlen;
 	unsigned int x;
+	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -1271,19 +1272,25 @@
 	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
 
-	if (argp == NULL) {
-		r.start = 0;
-		r.len = ULLONG_MAX;
-		r.minlen = 0;
-	} else if (copy_from_user(&r, argp, sizeof(r)))
+	if (copy_from_user(&r, argp, sizeof(r)))
 		return -EFAULT;
 
 	ret = gfs2_rindex_update(sdp);
 	if (ret)
 		return ret;
 
-	rgd = gfs2_blk2rgrpd(sdp, r.start, 0);
-	rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0);
+	start = r.start >> bs_shift;
+	end = start + (r.len >> bs_shift);
+	minlen = max_t(u64, r.minlen,
+		       q->limits.discard_granularity) >> bs_shift;
+
+	rgd = gfs2_blk2rgrpd(sdp, start, 0);
+	rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
+
+	if (end <= start ||
+	    minlen > sdp->sd_max_rg_data ||
+	    start > rgd_end->rd_data0 + rgd_end->rd_data)
+		return -EINVAL;
 
 	while (1) {
 
@@ -1295,7 +1302,9 @@
 		/* Trim each bitmap in the rgrp */
 		for (x = 0; x < rgd->rd_length; x++) {
 			struct gfs2_bitmap *bi = rgd->rd_bits + x;
-			ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt);
+			ret = gfs2_rgrp_send_discards(sdp,
+					rgd->rd_data0, NULL, bi, minlen,
+					&amt);
 			if (ret) {
 				gfs2_glock_dq_uninit(&gh);
 				goto out;
@@ -1324,7 +1333,7 @@
 
 out:
 	r.len = trimmed << 9;
-	if (argp && copy_to_user(argp, &r, sizeof(r)))
+	if (copy_to_user(argp, &r, sizeof(r)))
 		return -EFAULT;
 
 	return ret;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index bc737261f234..d6488674d916 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -810,7 +810,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
 			return;
 		}
 		need_unlock = 1;
-	}
+	} else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
+		return;
 
 	if (current->journal_info == NULL) {
 		ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index adbd27875ef9..413627072f36 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -155,14 +155,22 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct gfs2_bufdata *bd;
 
+	lock_buffer(bh);
+	gfs2_log_lock(sdp);
 	bd = bh->b_private;
 	if (bd)
 		gfs2_assert(sdp, bd->bd_gl == gl);
 	else {
+		gfs2_log_unlock(sdp);
+		unlock_buffer(bh);
 		gfs2_attach_bufdata(gl, bh, meta);
 		bd = bh->b_private;
+		lock_buffer(bh);
+		gfs2_log_lock(sdp);
 	}
 	lops_add(sdp, bd);
+	gfs2_log_unlock(sdp);
+	unlock_buffer(bh);
 }
 
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c5bc355d8243..78bde32ea951 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1,7 +1,7 @@
 /*
  * hugetlbpage-backed filesystem.  Based on ramfs.
  *
- * William Irwin, 2002
+ * Nadia Yvette Chambers, 2002
  *
  * Copyright (C) 2002 Linus Torvalds.
  */
@@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
-	unsigned long start_addr;
 	struct hstate *h = hstate_file(file);
+	struct vm_unmapped_area_info info;
 
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
@@ -173,39 +173,13 @@
 		return addr;
 	}
 
-	if (len > mm->cached_hole_size)
-		start_addr = mm->free_area_cache;
-	else {
-		start_addr = TASK_UNMAPPED_BASE;
-		mm->cached_hole_size = 0;
-	}
-
-full_search:
-	addr = ALIGN(start_addr, huge_page_size(h));
-
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-		/* At this point:  (!vma || addr < vma->vm_end). */
-		if (TASK_SIZE - len < addr) {
-			/*
-			 * Start a new search - just in case we missed
-			 * some holes.
-			 */
-			if (start_addr != TASK_UNMAPPED_BASE) {
-				start_addr = TASK_UNMAPPED_BASE;
-				mm->cached_hole_size = 0;
-				goto full_search;
-			}
-			return -ENOMEM;
-		}
-
-		if (!vma || addr + len <= vma->vm_start) {
-			mm->free_area_cache = addr + len;
-			return addr;
-		}
-		if (addr + mm->cached_hole_size < vma->vm_start)
-			mm->cached_hole_size = vma->vm_start - addr;
-		addr = ALIGN(vma->vm_end, huge_page_size(h));
-	}
+	info.flags = 0;
+	info.length = len;
+	info.low_limit = TASK_UNMAPPED_BASE;
+	info.high_limit = TASK_SIZE;
+	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+	info.align_offset = 0;
+	return vm_unmapped_area(&info);
 }
 #endif
 
@@ -608,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
 	int rc;
 
 	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
-	if (rc)
+	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
 	migrate_page_copy(newpage, page);
 
-	return 0;
+	return MIGRATEPAGE_SUCCESS;
 }
 
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -923,7 +897,7 @@ static struct file_system_type hugetlbfs_fs_type = {
 	.kill_sb	= kill_litter_super,
 };
 
-static struct vfsmount *hugetlbfs_vfsmount;
+static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
 
 static int can_do_hugetlb_shm(void)
 {
@@ -932,9 +906,22 @@ static int can_do_hugetlb_shm(void)
 	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
 }
 
+static int get_hstate_idx(int page_size_log)
+{
+	struct hstate *h;
+
+	if (!page_size_log)
+		return default_hstate_idx;
+	h = size_to_hstate(1 << page_size_log);
+	if (!h)
+		return -1;
+	return h - hstates;
+}
+
 struct file *hugetlb_file_setup(const char *name, unsigned long addr,
 				size_t size, vm_flags_t acctflag,
-				struct user_struct **user, int creat_flags)
+				struct user_struct **user,
+				int creat_flags, int page_size_log)
 {
 	int error = -ENOMEM;
 	struct file *file;
@@ -944,9 +931,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
 	struct qstr quick_string;
 	struct hstate *hstate;
 	unsigned long num_pages;
+	int hstate_idx;
+
+	hstate_idx = get_hstate_idx(page_size_log);
+	if (hstate_idx < 0)
+		return ERR_PTR(-ENODEV);
 
 	*user = NULL;
-	if (!hugetlbfs_vfsmount)
+	if (!hugetlbfs_vfsmount[hstate_idx])
 		return ERR_PTR(-ENOENT);
 
 	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
@@ -963,7 +955,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
 		}
 	}
 
-	root = hugetlbfs_vfsmount->mnt_root;
+	root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
 	quick_string.name = name;
 	quick_string.len = strlen(quick_string.name);
 	quick_string.hash = 0;
@@ -971,7 +963,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
 	if (!path.dentry)
 		goto out_shm_unlock;
 
-	path.mnt = mntget(hugetlbfs_vfsmount);
+	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
 	error = -ENOSPC;
 	inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
 	if (!inode)
@@ -1011,8 +1003,9 @@ out_shm_unlock:
 
 static int __init init_hugetlbfs_fs(void)
 {
+	struct hstate *h;
 	int error;
-	struct vfsmount *vfsmount;
+	int i;
 
 	error = bdi_init(&hugetlbfs_backing_dev_info);
 	if (error)
@@ -1029,14 +1022,26 @@ static int __init init_hugetlbfs_fs(void)
 	if (error)
 		goto out;
 
-	vfsmount = kern_mount(&hugetlbfs_fs_type);
+	i = 0;
+	for_each_hstate(h) {
+		char buf[50];
+		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
 
-	if (!IS_ERR(vfsmount)) {
-		hugetlbfs_vfsmount = vfsmount;
-		return 0;
-	}
+		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
+		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
+							buf);
 
-	error = PTR_ERR(vfsmount);
+		if (IS_ERR(hugetlbfs_vfsmount[i])) {
+			pr_err("hugetlb: Cannot mount internal hugetlbfs for "
+				"page size %uK", ps_kb);
+			error = PTR_ERR(hugetlbfs_vfsmount[i]);
+			hugetlbfs_vfsmount[i] = NULL;
+		}
+		i++;
+	}
+	/* Non default hstates are optional */
+	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
+		return 0;
 
  out:
 	kmem_cache_destroy(hugetlbfs_inode_cachep);
@@ -1047,13 +1052,19 @@
 
 static void __exit exit_hugetlbfs_fs(void)
 {
+	struct hstate *h;
+	int i;
+
+
 	/*
 	 * Make sure all delayed rcu free inodes are flushed before we
 	 * destroy cache.
 	 */
 	rcu_barrier();
 	kmem_cache_destroy(hugetlbfs_inode_cachep);
-	kern_unmount(hugetlbfs_vfsmount);
+	i = 0;
+	for_each_hstate(h)
+		kern_unmount(hugetlbfs_vfsmount[i++]);
 	unregister_filesystem(&hugetlbfs_fs_type);
 	bdi_destroy(&hugetlbfs_backing_dev_info);
 }
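
With one internal vfsmount per hstate, hugetlb_file_setup() callers pick a page size by passing its log2; 0 selects the default hstate and an unknown size yields -ENODEV. A sketch of the kind of call site this enables, assuming the SHM_HUGE_SHIFT/SHM_HUGE_MASK flag encoding from the related ipc patches (those names are assumptions here, not defined in this hunk):

	file = hugetlb_file_setup(name, 0, size, acctflag, &user,
				  HUGETLB_SHMFS_INODE,
				  (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
	if (IS_ERR(file))	/* e.g. -ENODEV: no hstate for that size */
		return PTR_ERR(file);
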
diff --git a/fs/inode.c b/fs/inode.c
index b03c71957246..14084b72b259 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	mapping->host = inode;
 	mapping->flags = 0;
 	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
-	mapping->assoc_mapping = NULL;
+	mapping->private_data = NULL;
 	mapping->backing_dev_info = &default_backing_dev_info;
 	mapping->writeback_index = 0;
 
@@ -408,6 +408,19 @@ static void inode_lru_list_add(struct inode *inode)
 	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
+/*
+ * Add inode to LRU if needed (inode is unused and clean).
+ *
+ * Needs inode->i_lock held.
+ */
+void inode_add_lru(struct inode *inode)
+{
+	if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
+	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
+		inode_lru_list_add(inode);
+}
+
+
 static void inode_lru_list_del(struct inode *inode)
 {
 	spin_lock(&inode->i_sb->s_inode_lru_lock);
@@ -1390,8 +1403,7 @@ static void iput_final(struct inode *inode)
 
 	if (!drop && (sb->s_flags & MS_ACTIVE)) {
 		inode->i_state |= I_REFERENCED;
-		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-			inode_lru_list_add(inode);
+		inode_add_lru(inode);
 		spin_unlock(&inode->i_lock);
 		return;
 	}
diff --git a/fs/internal.h b/fs/internal.h
index 916b7cbf3e3e..2f6af7f645eb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -110,6 +110,7 @@ extern int open_check_o_direct(struct file *f);
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
+extern void inode_add_lru(struct inode *inode);
 
 /*
  * fs-writeback.c
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 78b7f84241d4..071d6905f0dd 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1259,7 +1259,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
1259 goto not_jbd; 1259 goto not_jbd;
1260 } 1260 }
1261 1261
1262 /* keep track of wether or not this transaction modified us */ 1262 /* keep track of whether or not this transaction modified us */
1263 was_modified = jh->b_modified; 1263 was_modified = jh->b_modified;
1264 1264
1265 /* 1265 /*
@@ -1961,7 +1961,9 @@ retry:
 			spin_unlock(&journal->j_list_lock);
 			jbd_unlock_bh_state(bh);
 			spin_unlock(&journal->j_state_lock);
+			unlock_buffer(bh);
 			log_wait_commit(journal, tid);
+			lock_buffer(bh);
 			goto retry;
 		}
 		/*
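The unlock_buffer()/lock_buffer() pair applies the usual deadlock rule: never sleep waiting for a commit while holding a buffer lock that the committing thread may itself need. The general shape, with the predicate and tid lookup as hypothetical stand-ins:

	lock_buffer(bh);
	while (buffer_held_by_committing_transaction(bh)) {	/* hypothetical test */
		tid_t tid = buffer_commit_tid(bh);		/* hypothetical helper */

		unlock_buffer(bh);	/* drop the lock before sleeping... */
		log_wait_commit(journal, tid);
		lock_buffer(bh);	/* ...retake it, then re-check */
	}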
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a74ba4659549..d8da40e99d84 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1261,7 +1261,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
 		goto not_jbd;
 	}

-	/* keep track of wether or not this transaction modified us */
+	/* keep track of whether or not this transaction modified us */
 	was_modified = jh->b_modified;

 	/*
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 60ef3fb707ff..1506673c087e 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -138,33 +138,39 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	struct page *pg;
 	struct inode *inode = mapping->host;
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_raw_inode ri;
+	uint32_t alloc_len = 0;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	uint32_t pageofs = index << PAGE_CACHE_SHIFT;
 	int ret = 0;

+	jffs2_dbg(1, "%s()\n", __func__);
+
+	if (pageofs > inode->i_size) {
+		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+		if (ret)
+			return ret;
+	}
+
+	mutex_lock(&f->sem);
 	pg = grab_cache_page_write_begin(mapping, index, flags);
-	if (!pg)
+	if (!pg) {
+		if (alloc_len)
+			jffs2_complete_reservation(c);
+		mutex_unlock(&f->sem);
 		return -ENOMEM;
+	}
 	*pagep = pg;

-	jffs2_dbg(1, "%s()\n", __func__);
-
-	if (pageofs > inode->i_size) {
+	if (alloc_len) {
 		/* Make new hole frag from old EOF to new page */
-		struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
-		struct jffs2_raw_inode ri;
 		struct jffs2_full_dnode *fn;
-		uint32_t alloc_len;

 		jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
 			  (unsigned int)inode->i_size, pageofs);

-		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
-					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
-		if (ret)
-			goto out_page;
-
-		mutex_lock(&f->sem);
 		memset(&ri, 0, sizeof(ri));

 		ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -191,7 +197,6 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 		if (IS_ERR(fn)) {
 			ret = PTR_ERR(fn);
 			jffs2_complete_reservation(c);
-			mutex_unlock(&f->sem);
 			goto out_page;
 		}
 		ret = jffs2_add_full_dnode_to_inode(c, f, fn);
@@ -206,12 +211,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 			jffs2_mark_node_obsolete(c, fn->raw);
 			jffs2_free_full_dnode(fn);
 			jffs2_complete_reservation(c);
-			mutex_unlock(&f->sem);
 			goto out_page;
 		}
 		jffs2_complete_reservation(c);
 		inode->i_size = pageofs;
-		mutex_unlock(&f->sem);
 	}

 	/*
@@ -220,18 +223,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	 * case of a short-copy.
 	 */
 	if (!PageUptodate(pg)) {
-		mutex_lock(&f->sem);
 		ret = jffs2_do_readpage_nolock(inode, pg);
-		mutex_unlock(&f->sem);
 		if (ret)
 			goto out_page;
 	}
+	mutex_unlock(&f->sem);
 	jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
 	return ret;

out_page:
 	unlock_page(pg);
 	page_cache_release(pg);
+	mutex_unlock(&f->sem);
 	return ret;
 }

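The reshuffle above fixes a lock-ordering problem: jffs2_reserve_space() can trigger garbage collection and must run before any locks are taken, then f->sem, then the page lock; every error path has to unwind in the opposite order. A hedged sketch of the unwind the new out_page label performs (the wrapper function is hypothetical; the names come from the hunk):

static void write_begin_unwind(struct jffs2_sb_info *c,
			       struct jffs2_inode_info *f,
			       struct page *pg, uint32_t alloc_len)
{
	unlock_page(pg);			/* reverse order: page lock first */
	page_cache_release(pg);
	if (alloc_len)
		jffs2_complete_reservation(c);	/* then any flash reservation */
	mutex_unlock(&f->sem);			/* and finally the inode mutex */
}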
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index adb90116d36b..af49e2d6941a 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -33,7 +33,7 @@
  * are being written out - and waiting for GC to make progress, naturally.
  *
  * So we cannot just call iget() or some variant of it, but first have to check
- * wether the inode in question might be in I_FREEING state. Therefore we
+ * whether the inode in question might be in I_FREEING state. Therefore we
  * maintain our own per-sb list of "almost deleted" inodes and check against
  * that list first. Normally this should be at most 1-2 entries long.
  *
diff --git a/fs/namei.c b/fs/namei.c
index 937f9d50c84b..5f4cdf3ad913 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2131,6 +2131,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	if (!len)
 		return ERR_PTR(-EACCES);

+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
 	while (len--) {
 		c = *(const unsigned char *)name++;
 		if (c == '/' || c == '\0')
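lookup_one_len() now refuses "." and ".." outright instead of relying on callers to filter them. A userspace mirror of the check, runnable as-is, to make the two-branch condition concrete (len < 2 can only mean len == 1 once name[0] == '.'):

#include <stdbool.h>

static bool acceptable_one_len_name(const char *name, int len)
{
	if (len == 0)
		return false;
	if (name[0] == '.' && (len < 2 || (len == 2 && name[1] == '.')))
		return false;			/* "." or ".." */
	for (int i = 0; i < len; i++)
		if (name[i] == '/' || name[i] == '\0')
			return false;		/* same scan as the loop above */
	return true;
}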
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index be20a7e171a0..63d14a99483d 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
 	/*
 	 * If I understand ncp_read_kernel() properly, the above always
 	 * fetches from the network, here the analogue of disk.
-	 * -- wli
+	 * -- nyc
 	 */
 	count_vm_event(PGMAJFAULT);
 	mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ce8cb926526b..b9e66b7e0c14 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -450,7 +450,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 			nfs_refresh_inode(dentry->d_inode, entry->fattr);
 			goto out;
 		} else {
-			d_drop(dentry);
+			if (d_invalidate(dentry) != 0)
+				goto out;
 			dput(dentry);
 		}
 	}
@@ -1100,6 +1101,8 @@ out_set_verifier:
out_zap_parent:
 	nfs_zap_caches(dir);
 out_bad:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
 	nfs_mark_for_revalidate(dir);
 	if (inode && S_ISDIR(inode->i_mode)) {
 		/* Purge readdir caches. */
@@ -1112,8 +1115,6 @@ out_zap_parent:
 		shrink_dcache_parent(dentry);
 	}
 	d_drop(dentry);
-	nfs_free_fattr(fattr);
-	nfs_free_fhandle(fhandle);
 	dput(parent);
 	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
 			__func__, dentry->d_parent->d_name.name,
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 31c26c4dcc23..ca4b11ec87a2 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -217,7 +217,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
 {
 	char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
 	struct nfs_dns_ent key, *item;
-	unsigned long ttl;
+	unsigned int ttl;
 	ssize_t len;
 	int ret = -EINVAL;

@@ -240,7 +240,8 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
 	key.namelen = len;
 	memset(&key.h, 0, sizeof(key.h));

-	ttl = get_expiry(&buf);
+	if (get_uint(&buf, &ttl) < 0)
+		goto out;
 	if (ttl == 0)
 		goto out;
 	key.h.expiry_time = ttl + seconds_since_boot();
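Switching from get_expiry() to get_uint() makes a malformed TTL an error instead of silently producing a bogus expiry, and the narrower unsigned int matches what the cache stores. A userspace sketch of the stricter parse under the same rules (reject non-numeric, trailing junk, and out-of-range values):

#include <errno.h>
#include <limits.h>
#include <stdlib.h>

static int parse_ttl(const char *s, unsigned int *ttl)
{
	char *end;
	unsigned long v;

	errno = 0;
	v = strtoul(s, &end, 10);
	if (errno || end == s || *end != '\0' || v > UINT_MAX)
		return -EINVAL;		/* fail instead of guessing */
	*ttl = (unsigned int)v;
	return 0;
}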
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5c7325c5c5e6..6fa01aea2488 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -685,7 +685,10 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 	if (ctx->cred != NULL)
 		put_rpccred(ctx->cred);
 	dput(ctx->dentry);
-	nfs_sb_deactive(sb);
+	if (is_sync)
+		nfs_sb_deactive(sb);
+	else
+		nfs_sb_deactive_async(sb);
 	kfree(ctx->mdsthreshold);
 	kfree(ctx);
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 59b133c5d652..05521cadac2e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -351,10 +351,12 @@ extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
 extern void nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
+extern void nfs_sb_deactive_async(struct super_block *sb);

 /* namespace.c */
+#define NFS_PATH_CANONICAL 1
 extern char *nfs_path(char **p, struct dentry *dentry,
-		      char *buffer, ssize_t buflen);
+		      char *buffer, ssize_t buflen, unsigned flags);
 extern struct vfsmount *nfs_d_automount(struct path *path);
 struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *,
 			      struct nfs_fh *, struct nfs_fattr *);
@@ -498,7 +500,7 @@ static inline char *nfs_devname(struct dentry *dentry,
 			       char *buffer, ssize_t buflen)
 {
 	char *dummy;
-	return nfs_path(&dummy, dentry, buffer, buflen);
+	return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL);
 }

 /*
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 8e65c7f1f87c..015f71f8f62c 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -181,7 +181,7 @@ int nfs_mount(struct nfs_mount_request *info)
 	else
 		msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT];

-	status = rpc_call_sync(mnt_clnt, &msg, 0);
+	status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT);
 	rpc_shutdown_client(mnt_clnt);

 	if (status < 0)
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 655925373b91..dd057bc6b65b 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -33,6 +33,7 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
  * @dentry - pointer to dentry
  * @buffer - result buffer
  * @buflen - length of buffer
+ * @flags - options (see below)
  *
  * Helper function for constructing the server pathname
  * by arbitrary hashed dentry.
@@ -40,8 +41,14 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
  * This is mainly for use in figuring out the path on the
  * server side when automounting on top of an existing partition
  * and in generating /proc/mounts and friends.
+ *
+ * Supported flags:
+ * NFS_PATH_CANONICAL: ensure there is exactly one slash after
+ *		       the original device (export) name
+ *		       (if unset, the original name is returned verbatim)
  */
-char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen)
+char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen,
+	       unsigned flags)
 {
 	char *end;
 	int namelen;
@@ -74,7 +81,7 @@ rename_retry:
 		rcu_read_unlock();
 		goto rename_retry;
 	}
-	if (*end != '/') {
+	if ((flags & NFS_PATH_CANONICAL) && *end != '/') {
 		if (--buflen < 0) {
 			spin_unlock(&dentry->d_lock);
 			rcu_read_unlock();
@@ -91,9 +98,11 @@ rename_retry:
 		return end;
 	}
 	namelen = strlen(base);
-	/* Strip off excess slashes in base string */
-	while (namelen > 0 && base[namelen - 1] == '/')
-		namelen--;
+	if (flags & NFS_PATH_CANONICAL) {
+		/* Strip off excess slashes in base string */
+		while (namelen > 0 && base[namelen - 1] == '/')
+			namelen--;
+	}
 	buflen -= namelen;
 	if (buflen < 0) {
 		spin_unlock(&dentry->d_lock);
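With NFS_PATH_CANONICAL set, nfs_path() strips the export name's trailing slashes and guarantees exactly one separator before the dentry path; without it, the admin-supplied device string is kept verbatim (what /proc/mounts wants). A runnable userspace illustration of just that prefix handling (emit_devname() is a hypothetical helper, not kernel code):

#include <stdio.h>
#include <string.h>

static void emit_devname(const char *export, const char *rel, int canonical)
{
	size_t n = strlen(export);

	if (canonical)
		while (n > 0 && export[n - 1] == '/')
			n--;			/* strip excess slashes */
	printf("%.*s/%s\n", (int)n, export, rel);
}

/* emit_devname("srv:/a///", "b", 1) prints "srv:/a/b";
 * with canonical == 0 the prefix stays verbatim: "srv:/a////b" */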
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 79fbb61ce202..1e09eb78543b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -81,7 +81,8 @@ static char *nfs_path_component(const char *nfspath, const char *end)
 static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
 {
 	char *limit;
-	char *path = nfs_path(&limit, dentry, buffer, buflen);
+	char *path = nfs_path(&limit, dentry, buffer, buflen,
+			      NFS_PATH_CANONICAL);
 	if (!IS_ERR(path)) {
 		char *path_component = nfs_path_component(path, limit);
 		if (path_component)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 68b21d81b7ac..5eec4429970c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -339,8 +339,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 			dprintk("%s ERROR: %d Reset session\n", __func__,
 				errorcode);
 			nfs4_schedule_session_recovery(clp->cl_session, errorcode);
-			exception->retry = 1;
-			break;
+			goto wait_on_recovery;
 #endif /* defined(CONFIG_NFS_V4_1) */
 		case -NFS4ERR_FILE_OPEN:
 			if (exception->timeout > HZ) {
@@ -1572,9 +1571,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->o_arg.server,
 				&data->o_arg.seq_args,
-				&data->o_res.seq_res, task))
-		return;
-	rpc_call_start(task);
+				&data->o_res.seq_res,
+				task) != 0)
+		nfs_release_seqid(data->o_arg.seqid);
+	else
+		rpc_call_start(task);
 	return;
unlock_no_action:
 	rcu_read_unlock();
@@ -1748,7 +1749,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred,

 	/* even though OPEN succeeded, access is denied. Close the file */
 	nfs4_close_state(state, fmode);
-	return -NFS4ERR_ACCESS;
+	return -EACCES;
 }

 /*
@@ -2196,7 +2197,7 @@ static void nfs4_free_closedata(void *data)
 	nfs4_put_open_state(calldata->state);
 	nfs_free_seqid(calldata->arg.seqid);
 	nfs4_put_state_owner(sp);
-	nfs_sb_deactive(sb);
+	nfs_sb_deactive_async(sb);
 	kfree(calldata);
 }

@@ -2296,9 +2297,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	if (nfs4_setup_sequence(NFS_SERVER(inode),
 				&calldata->arg.seq_args,
 				&calldata->res.seq_res,
-				task))
-		goto out;
-	rpc_call_start(task);
+				task) != 0)
+		nfs_release_seqid(calldata->arg.seqid);
+	else
+		rpc_call_start(task);
out:
 	dprintk("%s: done!\n", __func__);
 }
@@ -4529,6 +4531,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 		if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
 			rpc_restart_call_prepare(task);
 	}
+	nfs_release_seqid(calldata->arg.seqid);
 }

 static void nfs4_locku_prepare(struct rpc_task *task, void *data)
@@ -4545,9 +4548,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 	calldata->timestamp = jiffies;
 	if (nfs4_setup_sequence(calldata->server,
 				&calldata->arg.seq_args,
-				&calldata->res.seq_res, task))
-		return;
-	rpc_call_start(task);
+				&calldata->res.seq_res,
+				task) != 0)
+		nfs_release_seqid(calldata->arg.seqid);
+	else
+		rpc_call_start(task);
 }

 static const struct rpc_call_ops nfs4_locku_ops = {
@@ -4692,7 +4697,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	/* Do we need to do an open_to_lock_owner? */
 	if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
 		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0)
-			return;
+			goto out_release_lock_seqid;
 		data->arg.open_stateid = &state->stateid;
 		data->arg.new_lock_owner = 1;
 		data->res.open_seqid = data->arg.open_seqid;
@@ -4701,10 +4706,15 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->server,
 				&data->arg.seq_args,
-				&data->res.seq_res, task))
+				&data->res.seq_res,
+				task) == 0) {
+		rpc_call_start(task);
 		return;
-	rpc_call_start(task);
-	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
+	}
+	nfs_release_seqid(data->arg.open_seqid);
+out_release_lock_seqid:
+	nfs_release_seqid(data->arg.lock_seqid);
+	dprintk("%s: done!, ret = %d\n", __func__, task->tk_status);
 }

 static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata)
@@ -5667,7 +5677,7 @@ static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl,
 		tbl->slots = new;
 		tbl->max_slots = max_slots;
 	}
-	tbl->highest_used_slotid = -1;	/* no slot is currently used */
+	tbl->highest_used_slotid = NFS4_NO_SLOT;
 	for (i = 0; i < tbl->max_slots; i++)
 		tbl->slots[i].seq_nr = ivalue;
 	spin_unlock(&tbl->slot_tbl_lock);
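All four prepare/done paths above share one fix: when nfs4_setup_sequence() fails (or the lock setup is abandoned), the held seqid must be released, otherwise every later OPEN/CLOSE/LOCK serialized on the same state owner waits forever. The common shape, sketched with a hypothetical calldata struct:

static void example_prepare(struct rpc_task *task, struct example_calldata *d)
{
	if (nfs4_setup_sequence(d->server, &d->args.seq_args,
				&d->res.seq_res, task) != 0)
		nfs_release_seqid(d->args.seqid);	/* unblock the owner's queue */
	else
		rpc_call_start(task);
}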
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index fe624c91bd00..2878f97bd78d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -925,8 +925,8 @@ pnfs_find_alloc_layout(struct inode *ino,
 	if (likely(nfsi->layout == NULL)) {	/* Won the race? */
 		nfsi->layout = new;
 		return new;
-	}
-	pnfs_free_layout_hdr(new);
+	} else if (new != NULL)
+		pnfs_free_layout_hdr(new);
out_existing:
 	pnfs_get_layout_hdr(nfsi->layout);
 	return nfsi->layout;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e831bce49766..652d3f7176a9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -54,6 +54,7 @@
 #include <linux/parser.h>
 #include <linux/nsproxy.h>
 #include <linux/rcupdate.h>
+#include <linux/kthread.h>

 #include <asm/uaccess.h>

@@ -415,6 +416,54 @@ void nfs_sb_deactive(struct super_block *sb)
 }
 EXPORT_SYMBOL_GPL(nfs_sb_deactive);

+static int nfs_deactivate_super_async_work(void *ptr)
+{
+	struct super_block *sb = ptr;
+
+	deactivate_super(sb);
+	module_put_and_exit(0);
+	return 0;
+}
+
+/*
+ * same effect as deactivate_super, but will do final unmount in kthread
+ * context
+ */
+static void nfs_deactivate_super_async(struct super_block *sb)
+{
+	struct task_struct *task;
+	char buf[INET6_ADDRSTRLEN + 1];
+	struct nfs_server *server = NFS_SB(sb);
+	struct nfs_client *clp = server->nfs_client;
+
+	if (!atomic_add_unless(&sb->s_active, -1, 1)) {
+		rcu_read_lock();
+		snprintf(buf, sizeof(buf),
+			 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+		rcu_read_unlock();
+
+		__module_get(THIS_MODULE);
+		task = kthread_run(nfs_deactivate_super_async_work, sb,
+				   "%s-deactivate-super", buf);
+		if (IS_ERR(task)) {
+			pr_err("%s: kthread_run: %ld\n",
+			       __func__, PTR_ERR(task));
+			/* make synchronous call and hope for the best */
+			deactivate_super(sb);
+			module_put(THIS_MODULE);
+		}
+	}
+}
+
+void nfs_sb_deactive_async(struct super_block *sb)
+{
+	struct nfs_server *server = NFS_SB(sb);
+
+	if (atomic_dec_and_test(&server->active))
+		nfs_deactivate_super_async(sb);
+}
+EXPORT_SYMBOL_GPL(nfs_sb_deactive_async);
+
 /*
  * Deliver file system statistics to userspace
  */
@@ -771,7 +820,7 @@ int nfs_show_devname(struct seq_file *m, struct dentry *root)
 	int err = 0;
 	if (!page)
 		return -ENOMEM;
-	devname = nfs_path(&dummy, root, page, PAGE_SIZE);
+	devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0);
 	if (IS_ERR(devname))
 		err = PTR_ERR(devname);
 	else
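nfs_sb_deactive_async() exists because the final deactivate_super() can issue blocking RPCs; callers running in rpciod or other async contexts must not do that inline. A sketch of the resulting call-site rule (ctx_is_sync mirrors the is_sync flag in the __put_nfs_open_context hunk earlier):

	if (ctx_is_sync)
		nfs_sb_deactive(sb);		/* safe to block on final unmount */
	else
		nfs_sb_deactive_async(sb);	/* hand deactivate_super to a kthread */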
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 13cea637eff8..3f79c77153b8 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata)

 	nfs_dec_sillycount(data->dir);
 	nfs_free_unlinkdata(data);
-	nfs_sb_deactive(sb);
+	nfs_sb_deactive_async(sb);
 }

 static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 3e7b2a0dc0c8..07f76db04ec7 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
 	mapping->host = inode;
 	mapping->flags = 0;
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	mapping->assoc_mapping = NULL;
+	mapping->private_data = NULL;
 	mapping->backing_dev_info = bdi;
 	mapping->a_ops = &empty_aops;
 }
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 7dceff005a67..e5f911bd80d2 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -4,7 +4,7 @@ config FANOTIFY
 	select ANON_INODES
 	default n
 	---help---
-	   Say Y here to enable fanotify suport. fanotify is a file access
+	   Say Y here to enable fanotify support. fanotify is a file access
 	   notification system which differs from inotify in that it sends
 	   an open file descriptor to the userspace listener along with
 	   the event.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f35794b97e8e..a50636025364 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -21,6 +21,7 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
 		if ((old->path.mnt == new->path.mnt) &&
 		    (old->path.dentry == new->path.dentry))
 			return true;
+		break;
 	case (FSNOTIFY_EVENT_NONE):
 		return true;
 	default:
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 721d692fa8d4..6fcaeb8c902e 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -258,7 +258,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	if (ret)
 		goto out_close_fd;

-	fd_install(fd, f);
+	if (fd != FAN_NOFD)
+		fd_install(fd, f);
 	return fanotify_event_metadata.event_len;

out_close_fd:
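FAN_NOFD marks events (such as queue overflows) for which no file descriptor was created, so the kernel must not fd_install() into that slot and userspace must check before using the field. A hedged sketch of the consumer side (both handlers are hypothetical):

static void consume_event(const struct fanotify_event_metadata *md)
{
	if (md->fd == FAN_NOFD) {
		note_queue_overflow();	/* hypothetical: no fd to read or close */
		return;
	}
	handle_event_fd(md->fd);	/* hypothetical: caller must close(md->fd) */
}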
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index c887b1378f7e..48cb994e4922 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -18,7 +18,7 @@

 /*
  * Basic idea behind the notification queue: An fsnotify group (like inotify)
- * sends the userspace notification about events asyncronously some time after
+ * sends the userspace notification about events asynchronously some time after
  * the event happened. When inotify gets an event it will need to add that
  * event to the group notify queue. Since a single event might need to be on
  * multiple group's notification queues we can't add the event directly to each
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5a4ee77cec51..dda089804942 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2513,18 +2513,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 	ret = sd.num_spliced;

 	if (ret > 0) {
-		unsigned long nr_pages;
 		int err;

-		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
 		err = generic_write_sync(out, *ppos, ret);
 		if (err)
 			ret = err;
 		else
 			*ppos += ret;

-		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+		balance_dirty_pages_ratelimited(mapping);
 	}

 	return ret;
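balance_dirty_pages_ratelimited() now derives its own ratelimit, so callers drop the page-count argument and the round-up that fed it. For reference, the deleted arithmetic was the standard bytes-to-pages conversion; a one-line sketch:

static inline unsigned long bytes_to_pages(size_t bytes)
{
	/* round up: with 4096-byte pages, 4097 bytes -> 2 pages */
	return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
}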
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c1c207c36cae..d3696708fc1a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,

 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			thread_group_times(task, &utime, &stime);
+			thread_group_cputime_adjusted(task, &utime, &stime);
 			gtime += sig->gtime;
 		}

@@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	if (!whole) {
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		task_times(task, &utime, &stime);
+		task_cputime_adjusted(task, &utime, &stime);
 		gtime = task->gtime;
 	}

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 144a96732dd7..aa63d25157b8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -873,12 +873,119 @@ static const struct file_operations proc_environ_operations = {
 	.release	= mem_release,
 };

+static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	char buffer[PROC_NUMBUF];
+	int oom_adj = OOM_ADJUST_MIN;
+	size_t len;
+	unsigned long flags;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+			oom_adj = OOM_ADJUST_MAX;
+		else
+			oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+				  OOM_SCORE_ADJ_MAX;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_adj_write(struct file *file, const char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	int oom_adj;
+	unsigned long flags;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = kstrtoint(strstrip(buffer), 0, &oom_adj);
+	if (err)
+		goto out;
+	if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
+	     oom_adj != OOM_DISABLE) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
+	if (!lock_task_sighand(task, &flags)) {
+		err = -ESRCH;
+		goto err_task_lock;
+	}
+
+	/*
+	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+	 * value is always attainable.
+	 */
+	if (oom_adj == OOM_ADJUST_MAX)
+		oom_adj = OOM_SCORE_ADJ_MAX;
+	else
+		oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+
+	if (oom_adj < task->signal->oom_score_adj &&
+	    !capable(CAP_SYS_RESOURCE)) {
+		err = -EACCES;
+		goto err_sighand;
+	}
+
+	/*
+	 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+	 * /proc/pid/oom_score_adj instead.
+	 */
+	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+		    current->comm, task_pid_nr(current), task_pid_nr(task),
+		    task_pid_nr(task));
+
+	task->signal->oom_score_adj = oom_adj;
+	trace_oom_score_adj_update(task);
+err_sighand:
+	unlock_task_sighand(task, &flags);
+err_task_lock:
+	task_unlock(task);
+	put_task_struct(task);
+out:
+	return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_adj_operations = {
+	.read		= oom_adj_read,
+	.write		= oom_adj_write,
+	.llseek		= generic_file_llseek,
+};
+
 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 				  size_t count, loff_t *ppos)
 {
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 	char buffer[PROC_NUMBUF];
-	int oom_score_adj = OOM_SCORE_ADJ_MIN;
+	short oom_score_adj = OOM_SCORE_ADJ_MIN;
 	unsigned long flags;
 	size_t len;

@@ -889,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 		unlock_task_sighand(task, &flags);
 	}
 	put_task_struct(task);
-	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+	len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
 	return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }

@@ -936,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		goto err_task_lock;
 	}

-	if (oom_score_adj < task->signal->oom_score_adj_min &&
+	if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
 			!capable(CAP_SYS_RESOURCE)) {
 		err = -EACCES;
 		goto err_sighand;
 	}

-	task->signal->oom_score_adj = oom_score_adj;
+	task->signal->oom_score_adj = (short)oom_score_adj;
 	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
-		task->signal->oom_score_adj_min = oom_score_adj;
+		task->signal->oom_score_adj_min = (short)oom_score_adj;
 	trace_oom_score_adj_update(task);

err_sighand:
@@ -1770,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
 	if (!vma)
 		goto out_no_vma;

-	result = proc_map_files_instantiate(dir, dentry, task,
-			(void *)(unsigned long)vma->vm_file->f_mode);
+	if (vma->vm_file)
+		result = proc_map_files_instantiate(dir, dentry, task,
+				(void *)(unsigned long)vma->vm_file->f_mode);

out_no_vma:
 	up_read(&mm->mmap_sem);
@@ -2598,6 +2706,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2598 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2706 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2599#endif 2707#endif
2600 INF("oom_score", S_IRUGO, proc_oom_score), 2708 INF("oom_score", S_IRUGO, proc_oom_score),
2709 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2601 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2710 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2602#ifdef CONFIG_AUDITSYSCALL 2711#ifdef CONFIG_AUDITSYSCALL
2603 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2712 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -2964,6 +3073,7 @@ static const struct pid_entry tid_base_stuff[] = {
2964 REG("cgroup", S_IRUGO, proc_cgroup_operations), 3073 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2965#endif 3074#endif
2966 INF("oom_score", S_IRUGO, proc_oom_score), 3075 INF("oom_score", S_IRUGO, proc_oom_score),
3076 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2967 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 3077 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2968#ifdef CONFIG_AUDITSYSCALL 3078#ifdef CONFIG_AUDITSYSCALL
2969 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 3079 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 86c67eee439f..e96d4f18ca3a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -249,7 +249,7 @@ static int kcore_update_ram(void)
 	/* Not inialized....update now */
 	/* find out "max pfn" */
 	end_pfn = 0;
-	for_each_node_state(nid, N_HIGH_MEMORY) {
+	for_each_node_state(nid, N_MEMORY) {
 		unsigned long node_end;
 		node_end = NODE_DATA(nid)->node_start_pfn +
 			NODE_DATA(nid)->node_spanned_pages;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index a781bdf06694..701580ddfcc3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -378,12 +378,13 @@ static int test_perm(int mode, int op)
 	return -EACCES;
 }

-static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
+static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
 {
+	struct ctl_table_root *root = head->root;
 	int mode;

 	if (root->permissions)
-		mode = root->permissions(root, current->nsproxy, table);
+		mode = root->permissions(head, table);
 	else
 		mode = table->mode;

@@ -491,7 +492,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 	 * and won't be until we finish.
 	 */
 	error = -EPERM;
-	if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
+	if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
 		goto out;

 	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
@@ -717,7 +718,7 @@ static int proc_sys_permission(struct inode *inode, int mask)
 	if (!table) /* global root - r-xr-xr-x */
 		error = mask & MAY_WRITE ? -EACCES : 0;
 	else /* Use the permissions on the sysctl table entry */
-		error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK);
+		error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);

 	sysctl_head_finish(head);
 	return error;
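Passing the ctl_table_header instead of the bare root lets a ->permissions hook reach the header's namespace context without the extra nsproxy argument. A sketch of a hook against the new signature (this example hook is hypothetical; the net sysctls are the in-tree user):

static int example_sysctl_permissions(struct ctl_table_header *head,
				      struct ctl_table *table)
{
	/* head->root (and the owning namespace) are now reachable here */
	return table->mode;
}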
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9392a5..48775628abbf 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	spinlock_t *ptl;
 	struct page *page;

-	split_huge_page_pmd(walk->mm, pmd);
+	split_huge_page_pmd(vma, addr, pmd);
 	if (pmd_trans_unstable(pmd))
 		return 0;

@@ -1126,7 +1126,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 		return NULL;

 	nid = page_to_nid(page);
-	if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+	if (!node_isset(nid, node_states[N_MEMORY]))
 		return NULL;

 	return page;
@@ -1279,7 +1279,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	if (md->writeback)
 		seq_printf(m, " writeback=%lu", md->writeback);

-	for_each_node_state(n, N_HIGH_MEMORY)
+	for_each_node_state(n, N_MEMORY)
 		if (md->node[n])
 			seq_printf(m, " N%d=%lu", n, md->node[n]);
out:
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 4ab572e6d277..ed1d8c7212da 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -49,6 +49,7 @@ struct pstore_private {
 	struct pstore_info *psi;
 	enum pstore_type_id type;
 	u64	id;
+	int	count;
 	ssize_t	size;
 	char	data[];
 };
@@ -175,7 +176,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
 	struct pstore_private *p = dentry->d_inode->i_private;

 	if (p->psi->erase)
-		p->psi->erase(p->type, p->id, p->psi);
+		p->psi->erase(p->type, p->id, p->count,
+			      dentry->d_inode->i_ctime, p->psi);

 	return simple_unlink(dir, dentry);
 }
@@ -270,7 +272,7 @@ int pstore_is_mounted(void)
  * Load it up with "size" bytes of data from "buf".
  * Set the mtime & ctime to the date that this record was originally stored.
  */
-int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
+int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 		  char *data, size_t size, struct timespec time,
 		  struct pstore_info *psi)
 {
@@ -306,6 +308,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
 		goto fail_alloc;
 	private->type = type;
 	private->id = id;
+	private->count = count;
 	private->psi = psi;

 	switch (type) {
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 4847f588b7d5..937d820f273c 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -50,7 +50,7 @@ extern struct pstore_info *psinfo;
 extern void	pstore_set_kmsg_bytes(int);
 extern void	pstore_get_records(int);
 extern int	pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
-			      char *data, size_t size,
+			      int count, char *data, size_t size,
 			      struct timespec time, struct pstore_info *psi);
 extern int	pstore_is_mounted(void);

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index a40da07e93d6..5ea2e77ff023 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -136,7 +136,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 			break;

 		ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
-				    hsize + len, psinfo);
+				    oopscount, hsize + len, psinfo);
 		if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
 			pstore_new_entry = 1;

@@ -161,6 +161,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)

 	while (s < e) {
 		unsigned long flags;
+		u64 id;

 		if (c > psinfo->bufsize)
 			c = psinfo->bufsize;
@@ -172,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 			spin_lock_irqsave(&psinfo->buf_lock, flags);
 		}
 		memcpy(psinfo->buf, s, c);
-		psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo);
+		psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo);
 		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 		s += c;
 		c = e - s;
@@ -196,7 +197,7 @@ static void pstore_register_console(void) {}

 static int pstore_write_compat(enum pstore_type_id type,
 			       enum kmsg_dump_reason reason,
-			       u64 *id, unsigned int part,
+			       u64 *id, unsigned int part, int count,
 			       size_t size, struct pstore_info *psi)
 {
 	return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi);
@@ -266,6 +267,7 @@ void pstore_get_records(int quiet)
 	char			*buf = NULL;
 	ssize_t			size;
 	u64			id;
+	int			count;
 	enum pstore_type_id	type;
 	struct timespec		time;
 	int			failed = 0, rc;
@@ -277,9 +279,9 @@ void pstore_get_records(int quiet)
 	if (psi->open && psi->open(psi))
 		goto out;

-	while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) {
-		rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size,
-				   time, psi);
+	while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) {
+		rc = pstore_mkfile(type, psi->name, id, count, buf,
+				   (size_t)size, time, psi);
 		kfree(buf);
 		buf = NULL;
 		if (rc && (rc != -EEXIST || !quiet))
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1a4f6da58eab..2bfa36e0ffe8 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -132,9 +132,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
 }

 static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
-				   struct timespec *time,
-				   char **buf,
-				   struct pstore_info *psi)
+				   int *count, struct timespec *time,
+				   char **buf, struct pstore_info *psi)
 {
 	ssize_t size;
 	struct ramoops_context *cxt = psi->data;
@@ -236,8 +235,8 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 	return 0;
 }

-static int ramoops_pstore_erase(enum pstore_type_id type, u64 id,
-				struct pstore_info *psi)
+static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
+				struct timespec time, struct pstore_info *psi)
 {
 	struct ramoops_context *cxt = psi->data;
 	struct persistent_ram_zone *prz;
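The widened pstore hooks thread a per-type record count (and, for erase, the timestamp) through the core so backends can tell apart records whose ids collide. A minimal backend read stub against the new signature -- my_record/my_next_record are hypothetical bookkeeping, not ramoops code:

static ssize_t my_pstore_read(u64 *id, enum pstore_type_id *type,
			      int *count, struct timespec *time,
			      char **buf, struct pstore_info *psi)
{
	struct my_record *rec = my_next_record(psi->data);	/* hypothetical */

	if (!rec)
		return 0;			/* no more records */
	*id = rec->id;
	*type = rec->type;
	*count = rec->count;	/* new: disambiguates same-id records */
	*time = rec->time;
	*buf = kmemdup(rec->data, rec->size, GFP_KERNEL);
	return *buf ? rec->size : -ENOMEM;
}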
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f27f01a98aa2..d83736fbc26c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1782,8 +1782,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,

 	BUG_ON(!th->t_trans_id);

-	dquot_initialize(inode);
+	reiserfs_write_unlock(inode->i_sb);
 	err = dquot_alloc_inode(inode);
+	reiserfs_write_lock(inode->i_sb);
 	if (err)
 		goto out_end_trans;
 	if (!dir->i_nlink) {
@@ -1979,8 +1980,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,

 out_end_trans:
 	journal_end(th, th->t_super, th->t_blocks_allocated);
+	reiserfs_write_unlock(inode->i_sb);
 	/* Drop can be outside and it needs more credits so it's better to have it outside */
 	dquot_drop(inode);
+	reiserfs_write_lock(inode->i_sb);
 	inode->i_flags |= S_NOQUOTA;
 	make_bad_inode(inode);

@@ -3103,10 +3106,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	/* must be turned off for recursive notify_change calls */
 	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);

-	depth = reiserfs_write_lock_once(inode->i_sb);
 	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
-
+	depth = reiserfs_write_lock_once(inode->i_sb);
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* version 2 items will be caught by the s_maxbytes check
 		** done for us in vmtruncate
@@ -3170,7 +3172,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 		error = journal_begin(&th, inode->i_sb, jbegin_count);
 		if (error)
 			goto out;
+		reiserfs_write_unlock_once(inode->i_sb, depth);
 		error = dquot_transfer(inode, attr);
+		depth = reiserfs_write_lock_once(inode->i_sb);
 		if (error) {
 			journal_end(&th, inode->i_sb, jbegin_count);
 			goto out;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index f8afa4b162b8..2f40a4c70a4d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1968,7 +1968,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 		       key2type(&(key->on_disk_key)));
 #endif

+	reiserfs_write_unlock(inode->i_sb);
 	retval = dquot_alloc_space_nodirty(inode, pasted_size);
+	reiserfs_write_lock(inode->i_sb);
 	if (retval) {
 		pathrelse(search_path);
 		return retval;
@@ -2061,9 +2063,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2061 "reiserquota insert_item(): allocating %u id=%u type=%c", 2063 "reiserquota insert_item(): allocating %u id=%u type=%c",
2062 quota_bytes, inode->i_uid, head2type(ih)); 2064 quota_bytes, inode->i_uid, head2type(ih));
2063#endif 2065#endif
2066 reiserfs_write_unlock(inode->i_sb);
2064 /* We can't dirty inode here. It would be immediately written but 2067 /* We can't dirty inode here. It would be immediately written but
2065 * appropriate stat item isn't inserted yet... */ 2068 * appropriate stat item isn't inserted yet... */
2066 retval = dquot_alloc_space_nodirty(inode, quota_bytes); 2069 retval = dquot_alloc_space_nodirty(inode, quota_bytes);
2070 reiserfs_write_lock(inode->i_sb);
2067 if (retval) { 2071 if (retval) {
2068 pathrelse(path); 2072 pathrelse(path);
2069 return retval; 2073 return retval;
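Every reiserfs hunk in this group follows the same recipe: the quota code takes locks (dqio_mutex and friends) that rank above the reiserfs write lock, so each dquot_* call gets bracketed by an unlock/relock pair. Sketched as a hypothetical wrapper to show the bracket on its own:

static int quota_alloc_outside_write_lock(struct inode *inode, int bytes)
{
	int retval;

	reiserfs_write_unlock(inode->i_sb);	/* quota locks rank above it */
	retval = dquot_alloc_space_nodirty(inode, bytes);
	reiserfs_write_lock(inode->i_sb);	/* retake for the journal */
	return retval;
}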
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1078ae179993..418bdc3a57da 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -298,7 +298,9 @@ static int finish_unfinished(struct super_block *s)
 			retval = remove_save_link_only(s, &save_link_key, 0);
 			continue;
 		}
+		reiserfs_write_unlock(s);
 		dquot_initialize(inode);
+		reiserfs_write_lock(s);

 		if (truncate && S_ISDIR(inode->i_mode)) {
 			/* We got a truncate request for a dir which is impossible.
@@ -1335,7 +1337,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 			kfree(qf_names[i]);
 #endif
 		err = -EINVAL;
-		goto out_err;
+		goto out_unlock;
 	}
 #ifdef CONFIG_QUOTA
 	handle_quota_files(s, qf_names, &qfmt);
@@ -1379,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	if (blocks) {
 		err = reiserfs_resize(s, blocks);
 		if (err != 0)
-			goto out_err;
+			goto out_unlock;
 	}

 	if (*mount_flags & MS_RDONLY) {
@@ -1389,9 +1391,15 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 			/* it is read-only already */
 			goto out_ok;

+		/*
+		 * Drop write lock. Quota will retake it when needed and lock
+		 * ordering requires calling dquot_suspend() without it.
+		 */
+		reiserfs_write_unlock(s);
 		err = dquot_suspend(s, -1);
 		if (err < 0)
 			goto out_err;
+		reiserfs_write_lock(s);

 		/* try to remount file system with read-only permissions */
 		if (sb_umount_state(rs) == REISERFS_VALID_FS
@@ -1401,7 +1409,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)

 		err = journal_begin(&th, s, 10);
 		if (err)
-			goto out_err;
+			goto out_unlock;

 		/* Mounting a rw partition read-only. */
 		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1416,7 +1424,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)

 		if (reiserfs_is_journal_aborted(journal)) {
 			err = journal->j_errno;
-			goto out_err;
+			goto out_unlock;
 		}

 		handle_data_mode(s, mount_options);
@@ -1425,7 +1433,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		s->s_flags &= ~MS_RDONLY;	/* now it is safe to call journal_begin */
 		err = journal_begin(&th, s, 10);
 		if (err)
-			goto out_err;
+			goto out_unlock;

 		/* Mount a partition which is read-only, read-write */
 		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1442,10 +1450,16 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1442 SB_JOURNAL(s)->j_must_wait = 1; 1450 SB_JOURNAL(s)->j_must_wait = 1;
1443 err = journal_end(&th, s, 10); 1451 err = journal_end(&th, s, 10);
1444 if (err) 1452 if (err)
1445 goto out_err; 1453 goto out_unlock;
1446 1454
1447 if (!(*mount_flags & MS_RDONLY)) { 1455 if (!(*mount_flags & MS_RDONLY)) {
1456 /*
1457 * Drop write lock. Quota will retake it when needed and lock
1458 * ordering requires calling dquot_resume() without it.
1459 */
1460 reiserfs_write_unlock(s);
1448 dquot_resume(s, -1); 1461 dquot_resume(s, -1);
1462 reiserfs_write_lock(s);
1449 finish_unfinished(s); 1463 finish_unfinished(s);
1450 reiserfs_xattr_init(s, *mount_flags); 1464 reiserfs_xattr_init(s, *mount_flags);
1451 } 1465 }
@@ -1455,9 +1469,10 @@ out_ok:
1455 reiserfs_write_unlock(s); 1469 reiserfs_write_unlock(s);
1456 return 0; 1470 return 0;
1457 1471
1472out_unlock:
1473 reiserfs_write_unlock(s);
1458out_err: 1474out_err:
1459 kfree(new_opts); 1475 kfree(new_opts);
1460 reiserfs_write_unlock(s);
1461 return err; 1476 return err;
1462} 1477}
1463 1478
@@ -2095,13 +2110,15 @@ static int reiserfs_write_dquot(struct dquot *dquot)
2095 REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); 2110 REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
2096 if (ret) 2111 if (ret)
2097 goto out; 2112 goto out;
2113 reiserfs_write_unlock(dquot->dq_sb);
2098 ret = dquot_commit(dquot); 2114 ret = dquot_commit(dquot);
2115 reiserfs_write_lock(dquot->dq_sb);
2099 err = 2116 err =
2100 journal_end(&th, dquot->dq_sb, 2117 journal_end(&th, dquot->dq_sb,
2101 REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); 2118 REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
2102 if (!ret && err) 2119 if (!ret && err)
2103 ret = err; 2120 ret = err;
2104 out: 2121out:
2105 reiserfs_write_unlock(dquot->dq_sb); 2122 reiserfs_write_unlock(dquot->dq_sb);
2106 return ret; 2123 return ret;
2107} 2124}
@@ -2117,13 +2134,15 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
2117 REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); 2134 REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
2118 if (ret) 2135 if (ret)
2119 goto out; 2136 goto out;
2137 reiserfs_write_unlock(dquot->dq_sb);
2120 ret = dquot_acquire(dquot); 2138 ret = dquot_acquire(dquot);
2139 reiserfs_write_lock(dquot->dq_sb);
2121 err = 2140 err =
2122 journal_end(&th, dquot->dq_sb, 2141 journal_end(&th, dquot->dq_sb,
2123 REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); 2142 REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
2124 if (!ret && err) 2143 if (!ret && err)
2125 ret = err; 2144 ret = err;
2126 out: 2145out:
2127 reiserfs_write_unlock(dquot->dq_sb); 2146 reiserfs_write_unlock(dquot->dq_sb);
2128 return ret; 2147 return ret;
2129} 2148}
@@ -2137,19 +2156,21 @@ static int reiserfs_release_dquot(struct dquot *dquot)
2137 ret = 2156 ret =
2138 journal_begin(&th, dquot->dq_sb, 2157 journal_begin(&th, dquot->dq_sb,
2139 REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); 2158 REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
2159 reiserfs_write_unlock(dquot->dq_sb);
2140 if (ret) { 2160 if (ret) {
2141 /* Release dquot anyway to avoid endless cycle in dqput() */ 2161 /* Release dquot anyway to avoid endless cycle in dqput() */
2142 dquot_release(dquot); 2162 dquot_release(dquot);
2143 goto out; 2163 goto out;
2144 } 2164 }
2145 ret = dquot_release(dquot); 2165 ret = dquot_release(dquot);
2166 reiserfs_write_lock(dquot->dq_sb);
2146 err = 2167 err =
2147 journal_end(&th, dquot->dq_sb, 2168 journal_end(&th, dquot->dq_sb,
2148 REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); 2169 REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
2149 if (!ret && err) 2170 if (!ret && err)
2150 ret = err; 2171 ret = err;
2151 out:
2152 reiserfs_write_unlock(dquot->dq_sb); 2172 reiserfs_write_unlock(dquot->dq_sb);
2173out:
2153 return ret; 2174 return ret;
2154} 2175}
2155 2176
@@ -2174,11 +2195,13 @@ static int reiserfs_write_info(struct super_block *sb, int type)
2174 ret = journal_begin(&th, sb, 2); 2195 ret = journal_begin(&th, sb, 2);
2175 if (ret) 2196 if (ret)
2176 goto out; 2197 goto out;
2198 reiserfs_write_unlock(sb);
2177 ret = dquot_commit_info(sb, type); 2199 ret = dquot_commit_info(sb, type);
2200 reiserfs_write_lock(sb);
2178 err = journal_end(&th, sb, 2); 2201 err = journal_end(&th, sb, 2);
2179 if (!ret && err) 2202 if (!ret && err)
2180 ret = err; 2203 ret = err;
2181 out: 2204out:
2182 reiserfs_write_unlock(sb); 2205 reiserfs_write_unlock(sb);
2183 return ret; 2206 return ret;
2184} 2207}
@@ -2203,8 +2226,11 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2203 struct reiserfs_transaction_handle th; 2226 struct reiserfs_transaction_handle th;
2204 int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; 2227 int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
2205 2228
2206 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) 2229 reiserfs_write_lock(sb);
2207 return -EINVAL; 2230 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
2231 err = -EINVAL;
2232 goto out;
2233 }
2208 2234
2209 /* Quotafile not on the same filesystem? */ 2235 /* Quotafile not on the same filesystem? */
2210 if (path->dentry->d_sb != sb) { 2236 if (path->dentry->d_sb != sb) {
@@ -2246,8 +2272,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2246 if (err) 2272 if (err)
2247 goto out; 2273 goto out;
2248 } 2274 }
2249 err = dquot_quota_on(sb, type, format_id, path); 2275 reiserfs_write_unlock(sb);
2276 return dquot_quota_on(sb, type, format_id, path);
2250out: 2277out:
2278 reiserfs_write_unlock(sb);
2251 return err; 2279 return err;
2252} 2280}
2253 2281
@@ -2320,7 +2348,9 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
2320 tocopy = sb->s_blocksize - offset < towrite ? 2348 tocopy = sb->s_blocksize - offset < towrite ?
2321 sb->s_blocksize - offset : towrite; 2349 sb->s_blocksize - offset : towrite;
2322 tmp_bh.b_state = 0; 2350 tmp_bh.b_state = 0;
2351 reiserfs_write_lock(sb);
2323 err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); 2352 err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
2353 reiserfs_write_unlock(sb);
2324 if (err) 2354 if (err)
2325 goto out; 2355 goto out;
2326 if (offset || tocopy != sb->s_blocksize) 2356 if (offset || tocopy != sb->s_blocksize)
@@ -2336,10 +2366,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
2336 flush_dcache_page(bh->b_page); 2366 flush_dcache_page(bh->b_page);
2337 set_buffer_uptodate(bh); 2367 set_buffer_uptodate(bh);
2338 unlock_buffer(bh); 2368 unlock_buffer(bh);
2369 reiserfs_write_lock(sb);
2339 reiserfs_prepare_for_journal(sb, bh, 1); 2370 reiserfs_prepare_for_journal(sb, bh, 1);
2340 journal_mark_dirty(current->journal_info, sb, bh); 2371 journal_mark_dirty(current->journal_info, sb, bh);
2341 if (!journal_quota) 2372 if (!journal_quota)
2342 reiserfs_add_ordered_list(inode, bh); 2373 reiserfs_add_ordered_list(inode, bh);
2374 reiserfs_write_unlock(sb);
2343 brelse(bh); 2375 brelse(bh);
2344 offset = 0; 2376 offset = 0;
2345 towrite -= tocopy; 2377 towrite -= tocopy;
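
The reiserfs hunks above all apply the same locking pattern: the per-superblock write lock ranks below the locks the quota code takes internally, so each call out to the quota layer (dquot_commit(), dquot_acquire(), dquot_release(), dquot_suspend(), dquot_resume(), dquot_commit_info()) is bracketed by an unlock/relock pair, while journal and block-mapping work stays under the lock. A minimal sketch of the resulting shape, with do_quota_op() as a hypothetical stand-in for the dquot_* callback:

static int reiserfs_quota_op_sketch(struct super_block *sb)
{
        struct reiserfs_transaction_handle th;
        int ret, err;

        reiserfs_write_lock(sb);
        ret = journal_begin(&th, sb, 2);
        if (ret)
                goto out;

        /*
         * The quota code takes its own locks, which rank above the
         * reiserfs write lock, so drop it around the call.
         */
        reiserfs_write_unlock(sb);
        ret = do_quota_op(sb);          /* hypothetical dquot_* callback */
        reiserfs_write_lock(sb);

        err = journal_end(&th, sb, 2);
        if (!ret && err)
                ret = err;
out:
        reiserfs_write_unlock(sb);
        return ret;
}
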
diff --git a/fs/splice.c b/fs/splice.c
index 13e5b4776e7a..8890604e3fcd 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1024,17 +1024,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
1024 ret = sd.num_spliced; 1024 ret = sd.num_spliced;
1025 1025
1026 if (ret > 0) { 1026 if (ret > 0) {
1027 unsigned long nr_pages;
1028 int err; 1027 int err;
1029 1028
1030 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1031
1032 err = generic_write_sync(out, *ppos, ret); 1029 err = generic_write_sync(out, *ppos, ret);
1033 if (err) 1030 if (err)
1034 ret = err; 1031 ret = err;
1035 else 1032 else
1036 *ppos += ret; 1033 *ppos += ret;
1037 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 1034 balance_dirty_pages_ratelimited(mapping);
1038 } 1035 }
1039 sb_end_write(inode->i_sb); 1036 sb_end_write(inode->i_sb);
1040 1037
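
The splice hunk above drops the caller-side page accounting: with IO-less dirty throttling, balance_dirty_pages_ratelimited() tracks freshly dirtied pages through per-task counters itself, so generic_file_splice_write() no longer needs to compute nr_pages. The tail of the function reduces to this (as in the hunk):

        if (ret > 0) {
                int err;

                err = generic_write_sync(out, *ppos, ret);
                if (err)
                        ret = err;
                else
                        *ppos += ret;
                balance_dirty_pages_ratelimited(mapping);
        }
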
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 00012e31829d..602f56db0442 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -485,8 +485,8 @@ const struct file_operations sysfs_file_operations = {
485 .poll = sysfs_poll, 485 .poll = sysfs_poll,
486}; 486};
487 487
488int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, 488static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
489 const void **pns) 489 const void **pns)
490{ 490{
491 struct sysfs_dirent *dir_sd = kobj->sd; 491 struct sysfs_dirent *dir_sd = kobj->sd;
492 const struct sysfs_ops *ops; 492 const struct sysfs_ops *ops;
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 28ec13af28d9..2dcf3d473fec 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -681,8 +681,16 @@ int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
681 if (!lprops) { 681 if (!lprops) {
682 lprops = ubifs_fast_find_freeable(c); 682 lprops = ubifs_fast_find_freeable(c);
683 if (!lprops) { 683 if (!lprops) {
684 ubifs_assert(c->freeable_cnt == 0); 684 /*
685 if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { 685 * The first condition means the following: go scan the
686 * LPT if there are uncategorized lprops, which means
687 * there may be freeable LEBs there (UBIFS does not
688 * store the information about freeable LEBs in the
689 * master node).
690 */
691 if (c->in_a_category_cnt != c->main_lebs ||
692 c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
693 ubifs_assert(c->freeable_cnt == 0);
686 lprops = scan_for_leb_for_idx(c); 694 lprops = scan_for_leb_for_idx(c);
687 if (IS_ERR(lprops)) { 695 if (IS_ERR(lprops)) {
688 err = PTR_ERR(lprops); 696 err = PTR_ERR(lprops);
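
The new guard above is the heart of the change: previously the LPT scan was skipped whenever no untaken empty LEBs remained, even though uncategorized lprops on flash might still describe freeable LEBs. A sketch of the test as a standalone predicate (worth_scanning_lpt() is a hypothetical name):

/* Is a scan of the LPT still worth attempting? */
static int worth_scanning_lpt(const struct ubifs_info *c)
{
        /*
         * If not every lprops is categorized in memory, some exist only
         * on flash and may describe freeable LEBs; otherwise fall back
         * to looking for untaken empty LEBs.
         */
        return c->in_a_category_cnt != c->main_lebs ||
               c->lst.empty_lebs - c->lst.taken_empty_lebs > 0;
}
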
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index e5a2a35a46dc..46190a7c42a6 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -300,8 +300,11 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
300 default: 300 default:
301 ubifs_assert(0); 301 ubifs_assert(0);
302 } 302 }
303
303 lprops->flags &= ~LPROPS_CAT_MASK; 304 lprops->flags &= ~LPROPS_CAT_MASK;
304 lprops->flags |= cat; 305 lprops->flags |= cat;
306 c->in_a_category_cnt += 1;
307 ubifs_assert(c->in_a_category_cnt <= c->main_lebs);
305} 308}
306 309
307/** 310/**
@@ -334,6 +337,9 @@ static void ubifs_remove_from_cat(struct ubifs_info *c,
334 default: 337 default:
335 ubifs_assert(0); 338 ubifs_assert(0);
336 } 339 }
340
341 c->in_a_category_cnt -= 1;
342 ubifs_assert(c->in_a_category_cnt >= 0);
337} 343}
338 344
339/** 345/**
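
Paired with the increment above, the decrement keeps c->in_a_category_cnt exact, giving the invariant 0 <= in_a_category_cnt <= main_lebs that the find.c change relies on. A sketch of a debug check built on that invariant (dbg_check_cat_count() is hypothetical):

static void dbg_check_cat_count(const struct ubifs_info *c)
{
        /* every lprops is counted at most once */
        ubifs_assert(c->in_a_category_cnt >= 0);
        ubifs_assert(c->in_a_category_cnt <= c->main_lebs);
}
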
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 5486346d0a3f..d133c276fe05 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1183,6 +1183,8 @@ struct ubifs_debug_info;
1183 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) 1183 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1184 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) 1184 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1185 * @freeable_cnt: number of freeable LEBs in @freeable_list 1185 * @freeable_cnt: number of freeable LEBs in @freeable_list
1186 * @in_a_category_cnt: count of lprops which are in a certain category, which
1187 * basically means that they were loaded from the flash
1186 * 1188 *
1187 * @ltab_lnum: LEB number of LPT's own lprops table 1189 * @ltab_lnum: LEB number of LPT's own lprops table
1188 * @ltab_offs: offset of LPT's own lprops table 1190 * @ltab_offs: offset of LPT's own lprops table
@@ -1412,6 +1414,7 @@ struct ubifs_info {
1412 struct list_head freeable_list; 1414 struct list_head freeable_list;
1413 struct list_head frdi_idx_list; 1415 struct list_head frdi_idx_list;
1414 int freeable_cnt; 1416 int freeable_cnt;
1417 int in_a_category_cnt;
1415 1418
1416 int ltab_lnum; 1419 int ltab_lnum;
1417 int ltab_offs; 1420 int ltab_offs;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 6100ec0fa1d4..5a7ffe54f5d5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -2,6 +2,7 @@ config XFS_FS
2 tristate "XFS filesystem support" 2 tristate "XFS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 select EXPORTFS 4 select EXPORTFS
5 select LIBCRC32C
5 help 6 help
6 XFS is a high performance journaling filesystem which originated 7 XFS is a high performance journaling filesystem which originated
7 on the SGI IRIX platform. It is completely multi-threaded, can 8 on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974b1a2f..d02201df855b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \
37 xfs_file.o \ 37 xfs_file.o \
38 xfs_filestream.o \ 38 xfs_filestream.o \
39 xfs_fsops.o \ 39 xfs_fsops.o \
40 xfs_fs_subr.o \
41 xfs_globals.o \ 40 xfs_globals.o \
42 xfs_iget.o \ 41 xfs_icache.o \
43 xfs_ioctl.o \ 42 xfs_ioctl.o \
44 xfs_iomap.o \ 43 xfs_iomap.o \
45 xfs_iops.o \ 44 xfs_iops.o \
@@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \
47 xfs_message.o \ 46 xfs_message.o \
48 xfs_mru_cache.o \ 47 xfs_mru_cache.o \
49 xfs_super.o \ 48 xfs_super.o \
50 xfs_sync.o \
51 xfs_xattr.o \ 49 xfs_xattr.o \
52 xfs_rename.o \ 50 xfs_rename.o \
53 xfs_utils.o \ 51 xfs_utils.o \
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..104db0f3bed6 100644
--- a/fs/xfs/uuid.h
+++ b/fs/xfs/uuid.h
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid);
26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); 26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); 27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
28 28
29static inline void
30uuid_copy(uuid_t *dst, uuid_t *src)
31{
32 memcpy(dst, src, sizeof(uuid_t));
33}
34
29#endif /* __XFS_SUPPORT_UUID_H__ */ 35#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 44d65c1533c0..f2aeedb6a579 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -108,6 +108,8 @@ typedef struct xfs_agf {
108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, 108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); 109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
110 110
111extern const struct xfs_buf_ops xfs_agf_buf_ops;
112
111/* 113/*
112 * Size of the unlinked inode hash table in the agi. 114 * Size of the unlinked inode hash table in the agi.
113 */ 115 */
@@ -161,6 +163,8 @@ typedef struct xfs_agi {
161extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, 163extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
162 xfs_agnumber_t agno, struct xfs_buf **bpp); 164 xfs_agnumber_t agno, struct xfs_buf **bpp);
163 165
166extern const struct xfs_buf_ops xfs_agi_buf_ops;
167
164/* 168/*
165 * The third a.g. block contains the a.g. freelist, an array 169 * The third a.g. block contains the a.g. freelist, an array
166 * of block pointers to blocks owned by the allocation btree code. 170 * of block pointers to blocks owned by the allocation btree code.
@@ -233,6 +237,7 @@ typedef struct xfs_perag {
233#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup 237#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
234 in xfs_inode_ag_iterator */ 238 in xfs_inode_ag_iterator */
235#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ 239#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
240#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
236 241
237#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) 242#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
238#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ 243#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 4f33c32affe3..393055fe3aef 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -430,6 +430,60 @@ xfs_alloc_fixup_trees(
430 return 0; 430 return 0;
431} 431}
432 432
433static void
434xfs_agfl_verify(
435 struct xfs_buf *bp)
436{
437#ifdef WHEN_CRCS_COME_ALONG
438 /*
439 * we cannot actually do any verification of the AGFL because mkfs does
440 * not initialise the AGFL to zero or NULL. Hence the only valid part of
441 * the AGFL is what the AGF says is active. We can't get to the AGF, so
442 * we can't verify just those entries are valid.
443 *
444 * This problem goes away when the CRC format change comes along as that
445 * requires the AGFL to be initialised by mkfs. At that point, we can
446 * verify the blocks in the agfl -active or not- lie within the bounds
447 * of the AG. Until then, just leave this check ifdef'd out.
448 */
449 struct xfs_mount *mp = bp->b_target->bt_mount;
450 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
451 int agfl_ok = 1;
452
453 int i;
454
455 for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
456 if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
457 be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
458 agfl_ok = 0;
459 }
460
461 if (!agfl_ok) {
462 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
463 xfs_buf_ioerror(bp, EFSCORRUPTED);
464 }
465#endif
466}
467
468static void
469xfs_agfl_write_verify(
470 struct xfs_buf *bp)
471{
472 xfs_agfl_verify(bp);
473}
474
475static void
476xfs_agfl_read_verify(
477 struct xfs_buf *bp)
478{
479 xfs_agfl_verify(bp);
480}
481
482const struct xfs_buf_ops xfs_agfl_buf_ops = {
483 .verify_read = xfs_agfl_read_verify,
484 .verify_write = xfs_agfl_write_verify,
485};
486
433/* 487/*
434 * Read in the allocation group free block array. 488 * Read in the allocation group free block array.
435 */ 489 */
@@ -447,7 +501,7 @@ xfs_alloc_read_agfl(
447 error = xfs_trans_read_buf( 501 error = xfs_trans_read_buf(
448 mp, tp, mp->m_ddev_targp, 502 mp, tp, mp->m_ddev_targp,
449 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), 503 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
450 XFS_FSS_TO_BB(mp, 1), 0, &bp); 504 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
451 if (error) 505 if (error)
452 return error; 506 return error;
453 ASSERT(!xfs_buf_geterror(bp)); 507 ASSERT(!xfs_buf_geterror(bp));
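
This AGFL pair is the template for the rest of the series: each on-disk structure gains a verifier function, the verifier is exposed through a const struct xfs_buf_ops, and readers pass those ops to xfs_trans_read_buf() so corruption is flagged on the buffer with xfs_buf_ioerror() at I/O time rather than re-checked by every caller. A generic sketch of the shape, where struct xfs_foo, its magic field, and the xfs_foo_* names are hypothetical:

static void
xfs_foo_verify(
        struct xfs_buf          *bp)
{
        struct xfs_foo          *foo = bp->b_addr;      /* hypothetical layout */

        /* flag corruption on the buffer instead of returning an error */
        if (foo->foo_magicnum != cpu_to_be32(XFS_FOO_MAGIC))
                xfs_buf_ioerror(bp, EFSCORRUPTED);
}

static void
xfs_foo_read_verify(
        struct xfs_buf          *bp)
{
        xfs_foo_verify(bp);
}

static void
xfs_foo_write_verify(
        struct xfs_buf          *bp)
{
        xfs_foo_verify(bp);
}

const struct xfs_buf_ops xfs_foo_buf_ops = {
        .verify_read = xfs_foo_read_verify,
        .verify_write = xfs_foo_write_verify,
};
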
@@ -1866,6 +1920,7 @@ xfs_alloc_fix_freelist(
1866 /* 1920 /*
1867 * Initialize the args structure. 1921 * Initialize the args structure.
1868 */ 1922 */
1923 memset(&targs, 0, sizeof(targs));
1869 targs.tp = tp; 1924 targs.tp = tp;
1870 targs.mp = mp; 1925 targs.mp = mp;
1871 targs.agbp = agbp; 1926 targs.agbp = agbp;
@@ -2090,6 +2145,63 @@ xfs_alloc_put_freelist(
2090 return 0; 2145 return 0;
2091} 2146}
2092 2147
2148static void
2149xfs_agf_verify(
2150 struct xfs_buf *bp)
2151 {
2152 struct xfs_mount *mp = bp->b_target->bt_mount;
2153 struct xfs_agf *agf;
2154 int agf_ok;
2155
2156 agf = XFS_BUF_TO_AGF(bp);
2157
2158 agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
2159 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2160 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2161 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2162 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2163 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
2164
2165 /*
2166 * during growfs operations, the perag is not fully initialised,
2167 * so we can't use it for any useful checking. growfs ensures we can't
2168 * use it by using uncached buffers that don't have the perag attached
2169 * so we can detect and avoid this problem.
2170 */
2171 if (bp->b_pag)
2172 agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
2173 bp->b_pag->pag_agno;
2174
2175 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2176 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2177 be32_to_cpu(agf->agf_length);
2178
2179 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2180 XFS_RANDOM_ALLOC_READ_AGF))) {
2181 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
2182 xfs_buf_ioerror(bp, EFSCORRUPTED);
2183 }
2184}
2185
2186static void
2187xfs_agf_read_verify(
2188 struct xfs_buf *bp)
2189{
2190 xfs_agf_verify(bp);
2191}
2192
2193static void
2194xfs_agf_write_verify(
2195 struct xfs_buf *bp)
2196{
2197 xfs_agf_verify(bp);
2198}
2199
2200const struct xfs_buf_ops xfs_agf_buf_ops = {
2201 .verify_read = xfs_agf_read_verify,
2202 .verify_write = xfs_agf_write_verify,
2203};
2204
2093/* 2205/*
2094 * Read in the allocation group header (free/alloc section). 2206 * Read in the allocation group header (free/alloc section).
2095 */ 2207 */
@@ -2101,44 +2213,19 @@ xfs_read_agf(
2101 int flags, /* XFS_BUF_ */ 2213 int flags, /* XFS_BUF_ */
2102 struct xfs_buf **bpp) /* buffer for the ag freelist header */ 2214 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2103{ 2215{
2104 struct xfs_agf *agf; /* ag freelist header */
2105 int agf_ok; /* set if agf is consistent */
2106 int error; 2216 int error;
2107 2217
2108 ASSERT(agno != NULLAGNUMBER); 2218 ASSERT(agno != NULLAGNUMBER);
2109 error = xfs_trans_read_buf( 2219 error = xfs_trans_read_buf(
2110 mp, tp, mp->m_ddev_targp, 2220 mp, tp, mp->m_ddev_targp,
2111 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 2221 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2112 XFS_FSS_TO_BB(mp, 1), flags, bpp); 2222 XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
2113 if (error) 2223 if (error)
2114 return error; 2224 return error;
2115 if (!*bpp) 2225 if (!*bpp)
2116 return 0; 2226 return 0;
2117 2227
2118 ASSERT(!(*bpp)->b_error); 2228 ASSERT(!(*bpp)->b_error);
2119 agf = XFS_BUF_TO_AGF(*bpp);
2120
2121 /*
2122 * Validate the magic number of the agf block.
2123 */
2124 agf_ok =
2125 agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
2126 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2127 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2128 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2129 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2130 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
2131 be32_to_cpu(agf->agf_seqno) == agno;
2132 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2133 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2134 be32_to_cpu(agf->agf_length);
2135 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2136 XFS_RANDOM_ALLOC_READ_AGF))) {
2137 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
2138 XFS_ERRLEVEL_LOW, mp, agf);
2139 xfs_trans_brelse(tp, *bpp);
2140 return XFS_ERROR(EFSCORRUPTED);
2141 }
2142 xfs_buf_set_ref(*bpp, XFS_AGF_REF); 2229 xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2143 return 0; 2230 return 0;
2144} 2231}
@@ -2207,7 +2294,7 @@ xfs_alloc_read_agf(
2207 * group or loop over the allocation groups to find the result. 2294 * group or loop over the allocation groups to find the result.
2208 */ 2295 */
2209int /* error */ 2296int /* error */
2210__xfs_alloc_vextent( 2297xfs_alloc_vextent(
2211 xfs_alloc_arg_t *args) /* allocation argument structure */ 2298 xfs_alloc_arg_t *args) /* allocation argument structure */
2212{ 2299{
2213 xfs_agblock_t agsize; /* allocation group size */ 2300 xfs_agblock_t agsize; /* allocation group size */
@@ -2417,46 +2504,6 @@ error0:
2417 return error; 2504 return error;
2418} 2505}
2419 2506
2420static void
2421xfs_alloc_vextent_worker(
2422 struct work_struct *work)
2423{
2424 struct xfs_alloc_arg *args = container_of(work,
2425 struct xfs_alloc_arg, work);
2426 unsigned long pflags;
2427
2428 /* we are in a transaction context here */
2429 current_set_flags_nested(&pflags, PF_FSTRANS);
2430
2431 args->result = __xfs_alloc_vextent(args);
2432 complete(args->done);
2433
2434 current_restore_flags_nested(&pflags, PF_FSTRANS);
2435}
2436
2437/*
2438 * Data allocation requests often come in with little stack to work on. Push
2439 * them off to a worker thread so there is lots of stack to use. Metadata
2440 * requests, OTOH, are generally from low stack usage paths, so avoid the
2441 * context switch overhead here.
2442 */
2443int
2444xfs_alloc_vextent(
2445 struct xfs_alloc_arg *args)
2446{
2447 DECLARE_COMPLETION_ONSTACK(done);
2448
2449 if (!args->userdata)
2450 return __xfs_alloc_vextent(args);
2451
2452
2453 args->done = &done;
2454 INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker);
2455 queue_work(xfs_alloc_wq, &args->work);
2456 wait_for_completion(&done);
2457 return args->result;
2458}
2459
2460/* 2507/*
2461 * Free an extent. 2508 * Free an extent.
2462 * Just break up the extent address and hand off to xfs_free_ag_extent 2509 * Just break up the extent address and hand off to xfs_free_ag_extent
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 93be4a667ca1..99d0a6101558 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -120,9 +120,6 @@ typedef struct xfs_alloc_arg {
120 char isfl; /* set if is freelist blocks - !acctg */ 120 char isfl; /* set if is freelist blocks - !acctg */
121 char userdata; /* set if this is user data */ 121 char userdata; /* set if this is user data */
122 xfs_fsblock_t firstblock; /* io first block allocated */ 122 xfs_fsblock_t firstblock; /* io first block allocated */
123 struct completion *done;
124 struct work_struct work;
125 int result;
126} xfs_alloc_arg_t; 123} xfs_alloc_arg_t;
127 124
128/* 125/*
@@ -234,4 +231,7 @@ xfs_alloc_get_rec(
234 xfs_extlen_t *len, /* output: length of extent */ 231 xfs_extlen_t *len, /* output: length of extent */
235 int *stat); /* output: success/failure */ 232 int *stat); /* output: success/failure */
236 233
234extern const struct xfs_buf_ops xfs_agf_buf_ops;
235extern const struct xfs_buf_ops xfs_agfl_buf_ops;
236
237#endif /* __XFS_ALLOC_H__ */ 237#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f1647caace8f..b1ddef6b2689 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -121,6 +121,8 @@ xfs_allocbt_free_block(
121 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, 121 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
122 XFS_EXTENT_BUSY_SKIP_DISCARD); 122 XFS_EXTENT_BUSY_SKIP_DISCARD);
123 xfs_trans_agbtree_delta(cur->bc_tp, -1); 123 xfs_trans_agbtree_delta(cur->bc_tp, -1);
124
125 xfs_trans_binval(cur->bc_tp, bp);
124 return 0; 126 return 0;
125} 127}
126 128
@@ -270,6 +272,82 @@ xfs_allocbt_key_diff(
270 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; 272 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
271} 273}
272 274
275static void
276xfs_allocbt_verify(
277 struct xfs_buf *bp)
278{
279 struct xfs_mount *mp = bp->b_target->bt_mount;
280 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
281 struct xfs_perag *pag = bp->b_pag;
282 unsigned int level;
283 int sblock_ok; /* block passes checks */
284
285 /*
286 * magic number and level verification
287 *
288 * During growfs operations, we can't verify the exact level as the
289 * perag is not fully initialised and hence not attached to the buffer.
290 * In this case, check against the maximum tree depth.
291 */
292 level = be16_to_cpu(block->bb_level);
293 switch (block->bb_magic) {
294 case cpu_to_be32(XFS_ABTB_MAGIC):
295 if (pag)
296 sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
297 else
298 sblock_ok = level < mp->m_ag_maxlevels;
299 break;
300 case cpu_to_be32(XFS_ABTC_MAGIC):
301 if (pag)
302 sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
303 else
304 sblock_ok = level < mp->m_ag_maxlevels;
305 break;
306 default:
307 sblock_ok = 0;
308 break;
309 }
310
311 /* numrecs verification */
312 sblock_ok = sblock_ok &&
313 be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
314
315 /* sibling pointer verification */
316 sblock_ok = sblock_ok &&
317 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
318 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
319 block->bb_u.s.bb_leftsib &&
320 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
321 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
322 block->bb_u.s.bb_rightsib;
323
324 if (!sblock_ok) {
325 trace_xfs_btree_corrupt(bp, _RET_IP_);
326 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
327 xfs_buf_ioerror(bp, EFSCORRUPTED);
328 }
329}
330
331static void
332xfs_allocbt_read_verify(
333 struct xfs_buf *bp)
334{
335 xfs_allocbt_verify(bp);
336}
337
338static void
339xfs_allocbt_write_verify(
340 struct xfs_buf *bp)
341{
342 xfs_allocbt_verify(bp);
343}
344
345const struct xfs_buf_ops xfs_allocbt_buf_ops = {
346 .verify_read = xfs_allocbt_read_verify,
347 .verify_write = xfs_allocbt_write_verify,
348};
349
350
273#ifdef DEBUG 351#ifdef DEBUG
274STATIC int 352STATIC int
275xfs_allocbt_keys_inorder( 353xfs_allocbt_keys_inorder(
@@ -325,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
325 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, 403 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
326 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, 404 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
327 .key_diff = xfs_allocbt_key_diff, 405 .key_diff = xfs_allocbt_key_diff,
406 .buf_ops = &xfs_allocbt_buf_ops,
328#ifdef DEBUG 407#ifdef DEBUG
329 .keys_inorder = xfs_allocbt_keys_inorder, 408 .keys_inorder = xfs_allocbt_keys_inorder,
330 .recs_inorder = xfs_allocbt_recs_inorder, 409 .recs_inorder = xfs_allocbt_recs_inorder,
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 359fb86ed876..7e89a2b429dd 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
93 xfs_agnumber_t, xfs_btnum_t); 93 xfs_agnumber_t, xfs_btnum_t);
94extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); 94extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
95 95
96extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
97
96#endif /* __XFS_ALLOC_BTREE_H__ */ 98#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e562dd43f41f..4111a40ebe1a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc(
124 ioend->io_append_trans = tp; 124 ioend->io_append_trans = tp;
125 125
126 /* 126 /*
127 * We will pass freeze protection with a transaction. So tell lockdep 127 * We may pass freeze protection with a transaction. So tell lockdep
128 * we released it. 128 * we released it.
129 */ 129 */
130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
@@ -149,11 +149,13 @@ xfs_setfilesize(
149 xfs_fsize_t isize; 149 xfs_fsize_t isize;
150 150
151 /* 151 /*
152 * The transaction was allocated in the I/O submission thread, 152 * The transaction may have been allocated in the I/O submission thread,
153 * thus we need to mark ourselves as being in a transaction 153 * thus we need to mark ourselves as being in a transaction manually.
154 * manually. 154 * Similarly for freeze protection.
155 */ 155 */
156 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); 156 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
157 rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
158 0, 1, _THIS_IP_);
157 159
158 xfs_ilock(ip, XFS_ILOCK_EXCL); 160 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); 161 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
187 189
188 if (ioend->io_type == XFS_IO_UNWRITTEN) 190 if (ioend->io_type == XFS_IO_UNWRITTEN)
189 queue_work(mp->m_unwritten_workqueue, &ioend->io_work); 191 queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
190 else if (ioend->io_append_trans) 192 else if (ioend->io_append_trans ||
193 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
191 queue_work(mp->m_data_workqueue, &ioend->io_work); 194 queue_work(mp->m_data_workqueue, &ioend->io_work);
192 else 195 else
193 xfs_destroy_ioend(ioend); 196 xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
205 struct xfs_inode *ip = XFS_I(ioend->io_inode); 208 struct xfs_inode *ip = XFS_I(ioend->io_inode);
206 int error = 0; 209 int error = 0;
207 210
208 if (ioend->io_append_trans) {
209 /*
210 * We've got freeze protection passed with the transaction.
211 * Tell lockdep about it.
212 */
213 rwsem_acquire_read(
214 &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
215 0, 1, _THIS_IP_);
216 }
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 211 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
218 ioend->io_error = -EIO; 212 ioend->io_error = -EIO;
219 goto done; 213 goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
226 * range to normal written extents after the data I/O has finished. 220 * range to normal written extents after the data I/O has finished.
227 */ 221 */
228 if (ioend->io_type == XFS_IO_UNWRITTEN) { 222 if (ioend->io_type == XFS_IO_UNWRITTEN) {
223 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
224 ioend->io_size);
225 } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
229 /* 226 /*
230 * For buffered I/O we never preallocate a transaction when 227 * For direct I/O we do not know if we need to allocate blocks
231 * doing the unwritten extent conversion, but for direct I/O 228 * or not so we can't preallocate an append transaction as that
232 * we do not know if we are converting an unwritten extent 229 * results in nested reservations and log space deadlocks. Hence
233 * or not at the point where we preallocate the transaction. 230 * allocate the transaction here. While this is sub-optimal and
231 * can block IO completion for some time, we're stuck with doing
232 * it this way until we can pass the ioend to the direct IO
233 * allocation callbacks and avoid nesting that way.
234 */ 234 */
235 if (ioend->io_append_trans) { 235 error = xfs_setfilesize_trans_alloc(ioend);
236 ASSERT(ioend->io_isdirect); 236 if (error)
237
238 current_set_flags_nested(
239 &ioend->io_append_trans->t_pflags, PF_FSTRANS);
240 xfs_trans_cancel(ioend->io_append_trans, 0);
241 }
242
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
244 ioend->io_size);
245 if (error) {
246 ioend->io_error = -error;
247 goto done; 237 goto done;
248 } 238 error = xfs_setfilesize(ioend);
249 } else if (ioend->io_append_trans) { 239 } else if (ioend->io_append_trans) {
250 error = xfs_setfilesize(ioend); 240 error = xfs_setfilesize(ioend);
251 if (error)
252 ioend->io_error = -error;
253 } else { 241 } else {
254 ASSERT(!xfs_ioend_is_append(ioend)); 242 ASSERT(!xfs_ioend_is_append(ioend));
255 } 243 }
256 244
257done: 245done:
246 if (error)
247 ioend->io_error = -error;
258 xfs_destroy_ioend(ioend); 248 xfs_destroy_ioend(ioend);
259} 249}
260 250
@@ -481,11 +471,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
481 * 471 *
482 * The fix is two passes across the ioend list - one to start writeback on the 472 * The fix is two passes across the ioend list - one to start writeback on the
483 * buffer_heads, and then submit them for I/O on the second pass. 473 * buffer_heads, and then submit them for I/O on the second pass.
474 *
475 * If @fail is non-zero, it means that we have a situation where some part of
476 * the submission process has failed after we have marked pages for writeback
477 * and unlocked them. In this situation, we need to fail the ioend chain rather
478 * than submit it to IO. This typically only happens on a filesystem shutdown.
484 */ 479 */
485STATIC void 480STATIC void
486xfs_submit_ioend( 481xfs_submit_ioend(
487 struct writeback_control *wbc, 482 struct writeback_control *wbc,
488 xfs_ioend_t *ioend) 483 xfs_ioend_t *ioend,
484 int fail)
489{ 485{
490 xfs_ioend_t *head = ioend; 486 xfs_ioend_t *head = ioend;
491 xfs_ioend_t *next; 487 xfs_ioend_t *next;
@@ -506,6 +502,18 @@ xfs_submit_ioend(
506 next = ioend->io_list; 502 next = ioend->io_list;
507 bio = NULL; 503 bio = NULL;
508 504
505 /*
506 * If we are failing the IO now, just mark the ioend with an
507 * error and finish it. This will run IO completion immediately
508 * as there is only one reference to the ioend at this point in
509 * time.
510 */
511 if (fail) {
512 ioend->io_error = -fail;
513 xfs_finish_ioend(ioend);
514 continue;
515 }
516
509 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 517 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
510 518
511 if (!bio) { 519 if (!bio) {
@@ -1060,7 +1068,18 @@ xfs_vm_writepage(
1060 1068
1061 xfs_start_page_writeback(page, 1, count); 1069 xfs_start_page_writeback(page, 1, count);
1062 1070
1063 if (ioend && imap_valid) { 1071 /* if there is no IO to be submitted for this page, we are done */
1072 if (!ioend)
1073 return 0;
1074
1075 ASSERT(iohead);
1076
1077 /*
1078 * Any errors from this point onwards need to be reported through the IO
1079 * completion path as we have marked the initial page as under writeback
1080 * and unlocked it.
1081 */
1082 if (imap_valid) {
1064 xfs_off_t end_index; 1083 xfs_off_t end_index;
1065 1084
1066 end_index = imap.br_startoff + imap.br_blockcount; 1085 end_index = imap.br_startoff + imap.br_blockcount;
@@ -1079,20 +1098,15 @@ xfs_vm_writepage(
1079 wbc, end_index); 1098 wbc, end_index);
1080 } 1099 }
1081 1100
1082 if (iohead) {
1083 /*
1084 * Reserve log space if we might write beyond the on-disk
1085 * inode size.
1086 */
1087 if (ioend->io_type != XFS_IO_UNWRITTEN &&
1088 xfs_ioend_is_append(ioend)) {
1089 err = xfs_setfilesize_trans_alloc(ioend);
1090 if (err)
1091 goto error;
1092 }
1093 1101
1094 xfs_submit_ioend(wbc, iohead); 1102 /*
1095 } 1103 * Reserve log space if we might write beyond the on-disk inode size.
1104 */
1105 err = 0;
1106 if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
1107 err = xfs_setfilesize_trans_alloc(ioend);
1108
1109 xfs_submit_ioend(wbc, iohead, err);
1096 1110
1097 return 0; 1111 return 0;
1098 1112
@@ -1408,25 +1422,21 @@ xfs_vm_direct_IO(
1408 size_t size = iov_length(iov, nr_segs); 1422 size_t size = iov_length(iov, nr_segs);
1409 1423
1410 /* 1424 /*
1411 * We need to preallocate a transaction for a size update 1425 * We cannot preallocate a size update transaction here as we
1412 * here. In the case that this write both updates the size 1426 * don't know whether allocation is necessary or not. Hence we
1413 * and converts at least on unwritten extent we will cancel 1427 * can only tell IO completion that one is necessary if we are
1414 * the still clean transaction after the I/O has finished. 1428 * not doing unwritten extent conversion.
1415 */ 1429 */
1416 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); 1430 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
1417 if (offset + size > XFS_I(inode)->i_d.di_size) { 1431 if (offset + size > XFS_I(inode)->i_d.di_size)
1418 ret = xfs_setfilesize_trans_alloc(ioend);
1419 if (ret)
1420 goto out_destroy_ioend;
1421 ioend->io_isdirect = 1; 1432 ioend->io_isdirect = 1;
1422 }
1423 1433
1424 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1434 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1425 offset, nr_segs, 1435 offset, nr_segs,
1426 xfs_get_blocks_direct, 1436 xfs_get_blocks_direct,
1427 xfs_end_io_direct_write, NULL, 0); 1437 xfs_end_io_direct_write, NULL, 0);
1428 if (ret != -EIOCBQUEUED && iocb->private) 1438 if (ret != -EIOCBQUEUED && iocb->private)
1429 goto out_trans_cancel; 1439 goto out_destroy_ioend;
1430 } else { 1440 } else {
1431 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1441 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1432 offset, nr_segs, 1442 offset, nr_segs,
@@ -1436,15 +1446,6 @@ xfs_vm_direct_IO(
1436 1446
1437 return ret; 1447 return ret;
1438 1448
1439out_trans_cancel:
1440 if (ioend->io_append_trans) {
1441 current_set_flags_nested(&ioend->io_append_trans->t_pflags,
1442 PF_FSTRANS);
1443 rwsem_acquire_read(
1444 &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
1445 0, 1, _THIS_IP_);
1446 xfs_trans_cancel(ioend->io_append_trans, 0);
1447 }
1448out_destroy_ioend: 1449out_destroy_ioend:
1449 xfs_destroy_ioend(ioend); 1450 xfs_destroy_ioend(ioend);
1450 return ret; 1451 return ret;
@@ -1617,7 +1618,7 @@ xfs_vm_bmap(
1617 1618
1618 trace_xfs_vm_bmap(XFS_I(inode)); 1619 trace_xfs_vm_bmap(XFS_I(inode));
1619 xfs_ilock(ip, XFS_IOLOCK_SHARED); 1620 xfs_ilock(ip, XFS_IOLOCK_SHARED);
1620 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); 1621 filemap_write_and_wait(mapping);
1621 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 1622 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1622 return generic_block_bmap(mapping, block, xfs_get_blocks); 1623 return generic_block_bmap(mapping, block, xfs_get_blocks);
1623} 1624}
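
Taken together, the xfs_aops.c hunks move post-submission error handling onto the ioend itself: writepage no longer unwinds after pages have been marked for writeback, it feeds any late error into xfs_submit_ioend(), and direct I/O defers the size-update transaction to completion time to avoid nested log reservations. The completion side ends up ordered roughly like this (a sketch using the field and function names from the hunks above):

        /* sketch of xfs_end_io() after this series */
        if (ioend->io_type == XFS_IO_UNWRITTEN) {
                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
                                                  ioend->io_size);
        } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
                /* direct I/O: allocate the size-update transaction here */
                error = xfs_setfilesize_trans_alloc(ioend);
                if (!error)
                        error = xfs_setfilesize(ioend);
        } else if (ioend->io_append_trans) {
                error = xfs_setfilesize(ioend);
        }
        if (error)
                ioend->io_error = -error;
        xfs_destroy_ioend(ioend);
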
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 0ca1f0be62d2..aaf472532b3c 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
903 */ 903 */
904 dp = args->dp; 904 dp = args->dp;
905 args->blkno = 0; 905 args->blkno = 0;
906 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 906 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
907 XFS_ATTR_FORK);
908 if (error) 907 if (error)
909 return(error); 908 return error;
910 ASSERT(bp != NULL);
911 909
912 /* 910 /*
913 * Look up the given attribute in the leaf block. Figure out if 911 * Look up the given attribute in the leaf block. Figure out if
@@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1031 * Read in the block containing the "old" attr, then 1029 * Read in the block containing the "old" attr, then
1032 * remove the "old" attr from that block (neat, huh!) 1030 * remove the "old" attr from that block (neat, huh!)
1033 */ 1031 */
1034 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, 1032 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
1035 &bp, XFS_ATTR_FORK); 1033 -1, &bp);
1036 if (error) 1034 if (error)
1037 return(error); 1035 return error;
1038 ASSERT(bp != NULL); 1036
1039 (void)xfs_attr_leaf_remove(bp, args); 1037 xfs_attr_leaf_remove(bp, args);
1040 1038
1041 /* 1039 /*
1042 * If the result is small enough, shrink it all into the inode. 1040 * If the result is small enough, shrink it all into the inode.
@@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1100 */ 1098 */
1101 dp = args->dp; 1099 dp = args->dp;
1102 args->blkno = 0; 1100 args->blkno = 0;
1103 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 1101 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
1104 XFS_ATTR_FORK); 1102 if (error)
1105 if (error) { 1103 return error;
1106 return(error);
1107 }
1108 1104
1109 ASSERT(bp != NULL);
1110 error = xfs_attr_leaf_lookup_int(bp, args); 1105 error = xfs_attr_leaf_lookup_int(bp, args);
1111 if (error == ENOATTR) { 1106 if (error == ENOATTR) {
1112 xfs_trans_brelse(args->trans, bp); 1107 xfs_trans_brelse(args->trans, bp);
1113 return(error); 1108 return(error);
1114 } 1109 }
1115 1110
1116 (void)xfs_attr_leaf_remove(bp, args); 1111 xfs_attr_leaf_remove(bp, args);
1117 1112
1118 /* 1113 /*
1119 * If the result is small enough, shrink it all into the inode. 1114 * If the result is small enough, shrink it all into the inode.
@@ -1155,12 +1150,12 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
1155 struct xfs_buf *bp; 1150 struct xfs_buf *bp;
1156 int error; 1151 int error;
1157 1152
1153 trace_xfs_attr_leaf_get(args);
1154
1158 args->blkno = 0; 1155 args->blkno = 0;
1159 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 1156 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
1160 XFS_ATTR_FORK);
1161 if (error) 1157 if (error)
1162 return(error); 1158 return error;
1163 ASSERT(bp != NULL);
1164 1159
1165 error = xfs_attr_leaf_lookup_int(bp, args); 1160 error = xfs_attr_leaf_lookup_int(bp, args);
1166 if (error != EEXIST) { 1161 if (error != EEXIST) {
@@ -1181,22 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
1181STATIC int 1176STATIC int
1182xfs_attr_leaf_list(xfs_attr_list_context_t *context) 1177xfs_attr_leaf_list(xfs_attr_list_context_t *context)
1183{ 1178{
1184 xfs_attr_leafblock_t *leaf;
1185 int error; 1179 int error;
1186 struct xfs_buf *bp; 1180 struct xfs_buf *bp;
1187 1181
1182 trace_xfs_attr_leaf_list(context);
1183
1188 context->cursor->blkno = 0; 1184 context->cursor->blkno = 0;
1189 error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); 1185 error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
1190 if (error) 1186 if (error)
1191 return XFS_ERROR(error); 1187 return XFS_ERROR(error);
1192 ASSERT(bp != NULL);
1193 leaf = bp->b_addr;
1194 if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
1195 XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
1196 context->dp->i_mount, leaf);
1197 xfs_trans_brelse(NULL, bp);
1198 return XFS_ERROR(EFSCORRUPTED);
1199 }
1200 1188
1201 error = xfs_attr_leaf_list_int(bp, context); 1189 error = xfs_attr_leaf_list_int(bp, context);
1202 xfs_trans_brelse(NULL, bp); 1190 xfs_trans_brelse(NULL, bp);
@@ -1600,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1600 ASSERT(state->path.blk[0].bp); 1588 ASSERT(state->path.blk[0].bp);
1601 state->path.blk[0].bp = NULL; 1589 state->path.blk[0].bp = NULL;
1602 1590
1603 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, 1591 error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
1604 XFS_ATTR_FORK);
1605 if (error) 1592 if (error)
1606 goto out; 1593 goto out;
1607 ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) ==
1608 cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1609 1594
1610 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1595 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1611 xfs_bmap_init(args->flist, args->firstblock); 1596 xfs_bmap_init(args->flist, args->firstblock);
@@ -1653,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state)
1653 xfs_da_state_blk_t *blk; 1638 xfs_da_state_blk_t *blk;
1654 int level; 1639 int level;
1655 1640
1641 trace_xfs_attr_fillstate(state->args);
1642
1656 /* 1643 /*
1657 * Roll down the "path" in the state structure, storing the on-disk 1644 * Roll down the "path" in the state structure, storing the on-disk
1658 * block number for those buffers in the "path". 1645 * block number for those buffers in the "path".
@@ -1699,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1699 xfs_da_state_blk_t *blk; 1686 xfs_da_state_blk_t *blk;
1700 int level, error; 1687 int level, error;
1701 1688
1689 trace_xfs_attr_refillstate(state->args);
1690
1702 /* 1691 /*
1703 * Roll down the "path" in the state structure, storing the on-disk 1692 * Roll down the "path" in the state structure, storing the on-disk
1704 * block number for those buffers in the "path". 1693 * block number for those buffers in the "path".
@@ -1707,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1707 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); 1696 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1708 for (blk = path->blk, level = 0; level < path->active; blk++, level++) { 1697 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1709 if (blk->disk_blkno) { 1698 if (blk->disk_blkno) {
1710 error = xfs_da_read_buf(state->args->trans, 1699 error = xfs_da_node_read(state->args->trans,
1711 state->args->dp, 1700 state->args->dp,
1712 blk->blkno, blk->disk_blkno, 1701 blk->blkno, blk->disk_blkno,
1713 &blk->bp, XFS_ATTR_FORK); 1702 &blk->bp, XFS_ATTR_FORK);
@@ -1726,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1726 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); 1715 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1727 for (blk = path->blk, level = 0; level < path->active; blk++, level++) { 1716 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1728 if (blk->disk_blkno) { 1717 if (blk->disk_blkno) {
1729 error = xfs_da_read_buf(state->args->trans, 1718 error = xfs_da_node_read(state->args->trans,
1730 state->args->dp, 1719 state->args->dp,
1731 blk->blkno, blk->disk_blkno, 1720 blk->blkno, blk->disk_blkno,
1732 &blk->bp, XFS_ATTR_FORK); 1721 &blk->bp, XFS_ATTR_FORK);
@@ -1755,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args)
1755 int error, retval; 1744 int error, retval;
1756 int i; 1745 int i;
1757 1746
1747 trace_xfs_attr_node_get(args);
1748
1758 state = xfs_da_state_alloc(); 1749 state = xfs_da_state_alloc();
1759 state->args = args; 1750 state->args = args;
1760 state->mp = args->dp->i_mount; 1751 state->mp = args->dp->i_mount;
@@ -1804,6 +1795,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1804 int error, i; 1795 int error, i;
1805 struct xfs_buf *bp; 1796 struct xfs_buf *bp;
1806 1797
1798 trace_xfs_attr_node_list(context);
1799
1807 cursor = context->cursor; 1800 cursor = context->cursor;
1808 cursor->initted = 1; 1801 cursor->initted = 1;
1809 1802
@@ -1814,7 +1807,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1814 */ 1807 */
1815 bp = NULL; 1808 bp = NULL;
1816 if (cursor->blkno > 0) { 1809 if (cursor->blkno > 0) {
1817 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, 1810 error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
1818 &bp, XFS_ATTR_FORK); 1811 &bp, XFS_ATTR_FORK);
1819 if ((error != 0) && (error != EFSCORRUPTED)) 1812 if ((error != 0) && (error != EFSCORRUPTED))
1820 return(error); 1813 return(error);
@@ -1856,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1856 if (bp == NULL) { 1849 if (bp == NULL) {
1857 cursor->blkno = 0; 1850 cursor->blkno = 0;
1858 for (;;) { 1851 for (;;) {
1859 error = xfs_da_read_buf(NULL, context->dp, 1852 error = xfs_da_node_read(NULL, context->dp,
1860 cursor->blkno, -1, &bp, 1853 cursor->blkno, -1, &bp,
1861 XFS_ATTR_FORK); 1854 XFS_ATTR_FORK);
1862 if (error) 1855 if (error)
1863 return(error); 1856 return(error);
1864 if (unlikely(bp == NULL)) {
1865 XFS_ERROR_REPORT("xfs_attr_node_list(2)",
1866 XFS_ERRLEVEL_LOW,
1867 context->dp->i_mount);
1868 return(XFS_ERROR(EFSCORRUPTED));
1869 }
1870 node = bp->b_addr; 1857 node = bp->b_addr;
1871 if (node->hdr.info.magic == 1858 if (node->hdr.info.magic ==
1872 cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) 1859 cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
@@ -1907,14 +1894,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1907 */ 1894 */
1908 for (;;) { 1895 for (;;) {
1909 leaf = bp->b_addr; 1896 leaf = bp->b_addr;
1910 if (unlikely(leaf->hdr.info.magic !=
1911 cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
1912 XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
1913 XFS_ERRLEVEL_LOW,
1914 context->dp->i_mount, leaf);
1915 xfs_trans_brelse(NULL, bp);
1916 return(XFS_ERROR(EFSCORRUPTED));
1917 }
1918 error = xfs_attr_leaf_list_int(bp, context); 1897 error = xfs_attr_leaf_list_int(bp, context);
1919 if (error) { 1898 if (error) {
1920 xfs_trans_brelse(NULL, bp); 1899 xfs_trans_brelse(NULL, bp);
@@ -1924,16 +1903,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1924 break; 1903 break;
1925 cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); 1904 cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
1926 xfs_trans_brelse(NULL, bp); 1905 xfs_trans_brelse(NULL, bp);
1927 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, 1906 error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
1928 &bp, XFS_ATTR_FORK); 1907 &bp);
1929 if (error) 1908 if (error)
1930 return(error); 1909 return error;
1931 if (unlikely((bp == NULL))) {
1932 XFS_ERROR_REPORT("xfs_attr_node_list(5)",
1933 XFS_ERRLEVEL_LOW,
1934 context->dp->i_mount);
1935 return(XFS_ERROR(EFSCORRUPTED));
1936 }
1937 } 1910 }
1938 xfs_trans_brelse(NULL, bp); 1911 xfs_trans_brelse(NULL, bp);
1939 return(0); 1912 return(0);
@@ -1959,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1959 int nmap, error, tmp, valuelen, blkcnt, i; 1932 int nmap, error, tmp, valuelen, blkcnt, i;
1960 xfs_dablk_t lblkno; 1933 xfs_dablk_t lblkno;
1961 1934
1935 trace_xfs_attr_rmtval_get(args);
1936
1962 ASSERT(!(args->flags & ATTR_KERNOVAL)); 1937 ASSERT(!(args->flags & ATTR_KERNOVAL));
1963 1938
1964 mp = args->dp->i_mount; 1939 mp = args->dp->i_mount;
@@ -1980,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1980 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); 1955 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
1981 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 1956 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
1982 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 1957 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1983 dblkno, blkcnt, 0, &bp); 1958 dblkno, blkcnt, 0, &bp, NULL);
1984 if (error) 1959 if (error)
1985 return(error); 1960 return(error);
1986 1961
@@ -2014,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2014 xfs_dablk_t lblkno; 1989 xfs_dablk_t lblkno;
2015 int blkcnt, valuelen, nmap, error, tmp, committed; 1990 int blkcnt, valuelen, nmap, error, tmp, committed;
2016 1991
1992 trace_xfs_attr_rmtval_set(args);
1993
2017 dp = args->dp; 1994 dp = args->dp;
2018 mp = dp->i_mount; 1995 mp = dp->i_mount;
2019 src = args->value; 1996 src = args->value;
@@ -2143,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2143 xfs_dablk_t lblkno; 2120 xfs_dablk_t lblkno;
2144 int valuelen, blkcnt, nmap, error, done, committed; 2121 int valuelen, blkcnt, nmap, error, done, committed;
2145 2122
2123 trace_xfs_attr_rmtval_remove(args);
2124
2146 mp = args->dp->i_mount; 2125 mp = args->dp->i_mount;
2147 2126
2148 /* 2127 /*
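
Every conversion in this file leans on xfs_attr_leaf_read(), defined in the xfs_attr_leaf.c hunks below as a thin wrapper around xfs_da_read_buf() that attaches the leaf verifier. Conceptually each call site shrinks to:

        struct xfs_buf  *bp;
        int             error;

        /* replaces xfs_da_read_buf() plus open-coded magic-number checks */
        error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
        if (error)
                return error;
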
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d330111ca738..ee24993c7d12 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
57 struct xfs_buf **bpp); 57 struct xfs_buf **bpp);
58STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, 58STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
59 xfs_da_args_t *args, int freemap_index); 59 xfs_da_args_t *args, int freemap_index);
60STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); 60STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
61 struct xfs_buf *leaf_buffer);
61STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, 62STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
62 xfs_da_state_blk_t *blk1, 63 xfs_da_state_blk_t *blk1,
63 xfs_da_state_blk_t *blk2); 64 xfs_da_state_blk_t *blk2);
@@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
87 xfs_mount_t *mp); 88 xfs_mount_t *mp);
88STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); 89STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
89 90
91static void
92xfs_attr_leaf_verify(
93 struct xfs_buf *bp)
94{
95 struct xfs_mount *mp = bp->b_target->bt_mount;
96 struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
97 int block_ok = 0;
98
99 block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
100 if (!block_ok) {
101 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
102 xfs_buf_ioerror(bp, EFSCORRUPTED);
103 }
104}
105
106static void
107xfs_attr_leaf_read_verify(
108 struct xfs_buf *bp)
109{
110 xfs_attr_leaf_verify(bp);
111}
112
113static void
114xfs_attr_leaf_write_verify(
115 struct xfs_buf *bp)
116{
117 xfs_attr_leaf_verify(bp);
118}
119
120const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
121 .verify_read = xfs_attr_leaf_read_verify,
122 .verify_write = xfs_attr_leaf_write_verify,
123};
124
125int
126xfs_attr_leaf_read(
127 struct xfs_trans *tp,
128 struct xfs_inode *dp,
129 xfs_dablk_t bno,
130 xfs_daddr_t mappedbno,
131 struct xfs_buf **bpp)
132{
133 return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
134 XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
135}
136
90/*======================================================================== 137/*========================================================================
91 * Namespace helper routines 138 * Namespace helper routines
92 *========================================================================*/ 139 *========================================================================*/
@@ -869,17 +916,16 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
869 error = xfs_da_grow_inode(args, &blkno); 916 error = xfs_da_grow_inode(args, &blkno);
870 if (error) 917 if (error)
871 goto out; 918 goto out;
872 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, 919 error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
873 XFS_ATTR_FORK);
874 if (error) 920 if (error)
875 goto out; 921 goto out;
876 ASSERT(bp1 != NULL); 922
877 bp2 = NULL; 923 bp2 = NULL;
878 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, 924 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
879 XFS_ATTR_FORK); 925 XFS_ATTR_FORK);
880 if (error) 926 if (error)
881 goto out; 927 goto out;
882 ASSERT(bp2 != NULL); 928 bp2->b_ops = bp1->b_ops;
883 memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); 929 memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
884 bp1 = NULL; 930 bp1 = NULL;
885 xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); 931 xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
@@ -933,7 +979,7 @@ xfs_attr_leaf_create(
933 XFS_ATTR_FORK); 979 XFS_ATTR_FORK);
934 if (error) 980 if (error)
935 return(error); 981 return(error);
936 ASSERT(bp != NULL); 982 bp->b_ops = &xfs_attr_leaf_buf_ops;
937 leaf = bp->b_addr; 983 leaf = bp->b_addr;
938 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); 984 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
939 hdr = &leaf->hdr; 985 hdr = &leaf->hdr;
@@ -1071,7 +1117,7 @@ xfs_attr_leaf_add(
1071 * Compact the entries to coalesce free space. 1117 * Compact the entries to coalesce free space.
1072 * This may change the hdr->count via dropping INCOMPLETE entries. 1118 * This may change the hdr->count via dropping INCOMPLETE entries.
1073 */ 1119 */
1074 xfs_attr_leaf_compact(args->trans, bp); 1120 xfs_attr_leaf_compact(args, bp);
1075 1121
1076 /* 1122 /*
1077 * After compaction, the block is guaranteed to have only one 1123 * After compaction, the block is guaranteed to have only one
@@ -1102,6 +1148,8 @@ xfs_attr_leaf_add_work(
1102 xfs_mount_t *mp; 1148 xfs_mount_t *mp;
1103 int tmp, i; 1149 int tmp, i;
1104 1150
1151 trace_xfs_attr_leaf_add_work(args);
1152
1105 leaf = bp->b_addr; 1153 leaf = bp->b_addr;
1106 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1154 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1107 hdr = &leaf->hdr; 1155 hdr = &leaf->hdr;
@@ -1214,15 +1262,17 @@ xfs_attr_leaf_add_work(
1214 */ 1262 */
1215STATIC void 1263STATIC void
1216xfs_attr_leaf_compact( 1264xfs_attr_leaf_compact(
1217 struct xfs_trans *trans, 1265 struct xfs_da_args *args,
1218 struct xfs_buf *bp) 1266 struct xfs_buf *bp)
1219{ 1267{
1220 xfs_attr_leafblock_t *leaf_s, *leaf_d; 1268 xfs_attr_leafblock_t *leaf_s, *leaf_d;
1221 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; 1269 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
1222 xfs_mount_t *mp; 1270 struct xfs_trans *trans = args->trans;
1223 char *tmpbuffer; 1271 struct xfs_mount *mp = trans->t_mountp;
1272 char *tmpbuffer;
1273
1274 trace_xfs_attr_leaf_compact(args);
1224 1275
1225 mp = trans->t_mountp;
1226 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); 1276 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
1227 ASSERT(tmpbuffer != NULL); 1277 ASSERT(tmpbuffer != NULL);
1228 memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); 1278 memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
@@ -1291,6 +1341,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1291 leaf2 = blk2->bp->b_addr; 1341 leaf2 = blk2->bp->b_addr;
1292 ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1342 ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1293 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1343 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1344 ASSERT(leaf2->hdr.count == 0);
1294 args = state->args; 1345 args = state->args;
1295 1346
1296 trace_xfs_attr_leaf_rebalance(args); 1347 trace_xfs_attr_leaf_rebalance(args);
@@ -1344,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1344 max = be16_to_cpu(hdr2->firstused) 1395 max = be16_to_cpu(hdr2->firstused)
1345 - sizeof(xfs_attr_leaf_hdr_t); 1396 - sizeof(xfs_attr_leaf_hdr_t);
1346 max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); 1397 max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
1347 if (space > max) { 1398 if (space > max)
1348 xfs_attr_leaf_compact(args->trans, blk2->bp); 1399 xfs_attr_leaf_compact(args, blk2->bp);
1349 }
1350 1400
1351 /* 1401 /*
1352 * Move high entries from leaf1 to low end of leaf2. 1402 * Move high entries from leaf1 to low end of leaf2.
@@ -1361,6 +1411,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1361 * I assert that since all callers pass in an empty 1411 * I assert that since all callers pass in an empty
1362 * second buffer, this code should never execute. 1412 * second buffer, this code should never execute.
1363 */ 1413 */
1414 ASSERT(0);
1364 1415
1365 /* 1416 /*
1366 * Figure the total bytes to be added to the destination leaf. 1417 * Figure the total bytes to be added to the destination leaf.
@@ -1376,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1376 max = be16_to_cpu(hdr1->firstused) 1427 max = be16_to_cpu(hdr1->firstused)
1377 - sizeof(xfs_attr_leaf_hdr_t); 1428 - sizeof(xfs_attr_leaf_hdr_t);
1378 max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); 1429 max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
1379 if (space > max) { 1430 if (space > max)
1380 xfs_attr_leaf_compact(args->trans, blk1->bp); 1431 xfs_attr_leaf_compact(args, blk1->bp);
1381 }
1382 1432
1383 /* 1433 /*
1384 * Move low entries from leaf2 to high end of leaf1. 1434 * Move low entries from leaf2 to high end of leaf1.
@@ -1422,10 +1472,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1422 args->index2 = 0; 1472 args->index2 = 0;
1423 args->blkno2 = blk2->blkno; 1473 args->blkno2 = blk2->blkno;
1424 } else { 1474 } else {
1475 /*
1476 * On a double leaf split, the original attr location
1477 * is already stored in blkno2/index2, so don't
1478 * overwrite it, otherwise we corrupt the tree.
1479 */
1425 blk2->index = blk1->index 1480 blk2->index = blk1->index
1426 - be16_to_cpu(leaf1->hdr.count); 1481 - be16_to_cpu(leaf1->hdr.count);
1427 args->index = args->index2 = blk2->index; 1482 args->index = blk2->index;
1428 args->blkno = args->blkno2 = blk2->blkno; 1483 args->blkno = blk2->blkno;
1484 if (!state->extravalid) {
1485 /*
1486 * set the new attr location to match the old
1487 * one and let the higher level split code
1488 * decide where in the leaf to place it.
1489 */
1490 args->index2 = blk2->index;
1491 args->blkno2 = blk2->blkno;
1492 }
1429 } 1493 }
1430 } else { 1494 } else {
1431 ASSERT(state->inleaf == 1); 1495 ASSERT(state->inleaf == 1);
@@ -1561,6 +1625,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
1561 xfs_dablk_t blkno; 1625 xfs_dablk_t blkno;
1562 struct xfs_buf *bp; 1626 struct xfs_buf *bp;
1563 1627
1628 trace_xfs_attr_leaf_toosmall(state->args);
1629
1564 /* 1630 /*
1565 * Check for the degenerate case of the block being over 50% full. 1631 * Check for the degenerate case of the block being over 50% full.
1566 * If so, it's not worth even looking to see if we might be able 1632 * If so, it's not worth even looking to see if we might be able
@@ -1620,18 +1686,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
1620 blkno = be32_to_cpu(info->back); 1686 blkno = be32_to_cpu(info->back);
1621 if (blkno == 0) 1687 if (blkno == 0)
1622 continue; 1688 continue;
1623 error = xfs_da_read_buf(state->args->trans, state->args->dp, 1689 error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
1624 blkno, -1, &bp, XFS_ATTR_FORK); 1690 blkno, -1, &bp);
1625 if (error) 1691 if (error)
1626 return(error); 1692 return(error);
1627 ASSERT(bp != NULL);
1628 1693
1629 leaf = (xfs_attr_leafblock_t *)info; 1694 leaf = (xfs_attr_leafblock_t *)info;
1630 count = be16_to_cpu(leaf->hdr.count); 1695 count = be16_to_cpu(leaf->hdr.count);
1631 bytes = state->blocksize - (state->blocksize>>2); 1696 bytes = state->blocksize - (state->blocksize>>2);
1632 bytes -= be16_to_cpu(leaf->hdr.usedbytes); 1697 bytes -= be16_to_cpu(leaf->hdr.usedbytes);
1633 leaf = bp->b_addr; 1698 leaf = bp->b_addr;
1634 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1635 count += be16_to_cpu(leaf->hdr.count); 1699 count += be16_to_cpu(leaf->hdr.count);
1636 bytes -= be16_to_cpu(leaf->hdr.usedbytes); 1700 bytes -= be16_to_cpu(leaf->hdr.usedbytes);
1637 bytes -= count * sizeof(xfs_attr_leaf_entry_t); 1701 bytes -= count * sizeof(xfs_attr_leaf_entry_t);
@@ -1686,6 +1750,8 @@ xfs_attr_leaf_remove(
1686 int tablesize, tmp, i; 1750 int tablesize, tmp, i;
1687 xfs_mount_t *mp; 1751 xfs_mount_t *mp;
1688 1752
1753 trace_xfs_attr_leaf_remove(args);
1754
1689 leaf = bp->b_addr; 1755 leaf = bp->b_addr;
1690 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1756 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1691 hdr = &leaf->hdr; 1757 hdr = &leaf->hdr;
@@ -2495,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2495 /* 2561 /*
2496 * Set up the operation. 2562 * Set up the operation.
2497 */ 2563 */
2498 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 2564 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2499 XFS_ATTR_FORK); 2565 if (error)
2500 if (error) {
2501 return(error); 2566 return(error);
2502 }
2503 ASSERT(bp != NULL);
2504 2567
2505 leaf = bp->b_addr; 2568 leaf = bp->b_addr;
2506 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2507 ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); 2569 ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
2508 ASSERT(args->index >= 0); 2570 ASSERT(args->index >= 0);
2509 entry = &leaf->entries[ args->index ]; 2571 entry = &leaf->entries[ args->index ];
@@ -2560,15 +2622,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
2560 /* 2622 /*
2561 * Set up the operation. 2623 * Set up the operation.
2562 */ 2624 */
2563 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 2625 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2564 XFS_ATTR_FORK); 2626 if (error)
2565 if (error) {
2566 return(error); 2627 return(error);
2567 }
2568 ASSERT(bp != NULL);
2569 2628
2570 leaf = bp->b_addr; 2629 leaf = bp->b_addr;
2571 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2572 ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); 2630 ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
2573 ASSERT(args->index >= 0); 2631 ASSERT(args->index >= 0);
2574 entry = &leaf->entries[ args->index ]; 2632 entry = &leaf->entries[ args->index ];
@@ -2617,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2617 /* 2675 /*
2618 * Read the block containing the "old" attr 2676 * Read the block containing the "old" attr
2619 */ 2677 */
2620 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, 2678 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
2621 XFS_ATTR_FORK); 2679 if (error)
2622 if (error) { 2680 return error;
2623 return(error);
2624 }
2625 ASSERT(bp1 != NULL);
2626 2681
2627 /* 2682 /*
2628 * Read the block containing the "new" attr, if it is different 2683 * Read the block containing the "new" attr, if it is different
2629 */ 2684 */
2630 if (args->blkno2 != args->blkno) { 2685 if (args->blkno2 != args->blkno) {
2631 error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, 2686 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
2632 -1, &bp2, XFS_ATTR_FORK); 2687 -1, &bp2);
2633 if (error) { 2688 if (error)
2634 return(error); 2689 return error;
2635 }
2636 ASSERT(bp2 != NULL);
2637 } else { 2690 } else {
2638 bp2 = bp1; 2691 bp2 = bp1;
2639 } 2692 }
2640 2693
2641 leaf1 = bp1->b_addr; 2694 leaf1 = bp1->b_addr;
2642 ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2643 ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); 2695 ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
2644 ASSERT(args->index >= 0); 2696 ASSERT(args->index >= 0);
2645 entry1 = &leaf1->entries[ args->index ]; 2697 entry1 = &leaf1->entries[ args->index ];
2646 2698
2647 leaf2 = bp2->b_addr; 2699 leaf2 = bp2->b_addr;
2648 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2649 ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); 2700 ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
2650 ASSERT(args->index2 >= 0); 2701 ASSERT(args->index2 >= 0);
2651 entry2 = &leaf2->entries[ args->index2 ]; 2702 entry2 = &leaf2->entries[ args->index2 ];
@@ -2730,7 +2781,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
2730 * the extents in reverse order the extent containing 2781 * the extents in reverse order the extent containing
2731 * block 0 must still be there. 2782 * block 0 must still be there.
2732 */ 2783 */
2733 error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); 2784 error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
2734 if (error) 2785 if (error)
2735 return(error); 2786 return(error);
2736 blkno = XFS_BUF_ADDR(bp); 2787 blkno = XFS_BUF_ADDR(bp);
@@ -2815,7 +2866,7 @@ xfs_attr_node_inactive(
2815 * traversal of the tree so we may deal with many blocks 2866 * traversal of the tree so we may deal with many blocks
2816 * before we come back to this one. 2867 * before we come back to this one.
2817 */ 2868 */
2818 error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, 2869 error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
2819 XFS_ATTR_FORK); 2870 XFS_ATTR_FORK);
2820 if (error) 2871 if (error)
2821 return(error); 2872 return(error);
@@ -2856,8 +2907,8 @@ xfs_attr_node_inactive(
2856 * child block number. 2907 * child block number.
2857 */ 2908 */
2858 if ((i+1) < count) { 2909 if ((i+1) < count) {
2859 error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, 2910 error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
2860 &bp, XFS_ATTR_FORK); 2911 &bp, XFS_ATTR_FORK);
2861 if (error) 2912 if (error)
2862 return(error); 2913 return(error);
2863 child_fsb = be32_to_cpu(node->btree[i+1].before); 2914 child_fsb = be32_to_cpu(node->btree[i+1].before);
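The pattern introduced in this file repeats for every metadata type in the series: a verifier that checks the on-disk magic, an xfs_buf_ops table wiring it into read and write completion, and a typed read helper so callers stop open-coding xfs_da_read_buf() plus ASSERTs. A minimal caller-side sketch of that transformation, assuming only the 3.8-era APIs shown in this diff (error handling trimmed):

	/* Before: untyped read; every caller validated the block itself. */
	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
				XFS_ATTR_FORK);
	ASSERT(bp != NULL);
	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));

	/* After: xfs_attr_leaf_read() attaches xfs_attr_leaf_buf_ops, so the
	 * magic check runs once at I/O completion and corruption surfaces as
	 * EFSCORRUPTED through the ordinary error return. */
	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
	if (error)
		return error;

This is why the hunks above can delete the ASSERT(bp != NULL) and magic-number assertions wholesale: a buffer that reaches the caller has already passed the verifier.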
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index dea17722945e..77de139a58f0 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -261,4 +261,10 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
261 struct xfs_buf *leaf2_bp); 261 struct xfs_buf *leaf2_bp);
262int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, 262int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
263 int *local); 263 int *local);
264int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
265 xfs_dablk_t bno, xfs_daddr_t mappedbno,
266 struct xfs_buf **bpp);
267
268extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
269
264#endif /* __XFS_ATTR_LEAF_H__ */ 270#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 848ffa77707b..0e92d12765d2 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2437,6 +2437,7 @@ xfs_bmap_btalloc(
2437 * Normal allocation, done through xfs_alloc_vextent. 2437 * Normal allocation, done through xfs_alloc_vextent.
2438 */ 2438 */
2439 tryagain = isaligned = 0; 2439 tryagain = isaligned = 0;
2440 memset(&args, 0, sizeof(args));
2440 args.tp = ap->tp; 2441 args.tp = ap->tp;
2441 args.mp = mp; 2442 args.mp = mp;
2442 args.fsbno = ap->blkno; 2443 args.fsbno = ap->blkno;
@@ -2661,8 +2662,9 @@ xfs_bmap_btree_to_extents(
2661 if ((error = xfs_btree_check_lptr(cur, cbno, 1))) 2662 if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
2662 return error; 2663 return error;
2663#endif 2664#endif
2664 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, 2665 error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
2665 XFS_BMAP_BTREE_REF))) 2666 &xfs_bmbt_buf_ops);
2667 if (error)
2666 return error; 2668 return error;
2667 cblock = XFS_BUF_TO_BLOCK(cbp); 2669 cblock = XFS_BUF_TO_BLOCK(cbp);
2668 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) 2670 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
@@ -3082,6 +3084,7 @@ xfs_bmap_extents_to_btree(
3082 * Convert to a btree with two levels, one record in root. 3084 * Convert to a btree with two levels, one record in root.
3083 */ 3085 */
3084 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); 3086 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
3087 memset(&args, 0, sizeof(args));
3085 args.tp = tp; 3088 args.tp = tp;
3086 args.mp = mp; 3089 args.mp = mp;
3087 args.firstblock = *firstblock; 3090 args.firstblock = *firstblock;
@@ -3121,6 +3124,7 @@ xfs_bmap_extents_to_btree(
3121 /* 3124 /*
3122 * Fill in the child block. 3125 * Fill in the child block.
3123 */ 3126 */
3127 abp->b_ops = &xfs_bmbt_buf_ops;
3124 ablock = XFS_BUF_TO_BLOCK(abp); 3128 ablock = XFS_BUF_TO_BLOCK(abp);
3125 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); 3129 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
3126 ablock->bb_level = 0; 3130 ablock->bb_level = 0;
@@ -3237,6 +3241,7 @@ xfs_bmap_local_to_extents(
3237 xfs_buf_t *bp; /* buffer for extent block */ 3241 xfs_buf_t *bp; /* buffer for extent block */
3238 xfs_bmbt_rec_host_t *ep;/* extent record pointer */ 3242 xfs_bmbt_rec_host_t *ep;/* extent record pointer */
3239 3243
3244 memset(&args, 0, sizeof(args));
3240 args.tp = tp; 3245 args.tp = tp;
3241 args.mp = ip->i_mount; 3246 args.mp = ip->i_mount;
3242 args.firstblock = *firstblock; 3247 args.firstblock = *firstblock;
@@ -3266,6 +3271,7 @@ xfs_bmap_local_to_extents(
3266 ASSERT(args.len == 1); 3271 ASSERT(args.len == 1);
3267 *firstblock = args.fsbno; 3272 *firstblock = args.fsbno;
3268 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); 3273 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
3274 bp->b_ops = &xfs_bmbt_buf_ops;
3269 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); 3275 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
3270 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); 3276 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
3271 xfs_bmap_forkoff_reset(args.mp, ip, whichfork); 3277 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
@@ -4075,8 +4081,9 @@ xfs_bmap_read_extents(
4075 * pointer (leftmost) at each level. 4081 * pointer (leftmost) at each level.
4076 */ 4082 */
4077 while (level-- > 0) { 4083 while (level-- > 0) {
4078 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4084 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4079 XFS_BMAP_BTREE_REF))) 4085 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
4086 if (error)
4080 return error; 4087 return error;
4081 block = XFS_BUF_TO_BLOCK(bp); 4088 block = XFS_BUF_TO_BLOCK(bp);
4082 XFS_WANT_CORRUPTED_GOTO( 4089 XFS_WANT_CORRUPTED_GOTO(
@@ -4121,7 +4128,8 @@ xfs_bmap_read_extents(
4121 */ 4128 */
4122 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 4129 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
4123 if (nextbno != NULLFSBLOCK) 4130 if (nextbno != NULLFSBLOCK)
4124 xfs_btree_reada_bufl(mp, nextbno, 1); 4131 xfs_btree_reada_bufl(mp, nextbno, 1,
4132 &xfs_bmbt_buf_ops);
4125 /* 4133 /*
4126 * Copy records into the extent records. 4134 * Copy records into the extent records.
4127 */ 4135 */
@@ -4153,8 +4161,9 @@ xfs_bmap_read_extents(
4153 */ 4161 */
4154 if (bno == NULLFSBLOCK) 4162 if (bno == NULLFSBLOCK)
4155 break; 4163 break;
4156 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4164 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4157 XFS_BMAP_BTREE_REF))) 4165 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
4166 if (error)
4158 return error; 4167 return error;
4159 block = XFS_BUF_TO_BLOCK(bp); 4168 block = XFS_BUF_TO_BLOCK(bp);
4160 } 4169 }
@@ -4616,12 +4625,11 @@ xfs_bmapi_delay(
4616 4625
4617 4626
4618STATIC int 4627STATIC int
4619xfs_bmapi_allocate( 4628__xfs_bmapi_allocate(
4620 struct xfs_bmalloca *bma, 4629 struct xfs_bmalloca *bma)
4621 int flags)
4622{ 4630{
4623 struct xfs_mount *mp = bma->ip->i_mount; 4631 struct xfs_mount *mp = bma->ip->i_mount;
4624 int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 4632 int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
4625 XFS_ATTR_FORK : XFS_DATA_FORK; 4633 XFS_ATTR_FORK : XFS_DATA_FORK;
4626 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); 4634 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4627 int tmp_logflags = 0; 4635 int tmp_logflags = 0;
@@ -4654,24 +4662,27 @@ xfs_bmapi_allocate(
4654 * Indicate if this is the first user data in the file, or just any 4662 * Indicate if this is the first user data in the file, or just any
4655 * user data. 4663 * user data.
4656 */ 4664 */
4657 if (!(flags & XFS_BMAPI_METADATA)) { 4665 if (!(bma->flags & XFS_BMAPI_METADATA)) {
4658 bma->userdata = (bma->offset == 0) ? 4666 bma->userdata = (bma->offset == 0) ?
4659 XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; 4667 XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
4660 } 4668 }
4661 4669
4662 bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; 4670 bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
4663 4671
4664 /* 4672 /*
4665 * Only want to do the alignment at the eof if it is userdata and 4673 * Only want to do the alignment at the eof if it is userdata and
4666 * allocation length is larger than a stripe unit. 4674 * allocation length is larger than a stripe unit.
4667 */ 4675 */
4668 if (mp->m_dalign && bma->length >= mp->m_dalign && 4676 if (mp->m_dalign && bma->length >= mp->m_dalign &&
4669 !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { 4677 !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
4670 error = xfs_bmap_isaeof(bma, whichfork); 4678 error = xfs_bmap_isaeof(bma, whichfork);
4671 if (error) 4679 if (error)
4672 return error; 4680 return error;
4673 } 4681 }
4674 4682
4683 if (bma->flags & XFS_BMAPI_STACK_SWITCH)
4684 bma->stack_switch = 1;
4685
4675 error = xfs_bmap_alloc(bma); 4686 error = xfs_bmap_alloc(bma);
4676 if (error) 4687 if (error)
4677 return error; 4688 return error;
@@ -4706,7 +4717,7 @@ xfs_bmapi_allocate(
4706 * A wasdelay extent has been initialized, so shouldn't be flagged 4717 * A wasdelay extent has been initialized, so shouldn't be flagged
4707 * as unwritten. 4718 * as unwritten.
4708 */ 4719 */
4709 if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && 4720 if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
4710 xfs_sb_version_hasextflgbit(&mp->m_sb)) 4721 xfs_sb_version_hasextflgbit(&mp->m_sb))
4711 bma->got.br_state = XFS_EXT_UNWRITTEN; 4722 bma->got.br_state = XFS_EXT_UNWRITTEN;
4712 4723
@@ -4734,6 +4745,45 @@ xfs_bmapi_allocate(
4734 return 0; 4745 return 0;
4735} 4746}
4736 4747
4748static void
4749xfs_bmapi_allocate_worker(
4750 struct work_struct *work)
4751{
4752 struct xfs_bmalloca *args = container_of(work,
4753 struct xfs_bmalloca, work);
4754 unsigned long pflags;
4755
4756 /* we are in a transaction context here */
4757 current_set_flags_nested(&pflags, PF_FSTRANS);
4758
4759 args->result = __xfs_bmapi_allocate(args);
4760 complete(args->done);
4761
4762 current_restore_flags_nested(&pflags, PF_FSTRANS);
4763}
4764
4765/*
4766 * Some allocation requests often come in with little stack to work on. Push
4767 * them off to a worker thread so there is lots of stack to use. Otherwise just
4768 * call directly to avoid the context switch overhead here.
4769 */
4770int
4771xfs_bmapi_allocate(
4772 struct xfs_bmalloca *args)
4773{
4774 DECLARE_COMPLETION_ONSTACK(done);
4775
4776 if (!args->stack_switch)
4777 return __xfs_bmapi_allocate(args);
4778
4779
4780 args->done = &done;
4781 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
4782 queue_work(xfs_alloc_wq, &args->work);
4783 wait_for_completion(&done);
4784 return args->result;
4785}
4786
4737STATIC int 4787STATIC int
4738xfs_bmapi_convert_unwritten( 4788xfs_bmapi_convert_unwritten(
4739 struct xfs_bmalloca *bma, 4789 struct xfs_bmalloca *bma,
@@ -4919,6 +4969,7 @@ xfs_bmapi_write(
4919 bma.conv = !!(flags & XFS_BMAPI_CONVERT); 4969 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4920 bma.wasdel = wasdelay; 4970 bma.wasdel = wasdelay;
4921 bma.offset = bno; 4971 bma.offset = bno;
4972 bma.flags = flags;
4922 4973
4923 /* 4974 /*
4924 * There's a 32/64 bit type mismatch between the 4975 * There's a 32/64 bit type mismatch between the
@@ -4934,7 +4985,7 @@ xfs_bmapi_write(
4934 4985
4935 ASSERT(len > 0); 4986 ASSERT(len > 0);
4936 ASSERT(bma.length > 0); 4987 ASSERT(bma.length > 0);
4937 error = xfs_bmapi_allocate(&bma, flags); 4988 error = xfs_bmapi_allocate(&bma);
4938 if (error) 4989 if (error)
4939 goto error0; 4990 goto error0;
4940 if (bma.blkno == NULLFSBLOCK) 4991 if (bma.blkno == NULLFSBLOCK)
@@ -5554,7 +5605,7 @@ xfs_getbmap(
5554 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5605 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5555 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 5606 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5556 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { 5607 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5557 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); 5608 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
5558 if (error) 5609 if (error)
5559 goto out_unlock_iolock; 5610 goto out_unlock_iolock;
5560 } 5611 }
@@ -5823,15 +5874,16 @@ xfs_bmap_check_leaf_extents(
5823 */ 5874 */
5824 while (level-- > 0) { 5875 while (level-- > 0) {
5825 /* See if buf is in cur first */ 5876 /* See if buf is in cur first */
5877 bp_release = 0;
5826 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); 5878 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5827 if (bp) { 5879 if (!bp) {
5828 bp_release = 0;
5829 } else {
5830 bp_release = 1; 5880 bp_release = 1;
5881 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5882 XFS_BMAP_BTREE_REF,
5883 &xfs_bmbt_buf_ops);
5884 if (error)
5885 goto error_norelse;
5831 } 5886 }
5832 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5833 XFS_BMAP_BTREE_REF)))
5834 goto error_norelse;
5835 block = XFS_BUF_TO_BLOCK(bp); 5887 block = XFS_BUF_TO_BLOCK(bp);
5836 XFS_WANT_CORRUPTED_GOTO( 5888 XFS_WANT_CORRUPTED_GOTO(
5837 xfs_bmap_sanity_check(mp, bp, level), 5889 xfs_bmap_sanity_check(mp, bp, level),
@@ -5908,15 +5960,16 @@ xfs_bmap_check_leaf_extents(
5908 if (bno == NULLFSBLOCK) 5960 if (bno == NULLFSBLOCK)
5909 break; 5961 break;
5910 5962
5963 bp_release = 0;
5911 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); 5964 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5912 if (bp) { 5965 if (!bp) {
5913 bp_release = 0;
5914 } else {
5915 bp_release = 1; 5966 bp_release = 1;
5967 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5968 XFS_BMAP_BTREE_REF,
5969 &xfs_bmbt_buf_ops);
5970 if (error)
5971 goto error_norelse;
5916 } 5972 }
5917 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5918 XFS_BMAP_BTREE_REF)))
5919 goto error_norelse;
5920 block = XFS_BUF_TO_BLOCK(bp); 5973 block = XFS_BUF_TO_BLOCK(bp);
5921 } 5974 }
5922 if (bp_release) { 5975 if (bp_release) {
@@ -6007,7 +6060,9 @@ xfs_bmap_count_tree(
6007 struct xfs_btree_block *block, *nextblock; 6060 struct xfs_btree_block *block, *nextblock;
6008 int numrecs; 6061 int numrecs;
6009 6062
6010 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) 6063 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
6064 &xfs_bmbt_buf_ops);
6065 if (error)
6011 return error; 6066 return error;
6012 *count += 1; 6067 *count += 1;
6013 block = XFS_BUF_TO_BLOCK(bp); 6068 block = XFS_BUF_TO_BLOCK(bp);
@@ -6016,8 +6071,10 @@ xfs_bmap_count_tree(
6016 /* Not at node above leaves, count this level of nodes */ 6071 /* Not at node above leaves, count this level of nodes */
6017 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 6072 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6018 while (nextbno != NULLFSBLOCK) { 6073 while (nextbno != NULLFSBLOCK) {
6019 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6074 error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
6020 0, &nbp, XFS_BMAP_BTREE_REF))) 6075 XFS_BMAP_BTREE_REF,
6076 &xfs_bmbt_buf_ops);
6077 if (error)
6021 return error; 6078 return error;
6022 *count += 1; 6079 *count += 1;
6023 nextblock = XFS_BUF_TO_BLOCK(nbp); 6080 nextblock = XFS_BUF_TO_BLOCK(nbp);
@@ -6046,8 +6103,10 @@ xfs_bmap_count_tree(
6046 if (nextbno == NULLFSBLOCK) 6103 if (nextbno == NULLFSBLOCK)
6047 break; 6104 break;
6048 bno = nextbno; 6105 bno = nextbno;
6049 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 6106 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
6050 XFS_BMAP_BTREE_REF))) 6107 XFS_BMAP_BTREE_REF,
6108 &xfs_bmbt_buf_ops);
6109 if (error)
6051 return error; 6110 return error;
6052 *count += 1; 6111 *count += 1;
6053 block = XFS_BUF_TO_BLOCK(bp); 6112 block = XFS_BUF_TO_BLOCK(bp);
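__xfs_bmapi_allocate() plus xfs_bmapi_allocate_worker() is the standard kernel idiom for trading one context switch for a fresh stack: the caller parks on an on-stack completion while the stack-hungry allocation runs on a workqueue thread. A self-contained sketch of the idiom; the demo_* names are illustrative, only the workqueue/completion APIs are real:

	#include <linux/workqueue.h>
	#include <linux/completion.h>

	struct demo_args {
		struct completion	*done;	/* caller's on-stack completion */
		struct work_struct	work;
		int			result;
	};

	static int demo_deep_stack_op(struct demo_args *args)
	{
		return 0;			/* the stack-hungry work goes here */
	}

	static void demo_worker(struct work_struct *work)
	{
		struct demo_args *args = container_of(work, struct demo_args, work);

		args->result = demo_deep_stack_op(args);
		complete(args->done);		/* wake the waiting caller */
	}

	static int demo_run(struct workqueue_struct *wq, struct demo_args *args)
	{
		DECLARE_COMPLETION_ONSTACK(done);

		args->done = &done;
		INIT_WORK_ONSTACK(&args->work, demo_worker);
		queue_work(wq, &args->work);
		/* The work item lives on our stack, so we must not return
		 * before it has run to completion. */
		wait_for_completion(&done);
		return args->result;
	}

The real worker also wraps the call in current_set_flags_nested(&pflags, PF_FSTRANS): the allocation still executes in transaction context, and memory reclaim on the worker thread must not be allowed to recurse into the filesystem.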
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 803b56d7ce16..5f469c3516eb 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -77,6 +77,7 @@ typedef struct xfs_bmap_free
77 * from written to unwritten, otherwise convert from unwritten to written. 77 * from written to unwritten, otherwise convert from unwritten to written.
78 */ 78 */
79#define XFS_BMAPI_CONVERT 0x040 79#define XFS_BMAPI_CONVERT 0x040
80#define XFS_BMAPI_STACK_SWITCH 0x080
80 81
81#define XFS_BMAPI_FLAGS \ 82#define XFS_BMAPI_FLAGS \
82 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 83 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -85,7 +86,8 @@ typedef struct xfs_bmap_free
85 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 86 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
86 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 87 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
87 { XFS_BMAPI_CONTIG, "CONTIG" }, \ 88 { XFS_BMAPI_CONTIG, "CONTIG" }, \
88 { XFS_BMAPI_CONVERT, "CONVERT" } 89 { XFS_BMAPI_CONVERT, "CONVERT" }, \
90 { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" }
89 91
90 92
91static inline int xfs_bmapi_aflag(int w) 93static inline int xfs_bmapi_aflag(int w)
@@ -133,6 +135,11 @@ typedef struct xfs_bmalloca {
133 char userdata;/* set if is user data */ 135 char userdata;/* set if is user data */
134 char aeof; /* allocated space at eof */ 136 char aeof; /* allocated space at eof */
135 char conv; /* overwriting unwritten extents */ 137 char conv; /* overwriting unwritten extents */
138 char stack_switch;
139 int flags;
140 struct completion *done;
141 struct work_struct work;
142 int result;
136} xfs_bmalloca_t; 143} xfs_bmalloca_t;
137 144
138/* 145/*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 862084a47a7e..061b45cbe614 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -36,6 +36,7 @@
36#include "xfs_bmap.h" 36#include "xfs_bmap.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_quota.h" 38#include "xfs_quota.h"
39#include "xfs_trace.h"
39 40
40/* 41/*
41 * Determine the extent state. 42 * Determine the extent state.
@@ -707,6 +708,67 @@ xfs_bmbt_key_diff(
707 cur->bc_rec.b.br_startoff; 708 cur->bc_rec.b.br_startoff;
708} 709}
709 710
711static void
712xfs_bmbt_verify(
713 struct xfs_buf *bp)
714{
715 struct xfs_mount *mp = bp->b_target->bt_mount;
716 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
717 unsigned int level;
718 int lblock_ok; /* block passes checks */
719
720 /* magic number and level verification.
721 *
722 * We don't know what fork we belong to, so just verify that the level
723 * is less than the maximum of the two. Later checks will be more
724 * precise.
725 */
726 level = be16_to_cpu(block->bb_level);
727 lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
728 level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
729
730 /* numrecs verification */
731 lblock_ok = lblock_ok &&
732 be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
733
734 /* sibling pointer verification */
735 lblock_ok = lblock_ok &&
736 block->bb_u.l.bb_leftsib &&
737 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
738 XFS_FSB_SANITY_CHECK(mp,
739 be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
740 block->bb_u.l.bb_rightsib &&
741 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
742 XFS_FSB_SANITY_CHECK(mp,
743 be64_to_cpu(block->bb_u.l.bb_rightsib)));
744
745 if (!lblock_ok) {
746 trace_xfs_btree_corrupt(bp, _RET_IP_);
747 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
748 xfs_buf_ioerror(bp, EFSCORRUPTED);
749 }
750}
751
752static void
753xfs_bmbt_read_verify(
754 struct xfs_buf *bp)
755{
756 xfs_bmbt_verify(bp);
757}
758
759static void
760xfs_bmbt_write_verify(
761 struct xfs_buf *bp)
762{
763 xfs_bmbt_verify(bp);
764}
765
766const struct xfs_buf_ops xfs_bmbt_buf_ops = {
767 .verify_read = xfs_bmbt_read_verify,
768 .verify_write = xfs_bmbt_write_verify,
769};
770
771
710#ifdef DEBUG 772#ifdef DEBUG
711STATIC int 773STATIC int
712xfs_bmbt_keys_inorder( 774xfs_bmbt_keys_inorder(
@@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
746 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, 808 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
747 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, 809 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
748 .key_diff = xfs_bmbt_key_diff, 810 .key_diff = xfs_bmbt_key_diff,
811 .buf_ops = &xfs_bmbt_buf_ops,
749#ifdef DEBUG 812#ifdef DEBUG
750 .keys_inorder = xfs_bmbt_keys_inorder, 813 .keys_inorder = xfs_bmbt_keys_inorder,
751 .recs_inorder = xfs_bmbt_recs_inorder, 814 .recs_inorder = xfs_bmbt_recs_inorder,
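Storing the verifier table in xfs_btree_ops (the new .buf_ops member above) is what lets the generic btree code thread the right verifier through reads and readahead without knowing which btree it is walking; the cur->bc_ops->buf_ops plumbing in xfs_btree.c below depends on it. A hedged sketch of the consumer side, with an illustrative demo_* wrapper around the real call:

	/* Generic btree code: the cursor carries the per-type verifier. */
	static void
	demo_reada_left_sibling(
		struct xfs_btree_cur	*cur,
		xfs_dfsbno_t		left)
	{
		if (left != NULLDFSBNO)
			xfs_btree_reada_bufl(cur->bc_mp, left, 1,
					     cur->bc_ops->buf_ops);
	}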
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e66c4ea0f85..88469ca08696 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
236extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, 236extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
237 struct xfs_trans *, struct xfs_inode *, int); 237 struct xfs_trans *, struct xfs_inode *, int);
238 238
239extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
239 240
240#endif /* __XFS_BMAP_BTREE_H__ */ 241#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e53e317b1582..db010408d701 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -266,9 +266,13 @@ xfs_btree_dup_cursor(
266 for (i = 0; i < new->bc_nlevels; i++) { 266 for (i = 0; i < new->bc_nlevels; i++) {
267 new->bc_ptrs[i] = cur->bc_ptrs[i]; 267 new->bc_ptrs[i] = cur->bc_ptrs[i];
268 new->bc_ra[i] = cur->bc_ra[i]; 268 new->bc_ra[i] = cur->bc_ra[i];
269 if ((bp = cur->bc_bufs[i])) { 269 bp = cur->bc_bufs[i];
270 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 270 if (bp) {
271 XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { 271 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
272 XFS_BUF_ADDR(bp), mp->m_bsize,
273 0, &bp,
274 cur->bc_ops->buf_ops);
275 if (error) {
272 xfs_btree_del_cursor(new, error); 276 xfs_btree_del_cursor(new, error);
273 *ncur = NULL; 277 *ncur = NULL;
274 return error; 278 return error;
@@ -609,25 +613,26 @@ xfs_btree_offsets(
609 * Get a buffer for the block, return it read in. 613 * Get a buffer for the block, return it read in.
610 * Long-form addressing. 614 * Long-form addressing.
611 */ 615 */
612int /* error */ 616int
613xfs_btree_read_bufl( 617xfs_btree_read_bufl(
614 xfs_mount_t *mp, /* file system mount point */ 618 struct xfs_mount *mp, /* file system mount point */
615 xfs_trans_t *tp, /* transaction pointer */ 619 struct xfs_trans *tp, /* transaction pointer */
616 xfs_fsblock_t fsbno, /* file system block number */ 620 xfs_fsblock_t fsbno, /* file system block number */
617 uint lock, /* lock flags for read_buf */ 621 uint lock, /* lock flags for read_buf */
618 xfs_buf_t **bpp, /* buffer for fsbno */ 622 struct xfs_buf **bpp, /* buffer for fsbno */
619 int refval) /* ref count value for buffer */ 623 int refval, /* ref count value for buffer */
620{ 624 const struct xfs_buf_ops *ops)
621 xfs_buf_t *bp; /* return value */ 625{
626 struct xfs_buf *bp; /* return value */
622 xfs_daddr_t d; /* real disk block address */ 627 xfs_daddr_t d; /* real disk block address */
623 int error; 628 int error;
624 629
625 ASSERT(fsbno != NULLFSBLOCK); 630 ASSERT(fsbno != NULLFSBLOCK);
626 d = XFS_FSB_TO_DADDR(mp, fsbno); 631 d = XFS_FSB_TO_DADDR(mp, fsbno);
627 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 632 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
628 mp->m_bsize, lock, &bp))) { 633 mp->m_bsize, lock, &bp, ops);
634 if (error)
629 return error; 635 return error;
630 }
631 ASSERT(!xfs_buf_geterror(bp)); 636 ASSERT(!xfs_buf_geterror(bp));
632 if (bp) 637 if (bp)
633 xfs_buf_set_ref(bp, refval); 638 xfs_buf_set_ref(bp, refval);
@@ -642,15 +647,16 @@ xfs_btree_read_bufl(
642/* ARGSUSED */ 647/* ARGSUSED */
643void 648void
644xfs_btree_reada_bufl( 649xfs_btree_reada_bufl(
645 xfs_mount_t *mp, /* file system mount point */ 650 struct xfs_mount *mp, /* file system mount point */
646 xfs_fsblock_t fsbno, /* file system block number */ 651 xfs_fsblock_t fsbno, /* file system block number */
647 xfs_extlen_t count) /* count of filesystem blocks */ 652 xfs_extlen_t count, /* count of filesystem blocks */
653 const struct xfs_buf_ops *ops)
648{ 654{
649 xfs_daddr_t d; 655 xfs_daddr_t d;
650 656
651 ASSERT(fsbno != NULLFSBLOCK); 657 ASSERT(fsbno != NULLFSBLOCK);
652 d = XFS_FSB_TO_DADDR(mp, fsbno); 658 d = XFS_FSB_TO_DADDR(mp, fsbno);
653 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); 659 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
654} 660}
655 661
656/* 662/*
@@ -660,17 +666,18 @@ xfs_btree_reada_bufl(
660/* ARGSUSED */ 666/* ARGSUSED */
661void 667void
662xfs_btree_reada_bufs( 668xfs_btree_reada_bufs(
663 xfs_mount_t *mp, /* file system mount point */ 669 struct xfs_mount *mp, /* file system mount point */
664 xfs_agnumber_t agno, /* allocation group number */ 670 xfs_agnumber_t agno, /* allocation group number */
665 xfs_agblock_t agbno, /* allocation group block number */ 671 xfs_agblock_t agbno, /* allocation group block number */
666 xfs_extlen_t count) /* count of filesystem blocks */ 672 xfs_extlen_t count, /* count of filesystem blocks */
673 const struct xfs_buf_ops *ops)
667{ 674{
668 xfs_daddr_t d; 675 xfs_daddr_t d;
669 676
670 ASSERT(agno != NULLAGNUMBER); 677 ASSERT(agno != NULLAGNUMBER);
671 ASSERT(agbno != NULLAGBLOCK); 678 ASSERT(agbno != NULLAGBLOCK);
672 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 679 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
673 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); 680 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
674} 681}
675 682
676STATIC int 683STATIC int
@@ -684,12 +691,14 @@ xfs_btree_readahead_lblock(
684 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 691 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
685 692
686 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { 693 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
687 xfs_btree_reada_bufl(cur->bc_mp, left, 1); 694 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
695 cur->bc_ops->buf_ops);
688 rval++; 696 rval++;
689 } 697 }
690 698
691 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { 699 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
692 xfs_btree_reada_bufl(cur->bc_mp, right, 1); 700 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
701 cur->bc_ops->buf_ops);
693 rval++; 702 rval++;
694 } 703 }
695 704
@@ -709,13 +718,13 @@ xfs_btree_readahead_sblock(
709 718
710 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { 719 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
711 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 720 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
712 left, 1); 721 left, 1, cur->bc_ops->buf_ops);
713 rval++; 722 rval++;
714 } 723 }
715 724
716 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { 725 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
717 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 726 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
718 right, 1); 727 right, 1, cur->bc_ops->buf_ops);
719 rval++; 728 rval++;
720 } 729 }
721 730
@@ -853,18 +862,22 @@ xfs_btree_set_sibling(
853 } 862 }
854} 863}
855 864
856STATIC void 865void
857xfs_btree_init_block( 866xfs_btree_init_block(
858 struct xfs_btree_cur *cur, 867 struct xfs_mount *mp,
859 int level, 868 struct xfs_buf *bp,
860 int numrecs, 869 __u32 magic,
861 struct xfs_btree_block *new) /* new block */ 870 __u16 level,
871 __u16 numrecs,
872 unsigned int flags)
862{ 873{
863 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); 874 struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp);
875
876 new->bb_magic = cpu_to_be32(magic);
864 new->bb_level = cpu_to_be16(level); 877 new->bb_level = cpu_to_be16(level);
865 new->bb_numrecs = cpu_to_be16(numrecs); 878 new->bb_numrecs = cpu_to_be16(numrecs);
866 879
867 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 880 if (flags & XFS_BTREE_LONG_PTRS) {
868 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); 881 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
869 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); 882 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
870 } else { 883 } else {
@@ -873,6 +886,17 @@ xfs_btree_init_block(
873 } 886 }
874} 887}
875 888
889STATIC void
890xfs_btree_init_block_cur(
891 struct xfs_btree_cur *cur,
892 int level,
893 int numrecs,
894 struct xfs_buf *bp)
895{
896 xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
897 level, numrecs, cur->bc_flags);
898}
899
876/* 900/*
877 * Return true if ptr is the last record in the btree and 901 * Return true if ptr is the last record in the btree and
878 * we need to track updates to this record. The decision 902
@@ -972,6 +996,7 @@ xfs_btree_get_buf_block(
972 if (!*bpp) 996 if (!*bpp)
973 return ENOMEM; 997 return ENOMEM;
974 998
999 (*bpp)->b_ops = cur->bc_ops->buf_ops;
975 *block = XFS_BUF_TO_BLOCK(*bpp); 1000 *block = XFS_BUF_TO_BLOCK(*bpp);
976 return 0; 1001 return 0;
977} 1002}
@@ -998,19 +1023,15 @@ xfs_btree_read_buf_block(
998 1023
999 d = xfs_btree_ptr_to_daddr(cur, ptr); 1024 d = xfs_btree_ptr_to_daddr(cur, ptr);
1000 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1025 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1001 mp->m_bsize, flags, bpp); 1026 mp->m_bsize, flags, bpp,
1027 cur->bc_ops->buf_ops);
1002 if (error) 1028 if (error)
1003 return error; 1029 return error;
1004 1030
1005 ASSERT(!xfs_buf_geterror(*bpp)); 1031 ASSERT(!xfs_buf_geterror(*bpp));
1006
1007 xfs_btree_set_refs(cur, *bpp); 1032 xfs_btree_set_refs(cur, *bpp);
1008 *block = XFS_BUF_TO_BLOCK(*bpp); 1033 *block = XFS_BUF_TO_BLOCK(*bpp);
1009 1034 return 0;
1010 error = xfs_btree_check_block(cur, *block, level, *bpp);
1011 if (error)
1012 xfs_trans_brelse(cur->bc_tp, *bpp);
1013 return error;
1014} 1035}
1015 1036
1016/* 1037/*
@@ -2183,7 +2204,7 @@ xfs_btree_split(
2183 goto error0; 2204 goto error0;
2184 2205
2185 /* Fill in the btree header for the new right block. */ 2206 /* Fill in the btree header for the new right block. */
2186 xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); 2207 xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
2187 2208
2188 /* 2209 /*
2189 * Split the entries between the old and the new block evenly. 2210 * Split the entries between the old and the new block evenly.
@@ -2492,7 +2513,7 @@ xfs_btree_new_root(
2492 nptr = 2; 2513 nptr = 2;
2493 } 2514 }
2494 /* Fill in the new block's btree header and log it. */ 2515 /* Fill in the new block's btree header and log it. */
2495 xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); 2516 xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
2496 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 2517 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2497 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 2518 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2498 !xfs_btree_ptr_is_null(cur, &rptr)); 2519 !xfs_btree_ptr_is_null(cur, &rptr));
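Factoring xfs_btree_init_block() away from the cursor means a fresh block header can now be stamped from nothing but a mount, a buffer, a magic number and the pointer-format flags, while cursor-based callers go through the new xfs_btree_init_block_cur() wrapper. A hedged usage sketch; the values are illustrative:

	/* Stamp bp as an empty, level-0, long-pointer-format bmbt block. */
	xfs_btree_init_block(mp, bp, XFS_BMAP_MAGIC, 0, 0, XFS_BTREE_LONG_PTRS);

	/* Within xfs_btree.c, cursor-based callers use the wrapper instead: */
	xfs_btree_init_block_cur(cur, level, numrecs, bp);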
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 5b240de104c0..f932897194eb 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -188,6 +188,8 @@ struct xfs_btree_ops {
188 __int64_t (*key_diff)(struct xfs_btree_cur *cur, 188 __int64_t (*key_diff)(struct xfs_btree_cur *cur,
189 union xfs_btree_key *key); 189 union xfs_btree_key *key);
190 190
191 const struct xfs_buf_ops *buf_ops;
192
191#ifdef DEBUG 193#ifdef DEBUG
192 /* check that k1 is lower than k2 */ 194 /* check that k1 is lower than k2 */
193 int (*keys_inorder)(struct xfs_btree_cur *cur, 195 int (*keys_inorder)(struct xfs_btree_cur *cur,
@@ -355,7 +357,8 @@ xfs_btree_read_bufl(
355 xfs_fsblock_t fsbno, /* file system block number */ 357 xfs_fsblock_t fsbno, /* file system block number */
356 uint lock, /* lock flags for read_buf */ 358 uint lock, /* lock flags for read_buf */
357 struct xfs_buf **bpp, /* buffer for fsbno */ 359 struct xfs_buf **bpp, /* buffer for fsbno */
358 int refval);/* ref count value for buffer */ 360 int refval, /* ref count value for buffer */
361 const struct xfs_buf_ops *ops);
359 362
360/* 363/*
361 * Read-ahead the block, don't wait for it, don't return a buffer. 364 * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -365,7 +368,8 @@ void /* error */
365xfs_btree_reada_bufl( 368xfs_btree_reada_bufl(
366 struct xfs_mount *mp, /* file system mount point */ 369 struct xfs_mount *mp, /* file system mount point */
367 xfs_fsblock_t fsbno, /* file system block number */ 370 xfs_fsblock_t fsbno, /* file system block number */
368 xfs_extlen_t count); /* count of filesystem blocks */ 371 xfs_extlen_t count, /* count of filesystem blocks */
372 const struct xfs_buf_ops *ops);
369 373
370/* 374/*
371 * Read-ahead the block, don't wait for it, don't return a buffer. 375 * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -376,8 +380,20 @@ xfs_btree_reada_bufs(
376 struct xfs_mount *mp, /* file system mount point */ 380 struct xfs_mount *mp, /* file system mount point */
377 xfs_agnumber_t agno, /* allocation group number */ 381 xfs_agnumber_t agno, /* allocation group number */
378 xfs_agblock_t agbno, /* allocation group block number */ 382 xfs_agblock_t agbno, /* allocation group block number */
379 xfs_extlen_t count); /* count of filesystem blocks */ 383 xfs_extlen_t count, /* count of filesystem blocks */
384 const struct xfs_buf_ops *ops);
380 385
386/*
387 * Initialise a new btree block header
388 */
389void
390xfs_btree_init_block(
391 struct xfs_mount *mp,
392 struct xfs_buf *bp,
393 __u32 magic,
394 __u16 level,
395 __u16 numrecs,
396 unsigned int flags);
381 397
382/* 398/*
383 * Common btree core entry points. 399 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 933b7930b863..26673a0b20e7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -569,7 +569,9 @@ found:
569 */ 569 */
570 if (bp->b_flags & XBF_STALE) { 570 if (bp->b_flags & XBF_STALE) {
571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
572 ASSERT(bp->b_iodone == NULL);
572 bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 573 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
574 bp->b_ops = NULL;
573 } 575 }
574 576
575 trace_xfs_buf_find(bp, flags, _RET_IP_); 577 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -654,7 +656,8 @@ xfs_buf_read_map(
654 struct xfs_buftarg *target, 656 struct xfs_buftarg *target,
655 struct xfs_buf_map *map, 657 struct xfs_buf_map *map,
656 int nmaps, 658 int nmaps,
657 xfs_buf_flags_t flags) 659 xfs_buf_flags_t flags,
660 const struct xfs_buf_ops *ops)
658{ 661{
659 struct xfs_buf *bp; 662 struct xfs_buf *bp;
660 663
@@ -666,6 +669,7 @@ xfs_buf_read_map(
666 669
667 if (!XFS_BUF_ISDONE(bp)) { 670 if (!XFS_BUF_ISDONE(bp)) {
668 XFS_STATS_INC(xb_get_read); 671 XFS_STATS_INC(xb_get_read);
672 bp->b_ops = ops;
669 _xfs_buf_read(bp, flags); 673 _xfs_buf_read(bp, flags);
670 } else if (flags & XBF_ASYNC) { 674 } else if (flags & XBF_ASYNC) {
671 /* 675 /*
@@ -691,13 +695,14 @@ void
691xfs_buf_readahead_map( 695xfs_buf_readahead_map(
692 struct xfs_buftarg *target, 696 struct xfs_buftarg *target,
693 struct xfs_buf_map *map, 697 struct xfs_buf_map *map,
694 int nmaps) 698 int nmaps,
699 const struct xfs_buf_ops *ops)
695{ 700{
696 if (bdi_read_congested(target->bt_bdi)) 701 if (bdi_read_congested(target->bt_bdi))
697 return; 702 return;
698 703
699 xfs_buf_read_map(target, map, nmaps, 704 xfs_buf_read_map(target, map, nmaps,
700 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 705 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
701} 706}
702 707
703/* 708/*
@@ -709,10 +714,10 @@ xfs_buf_read_uncached(
709 struct xfs_buftarg *target, 714 struct xfs_buftarg *target,
710 xfs_daddr_t daddr, 715 xfs_daddr_t daddr,
711 size_t numblks, 716 size_t numblks,
712 int flags) 717 int flags,
718 const struct xfs_buf_ops *ops)
713{ 719{
714 xfs_buf_t *bp; 720 struct xfs_buf *bp;
715 int error;
716 721
717 bp = xfs_buf_get_uncached(target, numblks, flags); 722 bp = xfs_buf_get_uncached(target, numblks, flags);
718 if (!bp) 723 if (!bp)
@@ -723,13 +728,10 @@ xfs_buf_read_uncached(
723 bp->b_bn = daddr; 728 bp->b_bn = daddr;
724 bp->b_maps[0].bm_bn = daddr; 729 bp->b_maps[0].bm_bn = daddr;
725 bp->b_flags |= XBF_READ; 730 bp->b_flags |= XBF_READ;
731 bp->b_ops = ops;
726 732
727 xfsbdstrat(target->bt_mount, bp); 733 xfsbdstrat(target->bt_mount, bp);
728 error = xfs_buf_iowait(bp); 734 xfs_buf_iowait(bp);
729 if (error) {
730 xfs_buf_relse(bp);
731 return NULL;
732 }
733 return bp; 735 return bp;
734} 736}
735 737
@@ -999,27 +1001,37 @@ STATIC void
999xfs_buf_iodone_work( 1001xfs_buf_iodone_work(
1000 struct work_struct *work) 1002 struct work_struct *work)
1001{ 1003{
1002 xfs_buf_t *bp = 1004 struct xfs_buf *bp =
1003 container_of(work, xfs_buf_t, b_iodone_work); 1005 container_of(work, xfs_buf_t, b_iodone_work);
1006 bool read = !!(bp->b_flags & XBF_READ);
1007
1008 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1009 if (read && bp->b_ops)
1010 bp->b_ops->verify_read(bp);
1004 1011
1005 if (bp->b_iodone) 1012 if (bp->b_iodone)
1006 (*(bp->b_iodone))(bp); 1013 (*(bp->b_iodone))(bp);
1007 else if (bp->b_flags & XBF_ASYNC) 1014 else if (bp->b_flags & XBF_ASYNC)
1008 xfs_buf_relse(bp); 1015 xfs_buf_relse(bp);
1016 else {
1017 ASSERT(read && bp->b_ops);
1018 complete(&bp->b_iowait);
1019 }
1009} 1020}
1010 1021
1011void 1022void
1012xfs_buf_ioend( 1023xfs_buf_ioend(
1013 xfs_buf_t *bp, 1024 struct xfs_buf *bp,
1014 int schedule) 1025 int schedule)
1015{ 1026{
1027 bool read = !!(bp->b_flags & XBF_READ);
1028
1016 trace_xfs_buf_iodone(bp, _RET_IP_); 1029 trace_xfs_buf_iodone(bp, _RET_IP_);
1017 1030
1018 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1019 if (bp->b_error == 0) 1031 if (bp->b_error == 0)
1020 bp->b_flags |= XBF_DONE; 1032 bp->b_flags |= XBF_DONE;
1021 1033
1022 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 1034 if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
1023 if (schedule) { 1035 if (schedule) {
1024 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 1036 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1025 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1037 queue_work(xfslogd_workqueue, &bp->b_iodone_work);
@@ -1027,6 +1039,7 @@ xfs_buf_ioend(
1027 xfs_buf_iodone_work(&bp->b_iodone_work); 1039 xfs_buf_iodone_work(&bp->b_iodone_work);
1028 } 1040 }
1029 } else { 1041 } else {
1042 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1030 complete(&bp->b_iowait); 1043 complete(&bp->b_iowait);
1031 } 1044 }
1032} 1045}
@@ -1197,9 +1210,14 @@ xfs_buf_bio_end_io(
1197{ 1210{
1198 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1211 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1199 1212
1200 xfs_buf_ioerror(bp, -error); 1213 /*
1214 * don't overwrite existing errors - otherwise we can lose errors on
1215 * buffers that require multiple bios to complete.
1216 */
1217 if (!bp->b_error)
1218 xfs_buf_ioerror(bp, -error);
1201 1219
1202 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1220 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1203 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1221 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1204 1222
1205 _xfs_buf_ioend(bp, 1); 1223 _xfs_buf_ioend(bp, 1);
@@ -1279,6 +1297,11 @@ next_chunk:
1279 if (size) 1297 if (size)
1280 goto next_chunk; 1298 goto next_chunk;
1281 } else { 1299 } else {
1300 /*
1301 * This is guaranteed not to be the last io reference count
1302 * because the caller (xfs_buf_iorequest) holds a count itself.
1303 */
1304 atomic_dec(&bp->b_io_remaining);
1282 xfs_buf_ioerror(bp, EIO); 1305 xfs_buf_ioerror(bp, EIO);
1283 bio_put(bio); 1306 bio_put(bio);
1284 } 1307 }
@@ -1304,6 +1327,20 @@ _xfs_buf_ioapply(
1304 rw |= REQ_FUA; 1327 rw |= REQ_FUA;
1305 if (bp->b_flags & XBF_FLUSH) 1328 if (bp->b_flags & XBF_FLUSH)
1306 rw |= REQ_FLUSH; 1329 rw |= REQ_FLUSH;
1330
1331 /*
1332 * Run the write verifier callback function if it exists. If
1333 * this function fails it will mark the buffer with an error and
1334 * the IO should not be dispatched.
1335 */
1336 if (bp->b_ops) {
1337 bp->b_ops->verify_write(bp);
1338 if (bp->b_error) {
1339 xfs_force_shutdown(bp->b_target->bt_mount,
1340 SHUTDOWN_CORRUPT_INCORE);
1341 return;
1342 }
1343 }
1307 } else if (bp->b_flags & XBF_READ_AHEAD) { 1344 } else if (bp->b_flags & XBF_READ_AHEAD) {
1308 rw = READA; 1345 rw = READA;
1309 } else { 1346 } else {
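With these hooks the corruption checks move into the I/O path itself: verify_write can veto a submission (the filesystem shuts down rather than writing a bad block to disk), and verify_read runs during ioend processing, so a corrupt block surfaces as bp->b_error before any caller touches the data. A hedged caller-side sketch against the new xfs_buf_read() signature; demo_buf_ops stands in for any xfs_buf_ops instance, e.g. xfs_attr_leaf_buf_ops:

	bp = xfs_buf_read(target, blkno, numblks, 0, &demo_buf_ops);
	if (!bp)
		return ENOMEM;
	if (bp->b_error) {	/* e.g. EFSCORRUPTED set by verify_read */
		error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}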
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 7c0b6a0a1557..23f5642480bb 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -100,6 +100,7 @@ typedef struct xfs_buftarg {
100struct xfs_buf; 100struct xfs_buf;
101typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 101typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
102 102
103
103#define XB_PAGES 2 104#define XB_PAGES 2
104 105
105struct xfs_buf_map { 106struct xfs_buf_map {
@@ -110,6 +111,11 @@ struct xfs_buf_map {
110#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ 111#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
111 struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; 112 struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
112 113
114struct xfs_buf_ops {
115 void (*verify_read)(struct xfs_buf *);
116 void (*verify_write)(struct xfs_buf *);
117};
118
113typedef struct xfs_buf { 119typedef struct xfs_buf {
114 /* 120 /*
115 * first cacheline holds all the fields needed for an uncontended cache 121 * first cacheline holds all the fields needed for an uncontended cache
@@ -153,13 +159,13 @@ typedef struct xfs_buf {
153 unsigned int b_page_count; /* size of page array */ 159 unsigned int b_page_count; /* size of page array */
154 unsigned int b_offset; /* page offset in first page */ 160 unsigned int b_offset; /* page offset in first page */
155 unsigned short b_error; /* error code on I/O */ 161 unsigned short b_error; /* error code on I/O */
162 const struct xfs_buf_ops *b_ops;
156 163
157#ifdef XFS_BUF_LOCK_TRACKING 164#ifdef XFS_BUF_LOCK_TRACKING
158 int b_last_holder; 165 int b_last_holder;
159#endif 166#endif
160} xfs_buf_t; 167} xfs_buf_t;
161 168
162
163/* Finding and Reading Buffers */ 169/* Finding and Reading Buffers */
164struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, 170struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
165 struct xfs_buf_map *map, int nmaps, 171 struct xfs_buf_map *map, int nmaps,
@@ -196,9 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
196 xfs_buf_flags_t flags); 202 xfs_buf_flags_t flags);
197struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, 203struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
198 struct xfs_buf_map *map, int nmaps, 204 struct xfs_buf_map *map, int nmaps,
199 xfs_buf_flags_t flags); 205 xfs_buf_flags_t flags,
206 const struct xfs_buf_ops *ops);
200void xfs_buf_readahead_map(struct xfs_buftarg *target, 207void xfs_buf_readahead_map(struct xfs_buftarg *target,
201 struct xfs_buf_map *map, int nmaps); 208 struct xfs_buf_map *map, int nmaps,
209 const struct xfs_buf_ops *ops);
202 210
203static inline struct xfs_buf * 211static inline struct xfs_buf *
204xfs_buf_get( 212xfs_buf_get(
@@ -216,20 +224,22 @@ xfs_buf_read(
216 struct xfs_buftarg *target, 224 struct xfs_buftarg *target,
217 xfs_daddr_t blkno, 225 xfs_daddr_t blkno,
218 size_t numblks, 226 size_t numblks,
219 xfs_buf_flags_t flags) 227 xfs_buf_flags_t flags,
228 const struct xfs_buf_ops *ops)
220{ 229{
221 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 230 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
222 return xfs_buf_read_map(target, &map, 1, flags); 231 return xfs_buf_read_map(target, &map, 1, flags, ops);
223} 232}
224 233
225static inline void 234static inline void
226xfs_buf_readahead( 235xfs_buf_readahead(
227 struct xfs_buftarg *target, 236 struct xfs_buftarg *target,
228 xfs_daddr_t blkno, 237 xfs_daddr_t blkno,
229 size_t numblks) 238 size_t numblks,
239 const struct xfs_buf_ops *ops)
230{ 240{
231 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 241 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
232 return xfs_buf_readahead_map(target, &map, 1); 242 return xfs_buf_readahead_map(target, &map, 1, ops);
233} 243}
234 244
235struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); 245struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
@@ -239,7 +249,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
239struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, 249struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
240 int flags); 250 int flags);
241struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, 251struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
242 xfs_daddr_t daddr, size_t numblks, int flags); 252 xfs_daddr_t daddr, size_t numblks, int flags,
253 const struct xfs_buf_ops *ops);
243void xfs_buf_hold(struct xfs_buf *bp); 254void xfs_buf_hold(struct xfs_buf *bp);
244 255
245/* Releasing Buffers */ 256/* Releasing Buffers */
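The ops structure above is the hook point for per-type metadata verification. A minimal sketch of how a block type might wire itself up, assuming only the xfs_buf_ops definition from this patch; the example_* names and EXAMPLE_MAGIC are invented here for illustration and are not part of the patch:

#define EXAMPLE_MAGIC	0x58454741	/* hypothetical on-disk magic */

struct example_hdr {
	__be32			magic;
};

static void
example_verify(
	struct xfs_buf		*bp)
{
	struct example_hdr	*hdr = bp->b_addr;

	/* flag corruption; the read/write paths collect the error */
	if (hdr->magic != cpu_to_be32(EXAMPLE_MAGIC))
		xfs_buf_ioerror(bp, EFSCORRUPTED);
}

static const struct xfs_buf_ops example_buf_ops = {
	.verify_read	= example_verify,
	.verify_write	= example_verify,
};

/*
 * The ops are passed in at read time and cached in bp->b_ops, so later
 * writeback runs verify_write() without the caller doing anything:
 *
 *	bp = xfs_buf_read(target, blkno, numblks, 0, &example_buf_ops);
 */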
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a8d0ed911196..becf4a97efc6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -526,7 +526,25 @@ xfs_buf_item_unpin(
526 } 526 }
527 xfs_buf_relse(bp); 527 xfs_buf_relse(bp);
528 } else if (freed && remove) { 528 } else if (freed && remove) {
529 /*
530 * There are currently two references to the buffer - the active
531 * LRU reference and the buf log item. What we are about to do
532 * here - simulate a failed IO completion - requires 3
533 * references.
534 *
535 * The LRU reference is removed by the xfs_buf_stale() call. The
536 * buf item reference is removed by the xfs_buf_iodone()
537 * callback that is run by xfs_buf_do_callbacks() during ioend
538 * processing (via the bp->b_iodone callback), and then finally
539 * the ioend processing will drop the IO reference if the buffer
540 * is marked XBF_ASYNC.
541 *
542 * Hence we need to take an additional reference here so that IO
543 * completion processing doesn't free the buffer prematurely.
544 */
529 xfs_buf_lock(bp); 545 xfs_buf_lock(bp);
546 xfs_buf_hold(bp);
547 bp->b_flags |= XBF_ASYNC;
530 xfs_buf_ioerror(bp, EIO); 548 xfs_buf_ioerror(bp, EIO);
531 XFS_BUF_UNDONE(bp); 549 XFS_BUF_UNDONE(bp);
532 xfs_buf_stale(bp); 550 xfs_buf_stale(bp);
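The reference accounting described in the comment above, condensed (all names are from this patch):

/*
 * Reference       dropped by
 * ---------       ----------
 * LRU             xfs_buf_stale()
 * buf log item    xfs_buf_iodone(), run by xfs_buf_do_callbacks()
 * IO              ioend processing, once XBF_ASYNC is set
 *
 * The xfs_buf_hold() added above supplies the third reference, so the
 * simulated failed I/O completion cannot free the buffer early.
 */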
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
new file mode 100644
index 000000000000..fad1676ad8cd
--- /dev/null
+++ b/fs/xfs/xfs_cksum.h
@@ -0,0 +1,63 @@
1#ifndef _XFS_CKSUM_H
2#define _XFS_CKSUM_H 1
3
4#define XFS_CRC_SEED (~(__uint32_t)0)
5
6/*
7 * Calculate the intermediate checksum for a buffer that has the CRC field
8 * inside it. The offset of the 32bit crc fields is passed as the
9 * cksum_offset parameter.

10 */
11static inline __uint32_t
12xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
13{
14 __uint32_t zero = 0;
15 __uint32_t crc;
16
17 /* Calculate CRC up to the checksum. */
18 crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
19
20 /* Skip checksum field */
21 crc = crc32c(crc, &zero, sizeof(__u32));
22
23 /* Calculate the rest of the CRC. */
24 return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
25 length - (cksum_offset + sizeof(__be32)));
26}
27
28/*
29 * Convert the intermediate checksum to the final ondisk format.
30 *
31 * The CRC32c calculation uses LE format even on BE machines, but returns the
32 * result in host endian format. Hence we need to byte swap it back to LE format
33 * so that it is consistent on disk.
34 */
35static inline __le32
36xfs_end_cksum(__uint32_t crc)
37{
38 return ~cpu_to_le32(crc);
39}
40
41/*
42 * Helper to generate the checksum for a buffer.
43 */
44static inline void
45xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
46{
47 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
48
49 *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
50}
51
52/*
53 * Helper to verify the checksum for a buffer.
54 */
55static inline int
56xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
57{
58 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
59
60 return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
61}
62
63#endif /* _XFS_CKSUM_H */
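A short usage sketch for the helpers above, written against a hypothetical on-disk header (the example_* names are not part of the patch). The CRC covers the whole structure with the CRC field itself treated as zero, and is stored in LE format:

struct example_hdr {
	__be32			magic;
	__le32			crc;	/* skipped by the checksum */
	__u8			payload[56];
};

static void
example_seal(
	struct example_hdr	*hdr)
{
	xfs_update_cksum((char *)hdr, sizeof(*hdr),
			 offsetof(struct example_hdr, crc));
}

static int
example_check(
	struct example_hdr	*hdr)
{
	/* non-zero when the stored CRC matches the recomputed one */
	return xfs_verify_cksum((char *)hdr, sizeof(*hdr),
				offsetof(struct example_hdr, crc));
}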
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 7bfb7dd334fc..4d7696a02418 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -91,6 +91,84 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
91 xfs_da_state_blk_t *save_blk); 91 xfs_da_state_blk_t *save_blk);
92STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); 92STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
93 93
94static void
95xfs_da_node_verify(
96 struct xfs_buf *bp)
97{
98 struct xfs_mount *mp = bp->b_target->bt_mount;
99 struct xfs_da_node_hdr *hdr = bp->b_addr;
100 int block_ok = 0;
101
102 block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
103 block_ok = block_ok &&
104 be16_to_cpu(hdr->level) > 0 &&
105 be16_to_cpu(hdr->count) > 0;
106 if (!block_ok) {
107 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
108 xfs_buf_ioerror(bp, EFSCORRUPTED);
109 }
110
111}
112
113static void
114xfs_da_node_write_verify(
115 struct xfs_buf *bp)
116{
117 xfs_da_node_verify(bp);
118}
119
120/*
121 * Leaf/node format detection on trees is sketchy, so a node read can land on
122 * leaf-level blocks when detection incorrectly identifies the tree as node
123 * format. In this case, we need to swap the verifier to match the correct
124 * format of the block being read.
125 */
126static void
127xfs_da_node_read_verify(
128 struct xfs_buf *bp)
129{
130 struct xfs_mount *mp = bp->b_target->bt_mount;
131 struct xfs_da_blkinfo *info = bp->b_addr;
132
133 switch (be16_to_cpu(info->magic)) {
134 case XFS_DA_NODE_MAGIC:
135 xfs_da_node_verify(bp);
136 break;
137 case XFS_ATTR_LEAF_MAGIC:
138 bp->b_ops = &xfs_attr_leaf_buf_ops;
139 bp->b_ops->verify_read(bp);
140 return;
141 case XFS_DIR2_LEAFN_MAGIC:
142 bp->b_ops = &xfs_dir2_leafn_buf_ops;
143 bp->b_ops->verify_read(bp);
144 return;
145 default:
146 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
147 mp, info);
148 xfs_buf_ioerror(bp, EFSCORRUPTED);
149 break;
150 }
151}
152
153const struct xfs_buf_ops xfs_da_node_buf_ops = {
154 .verify_read = xfs_da_node_read_verify,
155 .verify_write = xfs_da_node_write_verify,
156};
157
158
159int
160xfs_da_node_read(
161 struct xfs_trans *tp,
162 struct xfs_inode *dp,
163 xfs_dablk_t bno,
164 xfs_daddr_t mappedbno,
165 struct xfs_buf **bpp,
166 int which_fork)
167{
168 return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
169 which_fork, &xfs_da_node_buf_ops);
170}
171
94/*======================================================================== 172/*========================================================================
95 * Routines used for growing the Btree. 173 * Routines used for growing the Btree.
96 *========================================================================*/ 174 *========================================================================*/
@@ -125,6 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
125 xfs_trans_log_buf(tp, bp, 203 xfs_trans_log_buf(tp, bp,
126 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); 204 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
127 205
206 bp->b_ops = &xfs_da_node_buf_ops;
128 *bpp = bp; 207 *bpp = bp;
129 return(0); 208 return(0);
130} 209}
@@ -324,6 +403,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
324 } 403 }
325 memcpy(node, oldroot, size); 404 memcpy(node, oldroot, size);
326 xfs_trans_log_buf(tp, bp, 0, size - 1); 405 xfs_trans_log_buf(tp, bp, 0, size - 1);
406
407 bp->b_ops = blk1->bp->b_ops;
327 blk1->bp = bp; 408 blk1->bp = bp;
328 blk1->blkno = blkno; 409 blk1->blkno = blkno;
329 410
@@ -746,7 +827,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
746 */ 827 */
747 child = be32_to_cpu(oldroot->btree[0].before); 828 child = be32_to_cpu(oldroot->btree[0].before);
748 ASSERT(child != 0); 829 ASSERT(child != 0);
749 error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, 830 error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
750 args->whichfork); 831 args->whichfork);
751 if (error) 832 if (error)
752 return(error); 833 return(error);
@@ -754,7 +835,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
754 xfs_da_blkinfo_onlychild_validate(bp->b_addr, 835 xfs_da_blkinfo_onlychild_validate(bp->b_addr,
755 be16_to_cpu(oldroot->hdr.level)); 836 be16_to_cpu(oldroot->hdr.level));
756 837
838 /*
839 * This could be copying a leaf back into the root block when only a
840 * single leaf block is left in the tree. Hence we have to update the
841 * b_ops pointer as well to match the buffer type change that could
842 * occur.
843 */
757 memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); 844 memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
845 root_blk->bp->b_ops = bp->b_ops;
758 xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); 846 xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
759 error = xfs_da_shrink_inode(args, child, bp); 847 error = xfs_da_shrink_inode(args, child, bp);
760 return(error); 848 return(error);
@@ -779,6 +867,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
779 xfs_dablk_t blkno; 867 xfs_dablk_t blkno;
780 struct xfs_buf *bp; 868 struct xfs_buf *bp;
781 869
870 trace_xfs_da_node_toosmall(state->args);
871
782 /* 872 /*
783 * Check for the degenerate case of the block being over 50% full. 873 * Check for the degenerate case of the block being over 50% full.
784 * If so, it's not worth even looking to see if we might be able 874 * If so, it's not worth even looking to see if we might be able
@@ -835,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
835 blkno = be32_to_cpu(info->back); 925 blkno = be32_to_cpu(info->back);
836 if (blkno == 0) 926 if (blkno == 0)
837 continue; 927 continue;
838 error = xfs_da_read_buf(state->args->trans, state->args->dp, 928 error = xfs_da_node_read(state->args->trans, state->args->dp,
839 blkno, -1, &bp, state->args->whichfork); 929 blkno, -1, &bp, state->args->whichfork);
840 if (error) 930 if (error)
841 return(error); 931 return(error);
@@ -900,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
900 xfs_dahash_t lasthash=0; 990 xfs_dahash_t lasthash=0;
901 int level, count; 991 int level, count;
902 992
993 trace_xfs_da_fixhashpath(state->args);
994
903 level = path->active-1; 995 level = path->active-1;
904 blk = &path->blk[ level ]; 996 blk = &path->blk[ level ];
905 switch (blk->magic) { 997 switch (blk->magic) {
@@ -1079,7 +1171,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1079 * Read the next node down in the tree. 1171 * Read the next node down in the tree.
1080 */ 1172 */
1081 blk->blkno = blkno; 1173 blk->blkno = blkno;
1082 error = xfs_da_read_buf(args->trans, args->dp, blkno, 1174 error = xfs_da_node_read(args->trans, args->dp, blkno,
1083 -1, &blk->bp, args->whichfork); 1175 -1, &blk->bp, args->whichfork);
1084 if (error) { 1176 if (error) {
1085 blk->blkno = 0; 1177 blk->blkno = 0;
@@ -1241,7 +1333,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1241 new_info->forw = cpu_to_be32(old_blk->blkno); 1333 new_info->forw = cpu_to_be32(old_blk->blkno);
1242 new_info->back = old_info->back; 1334 new_info->back = old_info->back;
1243 if (old_info->back) { 1335 if (old_info->back) {
1244 error = xfs_da_read_buf(args->trans, args->dp, 1336 error = xfs_da_node_read(args->trans, args->dp,
1245 be32_to_cpu(old_info->back), 1337 be32_to_cpu(old_info->back),
1246 -1, &bp, args->whichfork); 1338 -1, &bp, args->whichfork);
1247 if (error) 1339 if (error)
@@ -1262,7 +1354,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1262 new_info->forw = old_info->forw; 1354 new_info->forw = old_info->forw;
1263 new_info->back = cpu_to_be32(old_blk->blkno); 1355 new_info->back = cpu_to_be32(old_blk->blkno);
1264 if (old_info->forw) { 1356 if (old_info->forw) {
1265 error = xfs_da_read_buf(args->trans, args->dp, 1357 error = xfs_da_node_read(args->trans, args->dp,
1266 be32_to_cpu(old_info->forw), 1358 be32_to_cpu(old_info->forw),
1267 -1, &bp, args->whichfork); 1359 -1, &bp, args->whichfork);
1268 if (error) 1360 if (error)
@@ -1362,7 +1454,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1362 trace_xfs_da_unlink_back(args); 1454 trace_xfs_da_unlink_back(args);
1363 save_info->back = drop_info->back; 1455 save_info->back = drop_info->back;
1364 if (drop_info->back) { 1456 if (drop_info->back) {
1365 error = xfs_da_read_buf(args->trans, args->dp, 1457 error = xfs_da_node_read(args->trans, args->dp,
1366 be32_to_cpu(drop_info->back), 1458 be32_to_cpu(drop_info->back),
1367 -1, &bp, args->whichfork); 1459 -1, &bp, args->whichfork);
1368 if (error) 1460 if (error)
@@ -1379,7 +1471,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1379 trace_xfs_da_unlink_forward(args); 1471 trace_xfs_da_unlink_forward(args);
1380 save_info->forw = drop_info->forw; 1472 save_info->forw = drop_info->forw;
1381 if (drop_info->forw) { 1473 if (drop_info->forw) {
1382 error = xfs_da_read_buf(args->trans, args->dp, 1474 error = xfs_da_node_read(args->trans, args->dp,
1383 be32_to_cpu(drop_info->forw), 1475 be32_to_cpu(drop_info->forw),
1384 -1, &bp, args->whichfork); 1476 -1, &bp, args->whichfork);
1385 if (error) 1477 if (error)
@@ -1417,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1417 xfs_dablk_t blkno=0; 1509 xfs_dablk_t blkno=0;
1418 int level, error; 1510 int level, error;
1419 1511
1512 trace_xfs_da_path_shift(state->args);
1513
1420 /* 1514 /*
1421 * Roll up the Btree looking for the first block where our 1515 * Roll up the Btree looking for the first block where our
1422 * current index is not at the edge of the block. Note that 1516 * current index is not at the edge of the block. Note that
@@ -1463,8 +1557,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1463 * Read the next child block. 1557 * Read the next child block.
1464 */ 1558 */
1465 blk->blkno = blkno; 1559 blk->blkno = blkno;
1466 error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, 1560 error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
1467 &blk->bp, args->whichfork); 1561 &blk->bp, args->whichfork);
1468 if (error) 1562 if (error)
1469 return(error); 1563 return(error);
1470 ASSERT(blk->bp != NULL); 1564 ASSERT(blk->bp != NULL);
@@ -1727,7 +1821,8 @@ xfs_da_swap_lastblock(
1727 * Read the last block in the btree space. 1821 * Read the last block in the btree space.
1728 */ 1822 */
1729 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; 1823 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
1730 if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) 1824 error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
1825 if (error)
1731 return error; 1826 return error;
1732 /* 1827 /*
1733 * Copy the last block into the dead buffer and log it. 1828 * Copy the last block into the dead buffer and log it.
@@ -1753,7 +1848,8 @@ xfs_da_swap_lastblock(
1753 * If the moved block has a left sibling, fix up the pointers. 1848 * If the moved block has a left sibling, fix up the pointers.
1754 */ 1849 */
1755 if ((sib_blkno = be32_to_cpu(dead_info->back))) { 1850 if ((sib_blkno = be32_to_cpu(dead_info->back))) {
1756 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) 1851 error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
1852 if (error)
1757 goto done; 1853 goto done;
1758 sib_info = sib_buf->b_addr; 1854 sib_info = sib_buf->b_addr;
1759 if (unlikely( 1855 if (unlikely(
@@ -1774,7 +1870,8 @@ xfs_da_swap_lastblock(
1774 * If the moved block has a right sibling, fix up the pointers. 1870 * If the moved block has a right sibling, fix up the pointers.
1775 */ 1871 */
1776 if ((sib_blkno = be32_to_cpu(dead_info->forw))) { 1872 if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
1777 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) 1873 error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
1874 if (error)
1778 goto done; 1875 goto done;
1779 sib_info = sib_buf->b_addr; 1876 sib_info = sib_buf->b_addr;
1780 if (unlikely( 1877 if (unlikely(
@@ -1797,7 +1894,8 @@ xfs_da_swap_lastblock(
1797 * Walk down the tree looking for the parent of the moved block. 1894 * Walk down the tree looking for the parent of the moved block.
1798 */ 1895 */
1799 for (;;) { 1896 for (;;) {
1800 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) 1897 error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
1898 if (error)
1801 goto done; 1899 goto done;
1802 par_node = par_buf->b_addr; 1900 par_node = par_buf->b_addr;
1803 if (unlikely(par_node->hdr.info.magic != 1901 if (unlikely(par_node->hdr.info.magic !=
@@ -1847,7 +1945,8 @@ xfs_da_swap_lastblock(
1847 error = XFS_ERROR(EFSCORRUPTED); 1945 error = XFS_ERROR(EFSCORRUPTED);
1848 goto done; 1946 goto done;
1849 } 1947 }
1850 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) 1948 error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
1949 if (error)
1851 goto done; 1950 goto done;
1852 par_node = par_buf->b_addr; 1951 par_node = par_buf->b_addr;
1853 if (unlikely( 1952 if (unlikely(
@@ -2133,7 +2232,8 @@ xfs_da_read_buf(
2133 xfs_dablk_t bno, 2232 xfs_dablk_t bno,
2134 xfs_daddr_t mappedbno, 2233 xfs_daddr_t mappedbno,
2135 struct xfs_buf **bpp, 2234 struct xfs_buf **bpp,
2136 int whichfork) 2235 int whichfork,
2236 const struct xfs_buf_ops *ops)
2137{ 2237{
2138 struct xfs_buf *bp; 2238 struct xfs_buf *bp;
2139 struct xfs_buf_map map; 2239 struct xfs_buf_map map;
@@ -2155,7 +2255,7 @@ xfs_da_read_buf(
2155 2255
2156 error = xfs_trans_read_buf_map(dp->i_mount, trans, 2256 error = xfs_trans_read_buf_map(dp->i_mount, trans,
2157 dp->i_mount->m_ddev_targp, 2257 dp->i_mount->m_ddev_targp,
2158 mapp, nmap, 0, &bp); 2258 mapp, nmap, 0, &bp, ops);
2159 if (error) 2259 if (error)
2160 goto out_free; 2260 goto out_free;
2161 2261
@@ -2211,9 +2311,10 @@ xfs_da_reada_buf(
2211 struct xfs_trans *trans, 2311 struct xfs_trans *trans,
2212 struct xfs_inode *dp, 2312 struct xfs_inode *dp,
2213 xfs_dablk_t bno, 2313 xfs_dablk_t bno,
2214 int whichfork) 2314 xfs_daddr_t mappedbno,
2315 int whichfork,
2316 const struct xfs_buf_ops *ops)
2215{ 2317{
2216 xfs_daddr_t mappedbno = -1;
2217 struct xfs_buf_map map; 2318 struct xfs_buf_map map;
2218 struct xfs_buf_map *mapp; 2319 struct xfs_buf_map *mapp;
2219 int nmap; 2320 int nmap;
@@ -2221,7 +2322,7 @@ xfs_da_reada_buf(
2221 2322
2222 mapp = &map; 2323 mapp = &map;
2223 nmap = 1; 2324 nmap = 1;
2224 error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, 2325 error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
2225 &mapp, &nmap); 2326 &mapp, &nmap);
2226 if (error) { 2327 if (error) {
2227 /* mapping a hole is not an error, but we don't continue */ 2328 /* mapping a hole is not an error, but we don't continue */
@@ -2231,7 +2332,7 @@ xfs_da_reada_buf(
2231 } 2332 }
2232 2333
2233 mappedbno = mapp[0].bm_bn; 2334 mappedbno = mapp[0].bm_bn;
2234 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); 2335 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
2235 2336
2236out_free: 2337out_free:
2237 if (mapp != &map) 2338 if (mapp != &map)
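With the verifier threaded through the readahead path, callers now name the expected block format up front. A hedged example of the new calling convention (mirroring the wrappers added elsewhere in this series); passing -1 as mappedbno asks xfs_da_reada_buf() to map the dablk itself:

	/* kick readahead of a da btree block whose daddr is unknown */
	xfs_da_reada_buf(NULL, dp, bno, -1, XFS_DATA_FORK,
			 &xfs_da_node_buf_ops);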
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 132adafb041e..ee5170c46ae1 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_DA_BTREE_H__ 18#ifndef __XFS_DA_BTREE_H__
19#define __XFS_DA_BTREE_H__ 19#define __XFS_DA_BTREE_H__
20 20
21struct xfs_buf;
22struct xfs_bmap_free; 21struct xfs_bmap_free;
23struct xfs_inode; 22struct xfs_inode;
24struct xfs_mount; 23struct xfs_mount;
@@ -214,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
214 */ 213 */
215int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, 214int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
216 xfs_da_state_blk_t *new_blk); 215 xfs_da_state_blk_t *new_blk);
216int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
217 xfs_dablk_t bno, xfs_daddr_t mappedbno,
218 struct xfs_buf **bpp, int which_fork);
217 219
218/* 220/*
219 * Utility routines. 221 * Utility routines.
@@ -226,9 +228,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
226 struct xfs_buf **bp, int whichfork); 228 struct xfs_buf **bp, int whichfork);
227int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, 229int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
228 xfs_dablk_t bno, xfs_daddr_t mappedbno, 230 xfs_dablk_t bno, xfs_daddr_t mappedbno,
229 struct xfs_buf **bpp, int whichfork); 231 struct xfs_buf **bpp, int whichfork,
232 const struct xfs_buf_ops *ops);
230xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, 233xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
231 xfs_dablk_t bno, int whichfork); 234 xfs_dablk_t bno, xfs_daddr_t mapped_bno,
235 int whichfork, const struct xfs_buf_ops *ops);
232int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, 236int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
233 struct xfs_buf *dead_buf); 237 struct xfs_buf *dead_buf);
234 238
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b9b8646e62db..d0e9c74d3d96 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -246,12 +246,10 @@ xfs_swap_extents(
246 goto out_unlock; 246 goto out_unlock;
247 } 247 }
248 248
249 if (VN_CACHED(VFS_I(tip)) != 0) { 249 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
250 error = xfs_flushinval_pages(tip, 0, -1, 250 if (error)
251 FI_REMAPF_LOCKED); 251 goto out_unlock;
252 if (error) 252 truncate_pagecache_range(VFS_I(ip), 0, -1);
253 goto out_unlock;
254 }
255 253
256 /* Verify O_DIRECT for ftmp */ 254 /* Verify O_DIRECT for ftmp */
257 if (VN_CACHED(VFS_I(tip)) != 0) { 255 if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -315,8 +313,7 @@ xfs_swap_extents(
315 * are safe. We don't really care if non-io related 313 * are safe. We don't really care if non-io related
316 * fields change. 314 * fields change.
317 */ 315 */
318 316 truncate_pagecache_range(VFS_I(ip), 0, -1);
319 xfs_tosspages(ip, 0, -1, FI_REMAPF);
320 317
321 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); 318 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
322 if ((error = xfs_trans_reserve(tp, 0, 319 if ((error = xfs_trans_reserve(tp, 0,
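The swap-extents path now relies on the generic page cache helpers instead of the old XFS-private flush calls. The pattern, sketched for a generic struct inode *inode; the negation is there because XFS at this point still uses positive error numbers internally while the VFS helper returns negative ones:

	/* write back all dirty pages, then toss the cached range */
	error = -filemap_write_and_wait(inode->i_mapping);
	if (error)
		return error;
	truncate_pagecache_range(inode, 0, -1);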
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e93ca8f054f4..7536faaa61e7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -56,6 +56,214 @@ xfs_dir_startup(void)
56 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); 56 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
57} 57}
58 58
59static void
60xfs_dir2_block_verify(
61 struct xfs_buf *bp)
62{
63 struct xfs_mount *mp = bp->b_target->bt_mount;
64 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
65 int block_ok = 0;
66
67 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
68 block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
69
70 if (!block_ok) {
71 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
72 xfs_buf_ioerror(bp, EFSCORRUPTED);
73 }
74}
75
76static void
77xfs_dir2_block_read_verify(
78 struct xfs_buf *bp)
79{
80 xfs_dir2_block_verify(bp);
81}
82
83static void
84xfs_dir2_block_write_verify(
85 struct xfs_buf *bp)
86{
87 xfs_dir2_block_verify(bp);
88}
89
90const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
91 .verify_read = xfs_dir2_block_read_verify,
92 .verify_write = xfs_dir2_block_write_verify,
93};
94
95static int
96xfs_dir2_block_read(
97 struct xfs_trans *tp,
98 struct xfs_inode *dp,
99 struct xfs_buf **bpp)
100{
101 struct xfs_mount *mp = dp->i_mount;
102
103 return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
104 XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
105}
106
107static void
108xfs_dir2_block_need_space(
109 struct xfs_dir2_data_hdr *hdr,
110 struct xfs_dir2_block_tail *btp,
111 struct xfs_dir2_leaf_entry *blp,
112 __be16 **tagpp,
113 struct xfs_dir2_data_unused **dupp,
114 struct xfs_dir2_data_unused **enddupp,
115 int *compact,
116 int len)
117{
118 struct xfs_dir2_data_free *bf;
119 __be16 *tagp = NULL;
120 struct xfs_dir2_data_unused *dup = NULL;
121 struct xfs_dir2_data_unused *enddup = NULL;
122
123 *compact = 0;
124 bf = hdr->bestfree;
125
126 /*
127 * If there are stale entries we'll use one for the leaf.
128 */
129 if (btp->stale) {
130 if (be16_to_cpu(bf[0].length) >= len) {
131 /*
132 * The biggest entry is enough to avoid compaction.
133 */
134 dup = (xfs_dir2_data_unused_t *)
135 ((char *)hdr + be16_to_cpu(bf[0].offset));
136 goto out;
137 }
138
139 /*
140 * Will need to compact to make this work.
141 * Tag just before the first leaf entry.
142 */
143 *compact = 1;
144 tagp = (__be16 *)blp - 1;
145
146 /* Data object just before the first leaf entry. */
147 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
148
149 /*
150 * If it's not free then the data will go where the
151 * leaf data starts now, if it works at all.
152 */
153 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
154 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
155 (uint)sizeof(*blp) < len)
156 dup = NULL;
157 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
158 dup = NULL;
159 else
160 dup = (xfs_dir2_data_unused_t *)blp;
161 goto out;
162 }
163
164 /*
165 * No stale entries, so just use free space.
166 * Tag just before the first leaf entry.
167 */
168 tagp = (__be16 *)blp - 1;
169
170 /* Data object just before the first leaf entry. */
171 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
172
173 /*
174 * If it's not free then we can't do this add without cleaning up:
175 * the space before the first leaf entry needs to be free so it
176 * can be expanded to hold the pointer to the new entry.
177 */
178 if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
179 /*
180 * Check out the biggest freespace and see if it's the same one.
181 */
182 dup = (xfs_dir2_data_unused_t *)
183 ((char *)hdr + be16_to_cpu(bf[0].offset));
184 if (dup != enddup) {
185 /*
186 * Not the same free entry, just check its length.
187 */
188 if (be16_to_cpu(dup->length) < len)
189 dup = NULL;
190 goto out;
191 }
192
193 /*
194 * It is the biggest freespace, but is it too small to hold the leaf too?
195 */
196 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
197 /*
198 * Yes, use the second-largest entry instead if it works.
199 */
200 if (be16_to_cpu(bf[1].length) >= len)
201 dup = (xfs_dir2_data_unused_t *)
202 ((char *)hdr + be16_to_cpu(bf[1].offset));
203 else
204 dup = NULL;
205 }
206 }
207out:
208 *tagpp = tagp;
209 *dupp = dup;
210 *enddupp = enddup;
211}
212
213/*
214 * Compact the leaf entries.
215 * Leave the highest-numbered stale entry stale.
216 * XXX should be the one closest to mid but mid is not yet computed.
217 */
218static void
219xfs_dir2_block_compact(
220 struct xfs_trans *tp,
221 struct xfs_buf *bp,
222 struct xfs_dir2_data_hdr *hdr,
223 struct xfs_dir2_block_tail *btp,
224 struct xfs_dir2_leaf_entry *blp,
225 int *needlog,
226 int *lfloghigh,
227 int *lfloglow)
228{
229 int fromidx; /* source leaf index */
230 int toidx; /* target leaf index */
231 int needscan = 0;
232 int highstale; /* high stale index */
233
234 fromidx = toidx = be32_to_cpu(btp->count) - 1;
235 highstale = *lfloghigh = -1;
236 for (; fromidx >= 0; fromidx--) {
237 if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
238 if (highstale == -1)
239 highstale = toidx;
240 else {
241 if (*lfloghigh == -1)
242 *lfloghigh = toidx;
243 continue;
244 }
245 }
246 if (fromidx < toidx)
247 blp[toidx] = blp[fromidx];
248 toidx--;
249 }
250 *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
251 *lfloghigh -= be32_to_cpu(btp->stale) - 1;
252 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
253 xfs_dir2_data_make_free(tp, bp,
254 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
255 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
256 needlog, &needscan);
257 blp += be32_to_cpu(btp->stale) - 1;
258 btp->stale = cpu_to_be32(1);
259 /*
260 * If we now need to rebuild the bestfree map, do so.
261 * This needs to happen before the next call to use_free.
262 */
263 if (needscan)
264 xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
265}
266
59/* 267/*
60 * Add an entry to a block directory. 268 * Add an entry to a block directory.
61 */ 269 */
@@ -63,7 +271,6 @@ int /* error */
63xfs_dir2_block_addname( 271xfs_dir2_block_addname(
64 xfs_da_args_t *args) /* directory op arguments */ 272 xfs_da_args_t *args) /* directory op arguments */
65{ 273{
66 xfs_dir2_data_free_t *bf; /* bestfree table in block */
67 xfs_dir2_data_hdr_t *hdr; /* block header */ 274 xfs_dir2_data_hdr_t *hdr; /* block header */
68 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ 275 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
69 struct xfs_buf *bp; /* buffer for block */ 276 struct xfs_buf *bp; /* buffer for block */
@@ -94,134 +301,44 @@ xfs_dir2_block_addname(
94 dp = args->dp; 301 dp = args->dp;
95 tp = args->trans; 302 tp = args->trans;
96 mp = dp->i_mount; 303 mp = dp->i_mount;
97 /* 304
98 * Read the (one and only) directory block into dabuf bp. 305 /* Read the (one and only) directory block into bp. */
99 */ 306 error = xfs_dir2_block_read(tp, dp, &bp);
100 if ((error = 307 if (error)
101 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
102 return error; 308 return error;
103 } 309
104 ASSERT(bp != NULL);
105 hdr = bp->b_addr;
106 /*
107 * Check the magic number, corrupted if wrong.
108 */
109 if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) {
110 XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
111 XFS_ERRLEVEL_LOW, mp, hdr);
112 xfs_trans_brelse(tp, bp);
113 return XFS_ERROR(EFSCORRUPTED);
114 }
115 len = xfs_dir2_data_entsize(args->namelen); 310 len = xfs_dir2_data_entsize(args->namelen);
311
116 /* 312 /*
117 * Set up pointers to parts of the block. 313 * Set up pointers to parts of the block.
118 */ 314 */
119 bf = hdr->bestfree; 315 hdr = bp->b_addr;
120 btp = xfs_dir2_block_tail_p(mp, hdr); 316 btp = xfs_dir2_block_tail_p(mp, hdr);
121 blp = xfs_dir2_block_leaf_p(btp); 317 blp = xfs_dir2_block_leaf_p(btp);
318
122 /* 319 /*
123 * No stale entries? Need space for entry and new leaf. 320 * Find out if we can reuse stale entries or whether we need extra
124 */ 321 * space for entry and new leaf.
125 if (!btp->stale) {
126 /*
127 * Tag just before the first leaf entry.
128 */
129 tagp = (__be16 *)blp - 1;
130 /*
131 * Data object just before the first leaf entry.
132 */
133 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
134 /*
135 * If it's not free then can't do this add without cleaning up:
136 * the space before the first leaf entry needs to be free so it
137 * can be expanded to hold the pointer to the new entry.
138 */
139 if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG)
140 dup = enddup = NULL;
141 /*
142 * Check out the biggest freespace and see if it's the same one.
143 */
144 else {
145 dup = (xfs_dir2_data_unused_t *)
146 ((char *)hdr + be16_to_cpu(bf[0].offset));
147 if (dup == enddup) {
148 /*
149 * It is the biggest freespace, is it too small
150 * to hold the new leaf too?
151 */
152 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
153 /*
154 * Yes, we use the second-largest
155 * entry instead if it works.
156 */
157 if (be16_to_cpu(bf[1].length) >= len)
158 dup = (xfs_dir2_data_unused_t *)
159 ((char *)hdr +
160 be16_to_cpu(bf[1].offset));
161 else
162 dup = NULL;
163 }
164 } else {
165 /*
166 * Not the same free entry,
167 * just check its length.
168 */
169 if (be16_to_cpu(dup->length) < len) {
170 dup = NULL;
171 }
172 }
173 }
174 compact = 0;
175 }
176 /*
177 * If there are stale entries we'll use one for the leaf.
178 * Is the biggest entry enough to avoid compaction?
179 */
180 else if (be16_to_cpu(bf[0].length) >= len) {
181 dup = (xfs_dir2_data_unused_t *)
182 ((char *)hdr + be16_to_cpu(bf[0].offset));
183 compact = 0;
184 }
185 /*
186 * Will need to compact to make this work.
187 */ 322 */
188 else { 323 xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
189 /* 324 &enddup, &compact, len);
190 * Tag just before the first leaf entry. 325
191 */
192 tagp = (__be16 *)blp - 1;
193 /*
194 * Data object just before the first leaf entry.
195 */
196 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
197 /*
198 * If it's not free then the data will go where the
199 * leaf data starts now, if it works at all.
200 */
201 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
202 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
203 (uint)sizeof(*blp) < len)
204 dup = NULL;
205 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
206 dup = NULL;
207 else
208 dup = (xfs_dir2_data_unused_t *)blp;
209 compact = 1;
210 }
211 /* 326 /*
212 * If this isn't a real add, we're done with the buffer. 327 * Done everything we need for a space check now.
213 */ 328 */
214 if (args->op_flags & XFS_DA_OP_JUSTCHECK) 329 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
215 xfs_trans_brelse(tp, bp); 330 xfs_trans_brelse(tp, bp);
331 if (!dup)
332 return XFS_ERROR(ENOSPC);
333 return 0;
334 }
335
216 /* 336 /*
217 * If we don't have space for the new entry & leaf ... 337 * If we don't have space for the new entry & leaf ...
218 */ 338 */
219 if (!dup) { 339 if (!dup) {
220 /* 340 /* Don't have a space reservation: return no-space. */
221 * Not trying to actually do anything, or don't have 341 if (args->total == 0)
222 * a space reservation: return no-space.
223 */
224 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
225 return XFS_ERROR(ENOSPC); 342 return XFS_ERROR(ENOSPC);
226 /* 343 /*
227 * Convert to the next larger format. 344 * Convert to the next larger format.
@@ -232,65 +349,24 @@ xfs_dir2_block_addname(
232 return error; 349 return error;
233 return xfs_dir2_leaf_addname(args); 350 return xfs_dir2_leaf_addname(args);
234 } 351 }
235 /* 352
236 * Just checking, and it would work, so say so.
237 */
238 if (args->op_flags & XFS_DA_OP_JUSTCHECK)
239 return 0;
240 needlog = needscan = 0; 353 needlog = needscan = 0;
354
241 /* 355 /*
242 * If need to compact the leaf entries, do it now. 356 * If need to compact the leaf entries, do it now.
243 * Leave the highest-numbered stale entry stale.
244 * XXX should be the one closest to mid but mid is not yet computed.
245 */
246 if (compact) {
247 int fromidx; /* source leaf index */
248 int toidx; /* target leaf index */
249
250 for (fromidx = toidx = be32_to_cpu(btp->count) - 1,
251 highstale = lfloghigh = -1;
252 fromidx >= 0;
253 fromidx--) {
254 if (blp[fromidx].address ==
255 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
256 if (highstale == -1)
257 highstale = toidx;
258 else {
259 if (lfloghigh == -1)
260 lfloghigh = toidx;
261 continue;
262 }
263 }
264 if (fromidx < toidx)
265 blp[toidx] = blp[fromidx];
266 toidx--;
267 }
268 lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
269 lfloghigh -= be32_to_cpu(btp->stale) - 1;
270 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
271 xfs_dir2_data_make_free(tp, bp,
272 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
273 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
274 &needlog, &needscan);
275 blp += be32_to_cpu(btp->stale) - 1;
276 btp->stale = cpu_to_be32(1);
277 /*
278 * If we now need to rebuild the bestfree map, do so.
279 * This needs to happen before the next call to use_free.
280 */
281 if (needscan) {
282 xfs_dir2_data_freescan(mp, hdr, &needlog);
283 needscan = 0;
284 }
285 }
286 /*
287 * Set leaf logging boundaries to impossible state.
288 * For the no-stale case they're set explicitly.
289 */ 357 */
358 if (compact)
359 xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
360 &lfloghigh, &lfloglow);
290 else if (btp->stale) { 361 else if (btp->stale) {
362 /*
363 * Set leaf logging boundaries to impossible state.
364 * For the no-stale case they're set explicitly.
365 */
291 lfloglow = be32_to_cpu(btp->count); 366 lfloglow = be32_to_cpu(btp->count);
292 lfloghigh = -1; 367 lfloghigh = -1;
293 } 368 }
369
294 /* 370 /*
295 * Find the slot that's first lower than our hash value, -1 if none. 371 * Find the slot that's first lower than our hash value, -1 if none.
296 */ 372 */
@@ -450,18 +526,13 @@ xfs_dir2_block_getdents(
450 /* 526 /*
451 * If the block number in the offset is out of range, we're done. 527 * If the block number in the offset is out of range, we're done.
452 */ 528 */
453 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { 529 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
454 return 0; 530 return 0;
455 } 531
456 /* 532 error = xfs_dir2_block_read(NULL, dp, &bp);
457 * Can't read the block, give up, else get dabuf in bp.
458 */
459 error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
460 &bp, XFS_DATA_FORK);
461 if (error) 533 if (error)
462 return error; 534 return error;
463 535
464 ASSERT(bp != NULL);
465 /* 536 /*
466 * Extract the byte offset we start at from the seek pointer. 537 * Extract the byte offset we start at from the seek pointer.
467 * We'll skip entries before this. 538 * We'll skip entries before this.
@@ -637,14 +708,11 @@ xfs_dir2_block_lookup_int(
637 dp = args->dp; 708 dp = args->dp;
638 tp = args->trans; 709 tp = args->trans;
639 mp = dp->i_mount; 710 mp = dp->i_mount;
640 /* 711
641 * Read the buffer, return error if we can't get it. 712 error = xfs_dir2_block_read(tp, dp, &bp);
642 */ 713 if (error)
643 if ((error =
644 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
645 return error; 714 return error;
646 } 715
647 ASSERT(bp != NULL);
648 hdr = bp->b_addr; 716 hdr = bp->b_addr;
649 xfs_dir2_data_check(dp, bp); 717 xfs_dir2_data_check(dp, bp);
650 btp = xfs_dir2_block_tail_p(mp, hdr); 718 btp = xfs_dir2_block_tail_p(mp, hdr);
@@ -917,10 +985,10 @@ xfs_dir2_leaf_to_block(
917 /* 985 /*
918 * Read the data block if we don't already have it, give up if it fails. 986 * Read the data block if we don't already have it, give up if it fails.
919 */ 987 */
920 if (dbp == NULL && 988 if (!dbp) {
921 (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, 989 error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
922 XFS_DATA_FORK))) { 990 if (error)
923 return error; 991 return error;
924 } 992 }
925 hdr = dbp->b_addr; 993 hdr = dbp->b_addr;
926 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); 994 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
@@ -944,6 +1012,7 @@ xfs_dir2_leaf_to_block(
944 /* 1012 /*
945 * Start converting it to block form. 1013 * Start converting it to block form.
946 */ 1014 */
1015 dbp->b_ops = &xfs_dir2_block_buf_ops;
947 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); 1016 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
948 needlog = 1; 1017 needlog = 1;
949 needscan = 0; 1018 needscan = 0;
@@ -1073,6 +1142,7 @@ xfs_dir2_sf_to_block(
1073 kmem_free(sfp); 1142 kmem_free(sfp);
1074 return error; 1143 return error;
1075 } 1144 }
1145 bp->b_ops = &xfs_dir2_block_buf_ops;
1076 hdr = bp->b_addr; 1146 hdr = bp->b_addr;
1077 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); 1147 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
1078 /* 1148 /*
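After this refactor, xfs_dir2_block_addname() answers XFS_DA_OP_JUSTCHECK probes in one place: the buffer is released as soon as the space check is done and the result is returned without touching the block. The control flow, condensed from the hunks above (paraphrased, not verbatim):

	error = xfs_dir2_block_read(tp, dp, &bp);
	if (error)
		return error;
	xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
				  &enddup, &compact, len);
	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
		xfs_trans_brelse(tp, bp);
		return dup ? 0 : XFS_ERROR(ENOSPC);
	}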
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 44ffd4d6bc91..ffcf1774152e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -34,14 +34,13 @@
34STATIC xfs_dir2_data_free_t * 34STATIC xfs_dir2_data_free_t *
35xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); 35xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
36 36
37#ifdef DEBUG
38/* 37/*
39 * Check the consistency of the data block. 38 * Check the consistency of the data block.
40 * The input can also be a block-format directory. 39 * The input can also be a block-format directory.
41 * Pop an assert if we find anything bad. 40 * Return 0 if the buffer is good, otherwise an error.
42 */ 41 */
43void 42int
44xfs_dir2_data_check( 43__xfs_dir2_data_check(
45 struct xfs_inode *dp, /* incore inode pointer */ 44 struct xfs_inode *dp, /* incore inode pointer */
46 struct xfs_buf *bp) /* data block's buffer */ 45 struct xfs_buf *bp) /* data block's buffer */
47{ 46{
@@ -64,18 +63,23 @@ xfs_dir2_data_check(
64 int stale; /* count of stale leaves */ 63 int stale; /* count of stale leaves */
65 struct xfs_name name; 64 struct xfs_name name;
66 65
67 mp = dp->i_mount; 66 mp = bp->b_target->bt_mount;
68 hdr = bp->b_addr; 67 hdr = bp->b_addr;
69 bf = hdr->bestfree; 68 bf = hdr->bestfree;
70 p = (char *)(hdr + 1); 69 p = (char *)(hdr + 1);
71 70
72 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 71 switch (hdr->magic) {
72 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
73 btp = xfs_dir2_block_tail_p(mp, hdr); 73 btp = xfs_dir2_block_tail_p(mp, hdr);
74 lep = xfs_dir2_block_leaf_p(btp); 74 lep = xfs_dir2_block_leaf_p(btp);
75 endp = (char *)lep; 75 endp = (char *)lep;
76 } else { 76 break;
77 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); 77 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
78 endp = (char *)hdr + mp->m_dirblksize; 78 endp = (char *)hdr + mp->m_dirblksize;
79 break;
80 default:
81 XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
82 return EFSCORRUPTED;
79 } 83 }
80 84
81 count = lastfree = freeseen = 0; 85 count = lastfree = freeseen = 0;
@@ -83,19 +87,22 @@ xfs_dir2_data_check(
83 * Account for zero bestfree entries. 87 * Account for zero bestfree entries.
84 */ 88 */
85 if (!bf[0].length) { 89 if (!bf[0].length) {
86 ASSERT(!bf[0].offset); 90 XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
87 freeseen |= 1 << 0; 91 freeseen |= 1 << 0;
88 } 92 }
89 if (!bf[1].length) { 93 if (!bf[1].length) {
90 ASSERT(!bf[1].offset); 94 XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
91 freeseen |= 1 << 1; 95 freeseen |= 1 << 1;
92 } 96 }
93 if (!bf[2].length) { 97 if (!bf[2].length) {
94 ASSERT(!bf[2].offset); 98 XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
95 freeseen |= 1 << 2; 99 freeseen |= 1 << 2;
96 } 100 }
97 ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); 101
98 ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); 102 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
103 be16_to_cpu(bf[1].length));
104 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
105 be16_to_cpu(bf[2].length));
99 /* 106 /*
100 * Loop over the data/unused entries. 107 * Loop over the data/unused entries.
101 */ 108 */
@@ -107,17 +114,20 @@ xfs_dir2_data_check(
107 * doesn't need to be there. 114 * doesn't need to be there.
108 */ 115 */
109 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 116 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
110 ASSERT(lastfree == 0); 117 XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
111 ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == 118 XFS_WANT_CORRUPTED_RETURN(
112 (char *)dup - (char *)hdr); 119 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
120 (char *)dup - (char *)hdr);
113 dfp = xfs_dir2_data_freefind(hdr, dup); 121 dfp = xfs_dir2_data_freefind(hdr, dup);
114 if (dfp) { 122 if (dfp) {
115 i = (int)(dfp - bf); 123 i = (int)(dfp - bf);
116 ASSERT((freeseen & (1 << i)) == 0); 124 XFS_WANT_CORRUPTED_RETURN(
125 (freeseen & (1 << i)) == 0);
117 freeseen |= 1 << i; 126 freeseen |= 1 << i;
118 } else { 127 } else {
119 ASSERT(be16_to_cpu(dup->length) <= 128 XFS_WANT_CORRUPTED_RETURN(
120 be16_to_cpu(bf[2].length)); 129 be16_to_cpu(dup->length) <=
130 be16_to_cpu(bf[2].length));
121 } 131 }
122 p += be16_to_cpu(dup->length); 132 p += be16_to_cpu(dup->length);
123 lastfree = 1; 133 lastfree = 1;
@@ -130,10 +140,12 @@ xfs_dir2_data_check(
130 * The linear search is crude but this is DEBUG code. 140 * The linear search is crude but this is DEBUG code.
131 */ 141 */
132 dep = (xfs_dir2_data_entry_t *)p; 142 dep = (xfs_dir2_data_entry_t *)p;
133 ASSERT(dep->namelen != 0); 143 XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
134 ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); 144 XFS_WANT_CORRUPTED_RETURN(
135 ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == 145 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
136 (char *)dep - (char *)hdr); 146 XFS_WANT_CORRUPTED_RETURN(
147 be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
148 (char *)dep - (char *)hdr);
137 count++; 149 count++;
138 lastfree = 0; 150 lastfree = 0;
139 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 151 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
@@ -148,27 +160,122 @@ xfs_dir2_data_check(
148 be32_to_cpu(lep[i].hashval) == hash) 160 be32_to_cpu(lep[i].hashval) == hash)
149 break; 161 break;
150 } 162 }
151 ASSERT(i < be32_to_cpu(btp->count)); 163 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
152 } 164 }
153 p += xfs_dir2_data_entsize(dep->namelen); 165 p += xfs_dir2_data_entsize(dep->namelen);
154 } 166 }
155 /* 167 /*
156 * Need to have seen all the entries and all the bestfree slots. 168 * Need to have seen all the entries and all the bestfree slots.
157 */ 169 */
158 ASSERT(freeseen == 7); 170 XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
159 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 171 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
160 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { 172 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
161 if (lep[i].address == 173 if (lep[i].address ==
162 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) 174 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
163 stale++; 175 stale++;
164 if (i > 0) 176 if (i > 0)
165 ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); 177 XFS_WANT_CORRUPTED_RETURN(
178 be32_to_cpu(lep[i].hashval) >=
179 be32_to_cpu(lep[i - 1].hashval));
166 } 180 }
167 ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); 181 XFS_WANT_CORRUPTED_RETURN(count ==
168 ASSERT(stale == be32_to_cpu(btp->stale)); 182 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
183 XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
169 } 184 }
185 return 0;
186}
187
188static void
189xfs_dir2_data_verify(
190 struct xfs_buf *bp)
191{
192 struct xfs_mount *mp = bp->b_target->bt_mount;
193 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
194 int block_ok = 0;
195
196 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
197 block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
198
199 if (!block_ok) {
200 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
201 xfs_buf_ioerror(bp, EFSCORRUPTED);
202 }
203}
204
205/*
206 * Readahead of the first block of the directory when it is opened is completely
207 * oblivious to the format of the directory. Hence we can either get a block
208 * format buffer or a data format buffer on readahead.
209 */
210static void
211xfs_dir2_data_reada_verify(
212 struct xfs_buf *bp)
213{
214 struct xfs_mount *mp = bp->b_target->bt_mount;
215 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
216
217 switch (hdr->magic) {
218 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
219 bp->b_ops = &xfs_dir2_block_buf_ops;
220 bp->b_ops->verify_read(bp);
221 return;
222 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
223 xfs_dir2_data_verify(bp);
224 return;
225 default:
226 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
227 xfs_buf_ioerror(bp, EFSCORRUPTED);
228 break;
229 }
230}
231
232static void
233xfs_dir2_data_read_verify(
234 struct xfs_buf *bp)
235{
236 xfs_dir2_data_verify(bp);
237}
238
239static void
240xfs_dir2_data_write_verify(
241 struct xfs_buf *bp)
242{
243 xfs_dir2_data_verify(bp);
244}
245
246const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
247 .verify_read = xfs_dir2_data_read_verify,
248 .verify_write = xfs_dir2_data_write_verify,
249};
250
251static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
252 .verify_read = xfs_dir2_data_reada_verify,
253 .verify_write = xfs_dir2_data_write_verify,
254};
255
256
257int
258xfs_dir2_data_read(
259 struct xfs_trans *tp,
260 struct xfs_inode *dp,
261 xfs_dablk_t bno,
262 xfs_daddr_t mapped_bno,
263 struct xfs_buf **bpp)
264{
265 return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
266 XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
267}
268
269int
270xfs_dir2_data_readahead(
271 struct xfs_trans *tp,
272 struct xfs_inode *dp,
273 xfs_dablk_t bno,
274 xfs_daddr_t mapped_bno)
275{
276 return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
277 XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
170} 278}
171#endif
172 279
173/* 280/*
174 * Given a data block and an unused entry from that block, 281 * Given a data block and an unused entry from that block,
@@ -409,10 +516,9 @@ xfs_dir2_data_init(
409 */ 516 */
410 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, 517 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
411 XFS_DATA_FORK); 518 XFS_DATA_FORK);
412 if (error) { 519 if (error)
413 return error; 520 return error;
414 } 521 bp->b_ops = &xfs_dir2_data_buf_ops;
415 ASSERT(bp != NULL);
416 522
417 /* 523 /*
418 * Initialize the header. 524 * Initialize the header.
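__xfs_dir2_data_check() above swaps its ASSERT()s for XFS_WANT_CORRUPTED_RETURN() so the checks run on production kernels and fail gracefully instead of only firing on DEBUG builds. Roughly, each use behaves like the following sketch (the real macro lives in fs/xfs/xfs_error.h and still asserts on DEBUG kernels):

	/* approximate behaviour of XFS_WANT_CORRUPTED_RETURN(expr) */
	if (unlikely(!(expr))) {
		XFS_ERROR_REPORT("corrupted metadata",
				 XFS_ERRLEVEL_LOW, mp);
		return XFS_ERROR(EFSCORRUPTED);
	}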
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0b296253bd01..60cd2fa4e047 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -48,6 +48,83 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
48 int first, int last); 48 int first, int last);
49static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); 49static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
50 50
51static void
52xfs_dir2_leaf_verify(
53 struct xfs_buf *bp,
54 __be16 magic)
55{
56 struct xfs_mount *mp = bp->b_target->bt_mount;
57 struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
58 int block_ok = 0;
59
60 block_ok = hdr->info.magic == magic;
61 if (!block_ok) {
62 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
63 xfs_buf_ioerror(bp, EFSCORRUPTED);
64 }
65}
66
67static void
68xfs_dir2_leaf1_read_verify(
69 struct xfs_buf *bp)
70{
71 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
72}
73
74static void
75xfs_dir2_leaf1_write_verify(
76 struct xfs_buf *bp)
77{
78 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
79}
80
81void
82xfs_dir2_leafn_read_verify(
83 struct xfs_buf *bp)
84{
85 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
86}
87
88void
89xfs_dir2_leafn_write_verify(
90 struct xfs_buf *bp)
91{
92 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
93}
94
95static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
96 .verify_read = xfs_dir2_leaf1_read_verify,
97 .verify_write = xfs_dir2_leaf1_write_verify,
98};
99
100const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
101 .verify_read = xfs_dir2_leafn_read_verify,
102 .verify_write = xfs_dir2_leafn_write_verify,
103};
104
105static int
106xfs_dir2_leaf_read(
107 struct xfs_trans *tp,
108 struct xfs_inode *dp,
109 xfs_dablk_t fbno,
110 xfs_daddr_t mappedbno,
111 struct xfs_buf **bpp)
112{
113 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
114 XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
115}
116
117int
118xfs_dir2_leafn_read(
119 struct xfs_trans *tp,
120 struct xfs_inode *dp,
121 xfs_dablk_t fbno,
122 xfs_daddr_t mappedbno,
123 struct xfs_buf **bpp)
124{
125 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
126 XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
127}
51 128
52/* 129/*
53 * Convert a block form directory to a leaf form directory. 130 * Convert a block form directory to a leaf form directory.
@@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf(
125 /* 202 /*
126 * Fix up the block header, make it a data block. 203 * Fix up the block header, make it a data block.
127 */ 204 */
205 dbp->b_ops = &xfs_dir2_data_buf_ops;
128 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); 206 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
129 if (needscan) 207 if (needscan)
130 xfs_dir2_data_freescan(mp, hdr, &needlog); 208 xfs_dir2_data_freescan(mp, hdr, &needlog);
@@ -311,15 +389,11 @@ xfs_dir2_leaf_addname(
311 dp = args->dp; 389 dp = args->dp;
312 tp = args->trans; 390 tp = args->trans;
313 mp = dp->i_mount; 391 mp = dp->i_mount;
314 /* 392
315 * Read the leaf block. 393 error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
316 */ 394 if (error)
317 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
318 XFS_DATA_FORK);
319 if (error) {
320 return error; 395 return error;
321 } 396
322 ASSERT(lbp != NULL);
323 /* 397 /*
324 * Look up the entry by hash value and name. 398 * Look up the entry by hash value and name.
325 * We know it's not there, our caller has already done a lookup. 399 * We know it's not there, our caller has already done a lookup.
@@ -494,22 +568,21 @@ xfs_dir2_leaf_addname(
494 hdr = dbp->b_addr; 568 hdr = dbp->b_addr;
495 bestsp[use_block] = hdr->bestfree[0].length; 569 bestsp[use_block] = hdr->bestfree[0].length;
496 grown = 1; 570 grown = 1;
497 } 571 } else {
498 /* 572 /*
499 * Already had space in some data block. 573 * Already had space in some data block.
500 * Just read that one in. 574 * Just read that one in.
501 */ 575 */
502 else { 576 error = xfs_dir2_data_read(tp, dp,
503 if ((error = 577 xfs_dir2_db_to_da(mp, use_block),
504 xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), 578 -1, &dbp);
505 -1, &dbp, XFS_DATA_FORK))) { 579 if (error) {
506 xfs_trans_brelse(tp, lbp); 580 xfs_trans_brelse(tp, lbp);
507 return error; 581 return error;
508 } 582 }
509 hdr = dbp->b_addr; 583 hdr = dbp->b_addr;
510 grown = 0; 584 grown = 0;
511 } 585 }
512 xfs_dir2_data_check(dp, dbp);
513 /* 586 /*
514 * Point to the biggest freespace in our data block. 587 * Point to the biggest freespace in our data block.
515 */ 588 */
@@ -892,10 +965,9 @@ xfs_dir2_leaf_readbuf(
892 * Read the directory block starting at the first mapping. 965 * Read the directory block starting at the first mapping.
893 */ 966 */
894 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); 967 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
895 error = xfs_da_read_buf(NULL, dp, map->br_startoff, 968 error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
896 map->br_blockcount >= mp->m_dirblkfsbs ? 969 map->br_blockcount >= mp->m_dirblkfsbs ?
897 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, 970 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
898 &bp, XFS_DATA_FORK);
899 971
900 /* 972 /*
901 * Should just skip over the data block instead of giving up. 973 * Should just skip over the data block instead of giving up.
@@ -922,11 +994,11 @@ xfs_dir2_leaf_readbuf(
922 */ 994 */
923 if (i > mip->ra_current && 995 if (i > mip->ra_current &&
924 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { 996 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
925 xfs_buf_readahead(mp->m_ddev_targp, 997 xfs_dir2_data_readahead(NULL, dp,
998 map[mip->ra_index].br_startoff + mip->ra_offset,
926 XFS_FSB_TO_DADDR(mp, 999 XFS_FSB_TO_DADDR(mp,
927 map[mip->ra_index].br_startblock + 1000 map[mip->ra_index].br_startblock +
928 mip->ra_offset), 1001 mip->ra_offset));
929 (int)BTOBB(mp->m_dirblksize));
930 mip->ra_current = i; 1002 mip->ra_current = i;
931 } 1003 }
932 1004
@@ -935,10 +1007,9 @@ xfs_dir2_leaf_readbuf(
935 * use our mapping, but this is a very rare case. 1007 * use our mapping, but this is a very rare case.
936 */ 1008 */
937 else if (i > mip->ra_current) { 1009 else if (i > mip->ra_current) {
938 xfs_da_reada_buf(NULL, dp, 1010 xfs_dir2_data_readahead(NULL, dp,
939 map[mip->ra_index].br_startoff + 1011 map[mip->ra_index].br_startoff +
940 mip->ra_offset, 1012 mip->ra_offset, -1);
941 XFS_DATA_FORK);
942 mip->ra_current = i; 1013 mip->ra_current = i;
943 } 1014 }
944 1015
@@ -1177,15 +1248,14 @@ xfs_dir2_leaf_init(
1177 * Get the buffer for the block. 1248 * Get the buffer for the block.
1178 */ 1249 */
1179 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, 1250 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
1180 XFS_DATA_FORK); 1251 XFS_DATA_FORK);
1181 if (error) { 1252 if (error)
1182 return error; 1253 return error;
1183 } 1254
1184 ASSERT(bp != NULL);
1185 leaf = bp->b_addr;
1186 /* 1255 /*
1187 * Initialize the header. 1256 * Initialize the header.
1188 */ 1257 */
1258 leaf = bp->b_addr;
1189 leaf->hdr.info.magic = cpu_to_be16(magic); 1259 leaf->hdr.info.magic = cpu_to_be16(magic);
1190 leaf->hdr.info.forw = 0; 1260 leaf->hdr.info.forw = 0;
1191 leaf->hdr.info.back = 0; 1261 leaf->hdr.info.back = 0;
@@ -1198,10 +1268,12 @@ xfs_dir2_leaf_init(
1198 * the block. 1268 * the block.
1199 */ 1269 */
1200 if (magic == XFS_DIR2_LEAF1_MAGIC) { 1270 if (magic == XFS_DIR2_LEAF1_MAGIC) {
1271 bp->b_ops = &xfs_dir2_leaf1_buf_ops;
1201 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 1272 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
1202 ltp->bestcount = 0; 1273 ltp->bestcount = 0;
1203 xfs_dir2_leaf_log_tail(tp, bp); 1274 xfs_dir2_leaf_log_tail(tp, bp);
1204 } 1275 } else
1276 bp->b_ops = &xfs_dir2_leafn_buf_ops;
1205 *bpp = bp; 1277 *bpp = bp;
1206 return 0; 1278 return 0;
1207} 1279}
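
A recurring invariant across these hunks: a buffer's b_ops must always match the magic number stored in the block, because the write verifier is expected to re-check that magic at I/O time. Every site that flips a leaf between the LEAF1 and LEAFN formats therefore re-stamps the ops together with the magic. A minimal sketch of the paired update (identifiers as used in the hunks above and below; a sketch, not a verbatim quote of the committed code):

	/* keep the verifier and the on-disk magic in lockstep */
	lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
	xfs_dir2_leaf_log_header(tp, lbp);
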
@@ -1372,13 +1444,11 @@ xfs_dir2_leaf_lookup_int(
1372 dp = args->dp; 1444 dp = args->dp;
1373 tp = args->trans; 1445 tp = args->trans;
1374 mp = dp->i_mount; 1446 mp = dp->i_mount;
1375 /* 1447
1376 * Read the leaf block into the buffer. 1448 error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
1377 */
1378 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
1379 XFS_DATA_FORK);
1380 if (error) 1449 if (error)
1381 return error; 1450 return error;
1451
1382 *lbpp = lbp; 1452 *lbpp = lbp;
1383 leaf = lbp->b_addr; 1453 leaf = lbp->b_addr;
1384 xfs_dir2_leaf_check(dp, lbp); 1454 xfs_dir2_leaf_check(dp, lbp);
@@ -1409,14 +1479,13 @@ xfs_dir2_leaf_lookup_int(
1409 if (newdb != curdb) { 1479 if (newdb != curdb) {
1410 if (dbp) 1480 if (dbp)
1411 xfs_trans_brelse(tp, dbp); 1481 xfs_trans_brelse(tp, dbp);
1412 error = xfs_da_read_buf(tp, dp, 1482 error = xfs_dir2_data_read(tp, dp,
1413 xfs_dir2_db_to_da(mp, newdb), 1483 xfs_dir2_db_to_da(mp, newdb),
1414 -1, &dbp, XFS_DATA_FORK); 1484 -1, &dbp);
1415 if (error) { 1485 if (error) {
1416 xfs_trans_brelse(tp, lbp); 1486 xfs_trans_brelse(tp, lbp);
1417 return error; 1487 return error;
1418 } 1488 }
1419 xfs_dir2_data_check(dp, dbp);
1420 curdb = newdb; 1489 curdb = newdb;
1421 } 1490 }
1422 /* 1491 /*
@@ -1451,9 +1520,9 @@ xfs_dir2_leaf_lookup_int(
1451 ASSERT(cidb != -1); 1520 ASSERT(cidb != -1);
1452 if (cidb != curdb) { 1521 if (cidb != curdb) {
1453 xfs_trans_brelse(tp, dbp); 1522 xfs_trans_brelse(tp, dbp);
1454 error = xfs_da_read_buf(tp, dp, 1523 error = xfs_dir2_data_read(tp, dp,
1455 xfs_dir2_db_to_da(mp, cidb), 1524 xfs_dir2_db_to_da(mp, cidb),
1456 -1, &dbp, XFS_DATA_FORK); 1525 -1, &dbp);
1457 if (error) { 1526 if (error) {
1458 xfs_trans_brelse(tp, lbp); 1527 xfs_trans_brelse(tp, lbp);
1459 return error; 1528 return error;
@@ -1738,10 +1807,9 @@ xfs_dir2_leaf_trim_data(
1738 /* 1807 /*
1739 * Read the offending data block. We need its buffer. 1808 * Read the offending data block. We need its buffer.
1740 */ 1809 */
1741 if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, 1810 error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
1742 XFS_DATA_FORK))) { 1811 if (error)
1743 return error; 1812 return error;
1744 }
1745 1813
1746 leaf = lbp->b_addr; 1814 leaf = lbp->b_addr;
1747 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 1815 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -1864,10 +1932,9 @@ xfs_dir2_node_to_leaf(
1864 /* 1932 /*
1865 * Read the freespace block. 1933 * Read the freespace block.
1866 */ 1934 */
1867 if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, 1935 error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
1868 XFS_DATA_FORK))) { 1936 if (error)
1869 return error; 1937 return error;
1870 }
1871 free = fbp->b_addr; 1938 free = fbp->b_addr;
1872 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1939 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1873 ASSERT(!free->hdr.firstdb); 1940 ASSERT(!free->hdr.firstdb);
@@ -1890,7 +1957,10 @@ xfs_dir2_node_to_leaf(
1890 xfs_dir2_leaf_compact(args, lbp); 1957 xfs_dir2_leaf_compact(args, lbp);
1891 else 1958 else
1892 xfs_dir2_leaf_log_header(tp, lbp); 1959 xfs_dir2_leaf_log_header(tp, lbp);
1960
1961 lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
1893 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); 1962 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
1963
1894 /* 1964 /*
1895 * Set up the leaf tail from the freespace block. 1965 * Set up the leaf tail from the freespace block.
1896 */ 1966 */
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 6c7052406605..5980f9b7fa9b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -55,6 +55,74 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
55static int xfs_dir2_node_addname_int(xfs_da_args_t *args, 55static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
56 xfs_da_state_blk_t *fblk); 56 xfs_da_state_blk_t *fblk);
57 57
58static void
59xfs_dir2_free_verify(
60 struct xfs_buf *bp)
61{
62 struct xfs_mount *mp = bp->b_target->bt_mount;
63 struct xfs_dir2_free_hdr *hdr = bp->b_addr;
64 int block_ok = 0;
65
66 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
67 if (!block_ok) {
68 XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
69 XFS_ERRLEVEL_LOW, mp, hdr);
70 xfs_buf_ioerror(bp, EFSCORRUPTED);
71 }
72}
73
74static void
75xfs_dir2_free_read_verify(
76 struct xfs_buf *bp)
77{
78 xfs_dir2_free_verify(bp);
79}
80
81static void
82xfs_dir2_free_write_verify(
83 struct xfs_buf *bp)
84{
85 xfs_dir2_free_verify(bp);
86}
87
88static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
89 .verify_read = xfs_dir2_free_read_verify,
90 .verify_write = xfs_dir2_free_write_verify,
91};
92
93
94static int
95__xfs_dir2_free_read(
96 struct xfs_trans *tp,
97 struct xfs_inode *dp,
98 xfs_dablk_t fbno,
99 xfs_daddr_t mappedbno,
100 struct xfs_buf **bpp)
101{
102 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
103 XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
104}
105
106int
107xfs_dir2_free_read(
108 struct xfs_trans *tp,
109 struct xfs_inode *dp,
110 xfs_dablk_t fbno,
111 struct xfs_buf **bpp)
112{
113 return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
114}
115
116static int
117xfs_dir2_free_try_read(
118 struct xfs_trans *tp,
119 struct xfs_inode *dp,
120 xfs_dablk_t fbno,
121 struct xfs_buf **bpp)
122{
123 return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
124}
125
58/* 126/*
59 * Log entries from a freespace block. 127 * Log entries from a freespace block.
60 */ 128 */
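
The two read wrappers above differ only in the mappedbno argument: -1 requires the block to exist, while -2 is a "try read" that returns success with *bpp == NULL when the request lands in a hole of the free index. Callers of the try variant must therefore check the buffer pointer, as the later hunks in this file do. A condensed caller sketch (the function name is hypothetical; locking and the real work are elided):

STATIC int
example_free_block_scan(
	struct xfs_trans	*tp,
	struct xfs_inode	*dp,
	xfs_dablk_t		fbno)
{
	struct xfs_buf		*bp;
	int			error;

	error = xfs_dir2_free_try_read(tp, dp, fbno, &bp);
	if (error)
		return error;		/* I/O error or verifier failure */
	if (!bp)
		return 0;		/* hole in the free index: nothing to do */
	/* ... inspect bp->b_addr ... */
	xfs_trans_brelse(tp, bp);
	return 0;
}
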
@@ -131,11 +199,12 @@ xfs_dir2_leaf_to_node(
131 /* 199 /*
132 * Get the buffer for the new freespace block. 200 * Get the buffer for the new freespace block.
133 */ 201 */
134 if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, 202 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
135 XFS_DATA_FORK))) { 203 XFS_DATA_FORK);
204 if (error)
136 return error; 205 return error;
137 } 206 fbp->b_ops = &xfs_dir2_free_buf_ops;
138 ASSERT(fbp != NULL); 207
139 free = fbp->b_addr; 208 free = fbp->b_addr;
140 leaf = lbp->b_addr; 209 leaf = lbp->b_addr;
141 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 210 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -157,7 +226,10 @@ xfs_dir2_leaf_to_node(
157 *to = cpu_to_be16(off); 226 *to = cpu_to_be16(off);
158 } 227 }
159 free->hdr.nused = cpu_to_be32(n); 228 free->hdr.nused = cpu_to_be32(n);
229
230 lbp->b_ops = &xfs_dir2_leafn_buf_ops;
160 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); 231 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
232
161 /* 233 /*
162 * Log everything. 234 * Log everything.
163 */ 235 */
@@ -394,12 +466,10 @@ xfs_dir2_leafn_lookup_for_addname(
394 */ 466 */
395 if (curbp) 467 if (curbp)
396 xfs_trans_brelse(tp, curbp); 468 xfs_trans_brelse(tp, curbp);
397 /* 469
398 * Read the free block. 470 error = xfs_dir2_free_read(tp, dp,
399 */
400 error = xfs_da_read_buf(tp, dp,
401 xfs_dir2_db_to_da(mp, newfdb), 471 xfs_dir2_db_to_da(mp, newfdb),
402 -1, &curbp, XFS_DATA_FORK); 472 &curbp);
403 if (error) 473 if (error)
404 return error; 474 return error;
405 free = curbp->b_addr; 475 free = curbp->b_addr;
@@ -534,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry(
534 ASSERT(state->extravalid); 604 ASSERT(state->extravalid);
535 curbp = state->extrablk.bp; 605 curbp = state->extrablk.bp;
536 } else { 606 } else {
537 error = xfs_da_read_buf(tp, dp, 607 error = xfs_dir2_data_read(tp, dp,
538 xfs_dir2_db_to_da(mp, newdb), 608 xfs_dir2_db_to_da(mp, newdb),
539 -1, &curbp, XFS_DATA_FORK); 609 -1, &curbp);
540 if (error) 610 if (error)
541 return error; 611 return error;
542 } 612 }
@@ -568,6 +638,7 @@ xfs_dir2_leafn_lookup_for_entry(
568 state->extrablk.index = (int)((char *)dep - 638 state->extrablk.index = (int)((char *)dep -
569 (char *)curbp->b_addr); 639 (char *)curbp->b_addr);
570 state->extrablk.magic = XFS_DIR2_DATA_MAGIC; 640 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
641 curbp->b_ops = &xfs_dir2_data_buf_ops;
571 if (cmp == XFS_CMP_EXACT) 642 if (cmp == XFS_CMP_EXACT)
572 return XFS_ERROR(EEXIST); 643 return XFS_ERROR(EEXIST);
573 } 644 }
@@ -582,6 +653,7 @@ xfs_dir2_leafn_lookup_for_entry(
582 state->extrablk.index = -1; 653 state->extrablk.index = -1;
583 state->extrablk.blkno = curdb; 654 state->extrablk.blkno = curdb;
584 state->extrablk.magic = XFS_DIR2_DATA_MAGIC; 655 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
656 curbp->b_ops = &xfs_dir2_data_buf_ops;
585 } else { 657 } else {
586 /* If the curbp is not the CI match block, drop it */ 658 /* If the curbp is not the CI match block, drop it */
587 if (state->extrablk.bp != curbp) 659 if (state->extrablk.bp != curbp)
@@ -825,6 +897,77 @@ xfs_dir2_leafn_rebalance(
825 } 897 }
826} 898}
827 899
900static int
901xfs_dir2_data_block_free(
902 xfs_da_args_t *args,
903 struct xfs_dir2_data_hdr *hdr,
904 struct xfs_dir2_free *free,
905 xfs_dir2_db_t fdb,
906 int findex,
907 struct xfs_buf *fbp,
908 int longest)
909{
910 struct xfs_trans *tp = args->trans;
911 int logfree = 0;
912
913 if (!hdr) {
914 /* One less used entry in the free table. */
915 be32_add_cpu(&free->hdr.nused, -1);
916 xfs_dir2_free_log_header(tp, fbp);
917
918 /*
919 * If this was the last entry in the table, we can trim the
920 * table size back. There might be other entries at the end
921 * referring to non-existent data blocks; trim those too.
922 */
923 if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
924 int i; /* free entry index */
925
926 for (i = findex - 1; i >= 0; i--) {
927 if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
928 break;
929 }
930 free->hdr.nvalid = cpu_to_be32(i + 1);
931 logfree = 0;
932 } else {
933 /* Not the last entry, just punch it out. */
934 free->bests[findex] = cpu_to_be16(NULLDATAOFF);
935 logfree = 1;
936 }
937 /*
938 * If there are no useful entries left in the block,
939 * get rid of the block if we can.
940 */
941 if (!free->hdr.nused) {
942 int error;
943
944 error = xfs_dir2_shrink_inode(args, fdb, fbp);
945 if (error == 0) {
946 fbp = NULL;
947 logfree = 0;
948 } else if (error != ENOSPC || args->total != 0)
949 return error;
950 /*
951 * It's possible to get ENOSPC if there is no
952 * space reservation. In this case someone
953 * else will eventually get rid of this block.
954 */
955 }
956 } else {
957 /*
958 * Data block is not empty, just set the free entry to the new
959 * value.
960 */
961 free->bests[findex] = cpu_to_be16(longest);
962 logfree = 1;
963 }
964
965 /* Log the free entry that changed, unless we got rid of it. */
966 if (logfree)
967 xfs_dir2_free_log_bests(tp, fbp, findex, findex);
968 return 0;
969}
970
828/* 971/*
829 * Remove an entry from a node directory. 972 * Remove an entry from a node directory.
830 * This removes the leaf entry and the data entry, 973 * This removes the leaf entry and the data entry,
@@ -908,17 +1051,16 @@ xfs_dir2_leafn_remove(
908 xfs_dir2_db_t fdb; /* freeblock block number */ 1051 xfs_dir2_db_t fdb; /* freeblock block number */
909 int findex; /* index in freeblock entries */ 1052 int findex; /* index in freeblock entries */
910 xfs_dir2_free_t *free; /* freeblock structure */ 1053 xfs_dir2_free_t *free; /* freeblock structure */
911 int logfree; /* need to log free entry */
912 1054
913 /* 1055 /*
914 * Convert the data block number to a free block, 1056 * Convert the data block number to a free block,
915 * read in the free block. 1057 * read in the free block.
916 */ 1058 */
917 fdb = xfs_dir2_db_to_fdb(mp, db); 1059 fdb = xfs_dir2_db_to_fdb(mp, db);
918 if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), 1060 error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
919 -1, &fbp, XFS_DATA_FORK))) { 1061 &fbp);
1062 if (error)
920 return error; 1063 return error;
921 }
922 free = fbp->b_addr; 1064 free = fbp->b_addr;
923 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1065 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
924 ASSERT(be32_to_cpu(free->hdr.firstdb) == 1066 ASSERT(be32_to_cpu(free->hdr.firstdb) ==
@@ -954,68 +1096,12 @@ xfs_dir2_leafn_remove(
954 * If we got rid of the data block, we can eliminate that entry 1096 * If we got rid of the data block, we can eliminate that entry
955 * in the free block. 1097 * in the free block.
956 */ 1098 */
957 if (hdr == NULL) { 1099 error = xfs_dir2_data_block_free(args, hdr, free,
958 /* 1100 fdb, findex, fbp, longest);
959 * One less used entry in the free table. 1101 if (error)
960 */ 1102 return error;
961 be32_add_cpu(&free->hdr.nused, -1);
962 xfs_dir2_free_log_header(tp, fbp);
963 /*
964 * If this was the last entry in the table, we can
965 * trim the table size back. There might be other
966 * entries at the end referring to non-existent
967 * data blocks, get those too.
968 */
969 if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
970 int i; /* free entry index */
971
972 for (i = findex - 1;
973 i >= 0 &&
974 free->bests[i] == cpu_to_be16(NULLDATAOFF);
975 i--)
976 continue;
977 free->hdr.nvalid = cpu_to_be32(i + 1);
978 logfree = 0;
979 }
980 /*
981 * Not the last entry, just punch it out.
982 */
983 else {
984 free->bests[findex] = cpu_to_be16(NULLDATAOFF);
985 logfree = 1;
986 }
987 /*
988 * If there are no useful entries left in the block,
989 * get rid of the block if we can.
990 */
991 if (!free->hdr.nused) {
992 error = xfs_dir2_shrink_inode(args, fdb, fbp);
993 if (error == 0) {
994 fbp = NULL;
995 logfree = 0;
996 } else if (error != ENOSPC || args->total != 0)
997 return error;
998 /*
999 * It's possible to get ENOSPC if there is no
1000 * space reservation. In this case some one
1001 * else will eventually get rid of this block.
1002 */
1003 }
1004 }
1005 /*
1006 * Data block is not empty, just set the free entry to
1007 * the new value.
1008 */
1009 else {
1010 free->bests[findex] = cpu_to_be16(longest);
1011 logfree = 1;
1012 }
1013 /*
1014 * Log the free entry that changed, unless we got rid of it.
1015 */
1016 if (logfree)
1017 xfs_dir2_free_log_bests(tp, fbp, findex, findex);
1018 } 1103 }
1104
1019 xfs_dir2_leafn_check(dp, bp); 1105 xfs_dir2_leafn_check(dp, bp);
1020 /* 1106 /*
1021 * Return indication of whether this leaf block is empty enough 1107 * Return indication of whether this leaf block is empty enough
@@ -1169,12 +1255,11 @@ xfs_dir2_leafn_toosmall(
1169 /* 1255 /*
1170 * Read the sibling leaf block. 1256 * Read the sibling leaf block.
1171 */ 1257 */
1172 if ((error = 1258 error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
1173 xfs_da_read_buf(state->args->trans, state->args->dp, blkno, 1259 blkno, -1, &bp);
1174 -1, &bp, XFS_DATA_FORK))) { 1260 if (error)
1175 return error; 1261 return error;
1176 } 1262
1177 ASSERT(bp != NULL);
1178 /* 1263 /*
1179 * Count bytes in the two blocks combined. 1264 * Count bytes in the two blocks combined.
1180 */ 1265 */
@@ -1454,14 +1539,13 @@ xfs_dir2_node_addname_int(
1454 * This should be really rare, so there's no reason 1539 * This should be really rare, so there's no reason
1455 * to avoid it. 1540 * to avoid it.
1456 */ 1541 */
1457 if ((error = xfs_da_read_buf(tp, dp, 1542 error = xfs_dir2_free_try_read(tp, dp,
1458 xfs_dir2_db_to_da(mp, fbno), -2, &fbp, 1543 xfs_dir2_db_to_da(mp, fbno),
1459 XFS_DATA_FORK))) { 1544 &fbp);
1545 if (error)
1460 return error; 1546 return error;
1461 } 1547 if (!fbp)
1462 if (unlikely(fbp == NULL)) {
1463 continue; 1548 continue;
1464 }
1465 free = fbp->b_addr; 1549 free = fbp->b_addr;
1466 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1550 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1467 findex = 0; 1551 findex = 0;
@@ -1520,9 +1604,10 @@ xfs_dir2_node_addname_int(
1520 * that was just allocated. 1604 * that was just allocated.
1521 */ 1605 */
1522 fbno = xfs_dir2_db_to_fdb(mp, dbno); 1606 fbno = xfs_dir2_db_to_fdb(mp, dbno);
1523 if (unlikely(error = xfs_da_read_buf(tp, dp, 1607 error = xfs_dir2_free_try_read(tp, dp,
1524 xfs_dir2_db_to_da(mp, fbno), -2, &fbp, 1608 xfs_dir2_db_to_da(mp, fbno),
1525 XFS_DATA_FORK))) 1609 &fbp);
1610 if (error)
1526 return error; 1611 return error;
1527 1612
1528 /* 1613 /*
@@ -1561,12 +1646,12 @@ xfs_dir2_node_addname_int(
1561 /* 1646 /*
1562 * Get a buffer for the new block. 1647 * Get a buffer for the new block.
1563 */ 1648 */
1564 if ((error = xfs_da_get_buf(tp, dp, 1649 error = xfs_da_get_buf(tp, dp,
1565 xfs_dir2_db_to_da(mp, fbno), 1650 xfs_dir2_db_to_da(mp, fbno),
1566 -1, &fbp, XFS_DATA_FORK))) { 1651 -1, &fbp, XFS_DATA_FORK);
1652 if (error)
1567 return error; 1653 return error;
1568 } 1654 fbp->b_ops = &xfs_dir2_free_buf_ops;
1569 ASSERT(fbp != NULL);
1570 1655
1571 /* 1656 /*
1572 * Initialize the new block to be empty, and remember 1657 * Initialize the new block to be empty, and remember
@@ -1630,8 +1715,8 @@ xfs_dir2_node_addname_int(
1630 /* 1715 /*
1631 * Read the data block in. 1716 * Read the data block in.
1632 */ 1717 */
1633 error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), 1718 error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
1634 -1, &dbp, XFS_DATA_FORK); 1719 -1, &dbp);
1635 if (error) 1720 if (error)
1636 return error; 1721 return error;
1637 hdr = dbp->b_addr; 1722 hdr = dbp->b_addr;
@@ -1917,18 +2002,15 @@ xfs_dir2_node_trim_free(
1917 /* 2002 /*
1918 * Read the freespace block. 2003 * Read the freespace block.
1919 */ 2004 */
1920 if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, 2005 error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
1921 XFS_DATA_FORK))) { 2006 if (error)
1922 return error; 2007 return error;
1923 }
1924
1925 /* 2008 /*
1926 * There can be holes in freespace. If fo is a hole, there's 2009 * There can be holes in freespace. If fo is a hole, there's
1927 * nothing to do. 2010 * nothing to do.
1928 */ 2011 */
1929 if (bp == NULL) { 2012 if (!bp)
1930 return 0; 2013 return 0;
1931 }
1932 free = bp->b_addr; 2014 free = bp->b_addr;
1933 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 2015 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1934 /* 2016 /*
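
The rewritten trim loop in xfs_dir2_data_block_free() is easier to follow with concrete numbers. A standalone, non-kernel illustration of the nvalid calculation, runnable as plain C:

#include <stdio.h>

#define NULLOFF 0xffffu		/* stands in for NULLDATAOFF */

/* scan back from the just-freed last entry to the last live one */
static int trim_nvalid(const unsigned short *bests, int findex)
{
	int i;

	for (i = findex - 1; i >= 0; i--) {
		if (bests[i] != NULLOFF)
			break;
	}
	return i + 1;		/* new hdr.nvalid */
}

int main(void)
{
	/* nvalid was 5; entry 4 was the last valid one and was just freed */
	const unsigned short bests[] = { 100, NULLOFF, NULLOFF, NULLOFF, NULLOFF };

	printf("new nvalid = %d\n", trim_nvalid(bests, 4));	/* prints 1 */
	return 0;
}
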
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 3523d3e15aa8..7da79f6515fd 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
30 const unsigned char *name, int len); 30 const unsigned char *name, int len);
31 31
32/* xfs_dir2_block.c */ 32/* xfs_dir2_block.c */
33extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
34
33extern int xfs_dir2_block_addname(struct xfs_da_args *args); 35extern int xfs_dir2_block_addname(struct xfs_da_args *args);
34extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, 36extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
35 xfs_off_t *offset, filldir_t filldir); 37 xfs_off_t *offset, filldir_t filldir);
@@ -41,10 +43,19 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
41 43
42/* xfs_dir2_data.c */ 44/* xfs_dir2_data.c */
43#ifdef DEBUG 45#ifdef DEBUG
44extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); 46#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp)
45#else 47#else
46#define xfs_dir2_data_check(dp,bp) 48#define xfs_dir2_data_check(dp,bp)
47#endif 49#endif
50
51extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
52
53extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
54extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
55 xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
56extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
57 xfs_dablk_t bno, xfs_daddr_t mapped_bno);
58
48extern struct xfs_dir2_data_free * 59extern struct xfs_dir2_data_free *
49xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, 60xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
50 struct xfs_dir2_data_unused *dup, int *loghead); 61 struct xfs_dir2_data_unused *dup, int *loghead);
@@ -66,6 +77,10 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
66 xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); 77 xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
67 78
68/* xfs_dir2_leaf.c */ 79/* xfs_dir2_leaf.c */
80extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
81
82extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
83 xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
69extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, 84extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
70 struct xfs_buf *dbp); 85 struct xfs_buf *dbp);
71extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); 86extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
115extern int xfs_dir2_node_replace(struct xfs_da_args *args); 130extern int xfs_dir2_node_replace(struct xfs_da_args *args);
116extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, 131extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
117 int *rvalp); 132 int *rvalp);
133extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
134 xfs_dablk_t fbno, struct xfs_buf **bpp);
118 135
119/* xfs_dir2_sf.c */ 136/* xfs_dir2_sf.c */
120extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); 137extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
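
The switch of __xfs_dir2_data_check() from void to int is what lets a data-block verifier reuse the full structural check and fail the buffer on a nonzero return. A sketch of such a verifier, modelled on the free-block verifier added in xfs_dir2_node.c above (the real body lives in xfs_dir2_data.c, which is not part of this diff, so this is an assumption):

static void
xfs_dir2_data_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
	int			block_ok = 0;

	block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
	block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
	if (!block_ok) {
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
	}
}
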
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index bf27fcca4843..9e1bf5294c91 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk(
248 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); 248 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
249} 249}
250 250
251static void
252xfs_dquot_buf_verify(
253 struct xfs_buf *bp)
254{
255 struct xfs_mount *mp = bp->b_target->bt_mount;
256 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
257 struct xfs_disk_dquot *ddq;
258 xfs_dqid_t id = 0;
259 int i;
260
261 /*
262 * On the first read of the buffer, verify that each dquot is valid.
263 * We don't know what the id of the dquot is supposed to be, just that
264 * they should be increasing monotonically within the buffer. If the
265 * first id is corrupt, then it will fail on the second dquot in the
266 * buffer so corruptions could point to the wrong dquot in this case.
267 */
268 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
269 int error;
270
271 ddq = &d[i].dd_diskdq;
272
273 if (i == 0)
274 id = be32_to_cpu(ddq->d_id);
275
276 error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
277 "xfs_dquot_read_verify");
278 if (error) {
279 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
280 xfs_buf_ioerror(bp, EFSCORRUPTED);
281 break;
282 }
283 }
284}
285
286static void
287xfs_dquot_buf_read_verify(
288 struct xfs_buf *bp)
289{
290 xfs_dquot_buf_verify(bp);
291}
292
293void
294xfs_dquot_buf_write_verify(
295 struct xfs_buf *bp)
296{
297 xfs_dquot_buf_verify(bp);
298}
251 299
300const struct xfs_buf_ops xfs_dquot_buf_ops = {
301 .verify_read = xfs_dquot_buf_read_verify,
302 .verify_write = xfs_dquot_buf_write_verify,
303};
252 304
253/* 305/*
254 * Allocate a block and fill it with dquots. 306 * Allocate a block and fill it with dquots.
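
The relative-id rule that xfs_dquot_buf_verify() enforces is worth restating: only the first dquot's id is taken on trust; every following dquot in the chunk must carry first_id + i. A toy, non-kernel restatement that can be compiled and run directly:

#include <stdio.h>

/* dquot i in a chunk must carry ids[0] + i */
static int ids_ok(const unsigned int *ids, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (ids[i] != ids[0] + i)
			return 0;	/* reported at index i, even if the
					 * real damage is at index 0 */
	}
	return 1;
}

int main(void)
{
	unsigned int good[] = { 100, 101, 102 };
	unsigned int bad[]  = { 100, 101, 105 };

	printf("%d %d\n", ids_ok(good, 3), ids_ok(bad, 3));	/* 1 0 */
	return 0;
}
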
@@ -315,6 +367,7 @@ xfs_qm_dqalloc(
315 error = xfs_buf_geterror(bp); 367 error = xfs_buf_geterror(bp);
316 if (error) 368 if (error)
317 goto error1; 369 goto error1;
370 bp->b_ops = &xfs_dquot_buf_ops;
318 371
319 /* 372 /*
320 * Make a chunk of dquots out of this buffer and log 373 * Make a chunk of dquots out of this buffer and log
@@ -359,6 +412,51 @@ xfs_qm_dqalloc(
359 412
360 return (error); 413 return (error);
361} 414}
415STATIC int
416xfs_qm_dqrepair(
417 struct xfs_mount *mp,
418 struct xfs_trans *tp,
419 struct xfs_dquot *dqp,
420 xfs_dqid_t firstid,
421 struct xfs_buf **bpp)
422{
423 int error;
424 struct xfs_disk_dquot *ddq;
425 struct xfs_dqblk *d;
426 int i;
427
428 /*
429 * Read the buffer without verification so we get the corrupted
430 * buffer returned to us. Make sure we verify it on write, though.
431 */
432 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
433 mp->m_quotainfo->qi_dqchunklen,
434 0, bpp, NULL);
435
436 if (error) {
437 ASSERT(*bpp == NULL);
438 return XFS_ERROR(error);
439 }
440 (*bpp)->b_ops = &xfs_dquot_buf_ops;
441
442 ASSERT(xfs_buf_islocked(*bpp));
443 d = (struct xfs_dqblk *)(*bpp)->b_addr;
444
445 /* Do the actual repair of dquots in this buffer */
446 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
447 ddq = &d[i].dd_diskdq;
448 error = xfs_qm_dqcheck(mp, ddq, firstid + i,
449 dqp->dq_flags & XFS_DQ_ALLTYPES,
450 XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
451 if (error) {
452 /* repair failed, we're screwed */
453 xfs_trans_brelse(tp, *bpp);
454 return XFS_ERROR(EIO);
455 }
456 }
457
458 return 0;
459}
362 460
363/* 461/*
364 * Maps a dquot to the buffer containing its on-disk version. 462 * Maps a dquot to the buffer containing its on-disk version.
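
xfs_qm_dqrepair() above captures the repair idiom in three steps: read with NULL ops so a corrupt buffer is still handed back, repair the contents in place, then attach the ops so the eventual write is verified. Condensed from the function above (error handling and the per-dquot repair loop elided):

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
				   mp->m_quotainfo->qi_dqchunklen,
				   0, &bp, NULL);	/* no read verifier */
	if (error)
		return XFS_ERROR(error);
	bp->b_ops = &xfs_dquot_buf_ops;			/* verify on write */
	/* ... xfs_qm_dqcheck(..., XFS_QMOPT_DQREPAIR, ...) per dquot ... */
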
@@ -378,7 +476,6 @@ xfs_qm_dqtobp(
378 xfs_buf_t *bp; 476 xfs_buf_t *bp;
379 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); 477 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
380 xfs_mount_t *mp = dqp->q_mount; 478 xfs_mount_t *mp = dqp->q_mount;
381 xfs_disk_dquot_t *ddq;
382 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 479 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
383 xfs_trans_t *tp = (tpp ? *tpp : NULL); 480 xfs_trans_t *tp = (tpp ? *tpp : NULL);
384 481
@@ -439,33 +536,24 @@ xfs_qm_dqtobp(
439 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 536 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
440 dqp->q_blkno, 537 dqp->q_blkno,
441 mp->m_quotainfo->qi_dqchunklen, 538 mp->m_quotainfo->qi_dqchunklen,
442 0, &bp); 539 0, &bp, &xfs_dquot_buf_ops);
443 if (error || !bp)
444 return XFS_ERROR(error);
445 }
446
447 ASSERT(xfs_buf_islocked(bp));
448 540
449 /* 541 if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
450 * calculate the location of the dquot inside the buffer. 542 xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
451 */ 543 mp->m_quotainfo->qi_dqperchunk;
452 ddq = bp->b_addr + dqp->q_bufoffset; 544 ASSERT(bp == NULL);
545 error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
546 }
453 547
454 /* 548 if (error) {
455 * A simple sanity check in case we got a corrupted dquot... 549 ASSERT(bp == NULL);
456 */ 550 return XFS_ERROR(error);
457 error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
458 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
459 "dqtobp");
460 if (error) {
461 if (!(flags & XFS_QMOPT_DQREPAIR)) {
462 xfs_trans_brelse(tp, bp);
463 return XFS_ERROR(EIO);
464 } 551 }
465 } 552 }
466 553
554 ASSERT(xfs_buf_islocked(bp));
467 *O_bpp = bp; 555 *O_bpp = bp;
468 *O_ddpp = ddq; 556 *O_ddpp = bp->b_addr + dqp->q_bufoffset;
469 557
470 return (0); 558 return (0);
471} 559}
@@ -920,7 +1008,7 @@ xfs_qm_dqflush(
920 * Get the buffer containing the on-disk dquot 1008 * Get the buffer containing the on-disk dquot
921 */ 1009 */
922 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, 1010 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
923 mp->m_quotainfo->qi_dqchunklen, 0, &bp); 1011 mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
924 if (error) 1012 if (error)
925 goto out_unlock; 1013 goto out_unlock;
926 1014
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 7d20af27346d..c694a8469c4a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
161 return dqp; 161 return dqp;
162} 162}
163 163
164extern const struct xfs_buf_ops xfs_dquot_buf_ops;
165
164#endif /* __XFS_DQUOT_H__ */ 166#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 8c6d1d70278c..a83611849cee 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -29,6 +29,7 @@
29#include "xfs_inode.h" 29#include "xfs_inode.h"
30#include "xfs_inode_item.h" 30#include "xfs_inode_item.h"
31#include "xfs_trace.h" 31#include "xfs_trace.h"
32#include "xfs_icache.h"
32 33
33/* 34/*
34 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aa473fa640a2..67284edb84d7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -31,6 +31,8 @@
31#include "xfs_error.h" 31#include "xfs_error.h"
32#include "xfs_vnodeops.h" 32#include "xfs_vnodeops.h"
33#include "xfs_da_btree.h" 33#include "xfs_da_btree.h"
34#include "xfs_dir2_format.h"
35#include "xfs_dir2_priv.h"
34#include "xfs_ioctl.h" 36#include "xfs_ioctl.h"
35#include "xfs_trace.h" 37#include "xfs_trace.h"
36 38
@@ -84,7 +86,7 @@ xfs_rw_ilock_demote(
84 * valid before the operation, it will be read from disk before 86 * valid before the operation, it will be read from disk before
85 * being partially zeroed. 87 * being partially zeroed.
86 */ 88 */
87STATIC int 89int
88xfs_iozero( 90xfs_iozero(
89 struct xfs_inode *ip, /* inode */ 91 struct xfs_inode *ip, /* inode */
90 loff_t pos, /* offset in file */ 92 loff_t pos, /* offset in file */
@@ -255,15 +257,14 @@ xfs_file_aio_read(
255 xfs_buftarg_t *target = 257 xfs_buftarg_t *target =
256 XFS_IS_REALTIME_INODE(ip) ? 258 XFS_IS_REALTIME_INODE(ip) ?
257 mp->m_rtdev_targp : mp->m_ddev_targp; 259 mp->m_rtdev_targp : mp->m_ddev_targp;
258 if ((iocb->ki_pos & target->bt_smask) || 260 if ((pos & target->bt_smask) || (size & target->bt_smask)) {
259 (size & target->bt_smask)) { 261 if (pos == i_size_read(inode))
260 if (iocb->ki_pos == i_size_read(inode))
261 return 0; 262 return 0;
262 return -XFS_ERROR(EINVAL); 263 return -XFS_ERROR(EINVAL);
263 } 264 }
264 } 265 }
265 266
266 n = mp->m_super->s_maxbytes - iocb->ki_pos; 267 n = mp->m_super->s_maxbytes - pos;
267 if (n <= 0 || size == 0) 268 if (n <= 0 || size == 0)
268 return 0; 269 return 0;
269 270
@@ -289,20 +290,21 @@ xfs_file_aio_read(
289 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); 290 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
290 291
291 if (inode->i_mapping->nrpages) { 292 if (inode->i_mapping->nrpages) {
292 ret = -xfs_flushinval_pages(ip, 293 ret = -filemap_write_and_wait_range(
293 (iocb->ki_pos & PAGE_CACHE_MASK), 294 VFS_I(ip)->i_mapping,
294 -1, FI_REMAPF_LOCKED); 295 pos, -1);
295 if (ret) { 296 if (ret) {
296 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); 297 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
297 return ret; 298 return ret;
298 } 299 }
300 truncate_pagecache_range(VFS_I(ip), pos, -1);
299 } 301 }
300 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 302 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
301 } 303 }
302 304
303 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 305 trace_xfs_file_read(ip, size, pos, ioflags);
304 306
305 ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); 307 ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
306 if (ret > 0) 308 if (ret > 0)
307 XFS_STATS_ADD(xs_read_bytes, ret); 309 XFS_STATS_ADD(xs_read_bytes, ret);
308 310
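
The hunk above open-codes what the removed xfs_flushinval_pages() helper used to do (the helper itself is deleted from xfs_fs_subr.c later in this diff): write back the dirty range, then drop the cached pages so direct I/O sees the on-disk contents. The essence, with locking elided:

	ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1);
	if (ret)
		return ret;
	truncate_pagecache_range(VFS_I(ip), pos, -1);

One behavioural difference worth noting: the old call sites rounded the start down to a page boundary before invalidating, while the new ones pass pos through unchanged and let truncate_pagecache_range() deal with partial pages.
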
@@ -670,10 +672,11 @@ xfs_file_dio_aio_write(
670 goto out; 672 goto out;
671 673
672 if (mapping->nrpages) { 674 if (mapping->nrpages) {
673 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, 675 ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
674 FI_REMAPF_LOCKED); 676 pos, -1);
675 if (ret) 677 if (ret)
676 goto out; 678 goto out;
679 truncate_pagecache_range(VFS_I(ip), pos, -1);
677 } 680 }
678 681
679 /* 682 /*
@@ -728,16 +731,17 @@ xfs_file_buffered_aio_write(
728write_retry: 731write_retry:
729 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); 732 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
730 ret = generic_file_buffered_write(iocb, iovp, nr_segs, 733 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
731 pos, &iocb->ki_pos, count, ret); 734 pos, &iocb->ki_pos, count, 0);
735
732 /* 736 /*
733 * if we just got an ENOSPC, flush the inode now we aren't holding any 737 * If we just got an ENOSPC, try to write back all dirty inodes to
734 * page locks and retry *once* 738 * convert delalloc space to free up some of the excess reserved
739 * metadata space.
735 */ 740 */
736 if (ret == -ENOSPC && !enospc) { 741 if (ret == -ENOSPC && !enospc) {
737 enospc = 1; 742 enospc = 1;
738 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); 743 xfs_flush_inodes(ip->i_mount);
739 if (!ret) 744 goto write_retry;
740 goto write_retry;
741 } 745 }
742 746
743 current->backing_dev_info = NULL; 747 current->backing_dev_info = NULL;
@@ -889,7 +893,7 @@ xfs_dir_open(
889 */ 893 */
890 mode = xfs_ilock_map_shared(ip); 894 mode = xfs_ilock_map_shared(ip);
891 if (ip->i_d.di_nextents > 0) 895 if (ip->i_d.di_nextents > 0)
892 xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); 896 xfs_dir2_data_readahead(NULL, ip, 0, -1);
893 xfs_iunlock(ip, mode); 897 xfs_iunlock(ip, mode);
894 return 0; 898 return 0;
895} 899}
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c13fed8c394a..6dda3f949b04 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks {
233#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ 233#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
234#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ 234#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
235#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ 235#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
236#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ 236#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
237#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
237#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ 238#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
238 239
239 240
@@ -339,6 +340,35 @@ typedef struct xfs_error_injection {
339 340
340 341
341/* 342/*
343 * Speculative preallocation trimming.
344 */
345#define XFS_EOFBLOCKS_VERSION 1
346struct xfs_eofblocks {
347 __u32 eof_version;
348 __u32 eof_flags;
349 uid_t eof_uid;
350 gid_t eof_gid;
351 prid_t eof_prid;
352 __u32 pad32;
353 __u64 eof_min_file_size;
354 __u64 pad64[12];
355};
356
357/* eof_flags values */
358#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */
359#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */
360#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */
361#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */
362#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */
363#define XFS_EOF_FLAGS_VALID \
364 (XFS_EOF_FLAGS_SYNC | \
365 XFS_EOF_FLAGS_UID | \
366 XFS_EOF_FLAGS_GID | \
367 XFS_EOF_FLAGS_PRID | \
368 XFS_EOF_FLAGS_MINFILESIZE)
369
370
371/*
342 * The user-level Handle Request interface structure. 372 * The user-level Handle Request interface structure.
343 */ 373 */
344typedef struct xfs_fsop_handlereq { 374typedef struct xfs_fsop_handlereq {
@@ -456,6 +486,7 @@ typedef struct xfs_handle {
456/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ 486/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
457#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) 487#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
458#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) 488#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
489#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks)
459 490
460/* 491/*
461 * ioctl commands that replace IRIX syssgi()'s 492 * ioctl commands that replace IRIX syssgi()'s
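
With the structure and flag definitions above plus the XFS_IOC_FREE_EOFBLOCKS number, a caller can request a filtered trim of speculative preallocation. A hypothetical userspace sketch (assumes this header is installed as <xfs/xfs_fs.h>; the function name and the uid/size values are illustrative):

#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>

/* fd: any open descriptor on the target XFS filesystem */
static int trim_eofblocks(int fd)
{
	struct xfs_eofblocks eofb;

	memset(&eofb, 0, sizeof(eofb));
	eofb.eof_version = XFS_EOFBLOCKS_VERSION;
	eofb.eof_flags = XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_MINFILESIZE;
	eofb.eof_uid = 1000;			/* only this owner */
	eofb.eof_min_file_size = 1024 * 1024;	/* only files >= 1 MiB */

	return ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb);
}
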
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
deleted file mode 100644
index 652b875a9d4c..000000000000
--- a/fs/xfs/xfs_fs_subr.c
+++ /dev/null
@@ -1,96 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_vnodeops.h"
20#include "xfs_bmap_btree.h"
21#include "xfs_inode.h"
22#include "xfs_trace.h"
23
24/*
25 * note: all filemap functions return negative error codes. These
26 * need to be inverted before returning to the xfs core functions.
27 */
28void
29xfs_tosspages(
30 xfs_inode_t *ip,
31 xfs_off_t first,
32 xfs_off_t last,
33 int fiopt)
34{
35 /* can't toss partial tail pages, so mask them out */
36 last &= ~(PAGE_SIZE - 1);
37 truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
38}
39
40int
41xfs_flushinval_pages(
42 xfs_inode_t *ip,
43 xfs_off_t first,
44 xfs_off_t last,
45 int fiopt)
46{
47 struct address_space *mapping = VFS_I(ip)->i_mapping;
48 int ret = 0;
49
50 trace_xfs_pagecache_inval(ip, first, last);
51
52 xfs_iflags_clear(ip, XFS_ITRUNCATED);
53 ret = filemap_write_and_wait_range(mapping, first,
54 last == -1 ? LLONG_MAX : last);
55 if (!ret)
56 truncate_inode_pages_range(mapping, first, last);
57 return -ret;
58}
59
60int
61xfs_flush_pages(
62 xfs_inode_t *ip,
63 xfs_off_t first,
64 xfs_off_t last,
65 uint64_t flags,
66 int fiopt)
67{
68 struct address_space *mapping = VFS_I(ip)->i_mapping;
69 int ret = 0;
70 int ret2;
71
72 xfs_iflags_clear(ip, XFS_ITRUNCATED);
73 ret = -filemap_fdatawrite_range(mapping, first,
74 last == -1 ? LLONG_MAX : last);
75 if (flags & XBF_ASYNC)
76 return ret;
77 ret2 = xfs_wait_on_pages(ip, first, last);
78 if (!ret)
79 ret = ret2;
80 return ret;
81}
82
83int
84xfs_wait_on_pages(
85 xfs_inode_t *ip,
86 xfs_off_t first,
87 xfs_off_t last)
88{
89 struct address_space *mapping = VFS_I(ip)->i_mapping;
90
91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? XFS_ISIZE(ip) - 1 : last);
94 }
95 return 0;
96}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c25b094efbf7..94eaeedc5498 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -97,7 +97,9 @@ xfs_fs_geometry(
97 (xfs_sb_version_haslazysbcount(&mp->m_sb) ? 97 (xfs_sb_version_haslazysbcount(&mp->m_sb) ?
98 XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | 98 XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
99 (xfs_sb_version_hasattr2(&mp->m_sb) ? 99 (xfs_sb_version_hasattr2(&mp->m_sb) ?
100 XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); 100 XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
101 (xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
102 XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
101 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? 103 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
102 mp->m_sb.sb_logsectsize : BBSIZE; 104 mp->m_sb.sb_logsectsize : BBSIZE;
103 geo->rtsectsize = mp->m_sb.sb_blocksize; 105 geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -112,18 +114,40 @@ xfs_fs_geometry(
112 return 0; 114 return 0;
113} 115}
114 116
117static struct xfs_buf *
118xfs_growfs_get_hdr_buf(
119 struct xfs_mount *mp,
120 xfs_daddr_t blkno,
121 size_t numblks,
122 int flags,
123 const struct xfs_buf_ops *ops)
124{
125 struct xfs_buf *bp;
126
127 bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
128 if (!bp)
129 return NULL;
130
131 xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
132 bp->b_bn = blkno;
133 bp->b_maps[0].bm_bn = blkno;
134 bp->b_ops = ops;
135
136 return bp;
137}
138
115static int 139static int
116xfs_growfs_data_private( 140xfs_growfs_data_private(
117 xfs_mount_t *mp, /* mount point for filesystem */ 141 xfs_mount_t *mp, /* mount point for filesystem */
118 xfs_growfs_data_t *in) /* growfs data input struct */ 142 xfs_growfs_data_t *in) /* growfs data input struct */
119{ 143{
120 xfs_agf_t *agf; 144 xfs_agf_t *agf;
145 struct xfs_agfl *agfl;
121 xfs_agi_t *agi; 146 xfs_agi_t *agi;
122 xfs_agnumber_t agno; 147 xfs_agnumber_t agno;
123 xfs_extlen_t agsize; 148 xfs_extlen_t agsize;
124 xfs_extlen_t tmpsize; 149 xfs_extlen_t tmpsize;
125 xfs_alloc_rec_t *arec; 150 xfs_alloc_rec_t *arec;
126 struct xfs_btree_block *block;
127 xfs_buf_t *bp; 151 xfs_buf_t *bp;
128 int bucket; 152 int bucket;
129 int dpct; 153 int dpct;
@@ -146,9 +170,14 @@ xfs_growfs_data_private(
146 dpct = pct - mp->m_sb.sb_imax_pct; 170 dpct = pct - mp->m_sb.sb_imax_pct;
147 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 171 bp = xfs_buf_read_uncached(mp->m_ddev_targp,
148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 172 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
149 XFS_FSS_TO_BB(mp, 1), 0); 173 XFS_FSS_TO_BB(mp, 1), 0, NULL);
150 if (!bp) 174 if (!bp)
151 return EIO; 175 return EIO;
176 if (bp->b_error) {
177 int error = bp->b_error;
178 xfs_buf_relse(bp);
179 return error;
180 }
152 xfs_buf_relse(bp); 181 xfs_buf_relse(bp);
153 182
154 new = nb; /* use new as a temporary here */ 183 new = nb; /* use new as a temporary here */
@@ -186,17 +215,18 @@ xfs_growfs_data_private(
186 nfree = 0; 215 nfree = 0;
187 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { 216 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
188 /* 217 /*
189 * AG freelist header block 218 * AG freespace header block
190 */ 219 */
191 bp = xfs_buf_get(mp->m_ddev_targp, 220 bp = xfs_growfs_get_hdr_buf(mp,
192 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 221 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
193 XFS_FSS_TO_BB(mp, 1), 0); 222 XFS_FSS_TO_BB(mp, 1), 0,
223 &xfs_agf_buf_ops);
194 if (!bp) { 224 if (!bp) {
195 error = ENOMEM; 225 error = ENOMEM;
196 goto error0; 226 goto error0;
197 } 227 }
228
198 agf = XFS_BUF_TO_AGF(bp); 229 agf = XFS_BUF_TO_AGF(bp);
199 memset(agf, 0, mp->m_sb.sb_sectsize);
200 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); 230 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
201 agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); 231 agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
202 agf->agf_seqno = cpu_to_be32(agno); 232 agf->agf_seqno = cpu_to_be32(agno);
@@ -223,17 +253,39 @@ xfs_growfs_data_private(
223 goto error0; 253 goto error0;
224 254
225 /* 255 /*
256 * AG freelist header block
257 */
258 bp = xfs_growfs_get_hdr_buf(mp,
259 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
260 XFS_FSS_TO_BB(mp, 1), 0,
261 &xfs_agfl_buf_ops);
262 if (!bp) {
263 error = ENOMEM;
264 goto error0;
265 }
266
267 agfl = XFS_BUF_TO_AGFL(bp);
268 for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
269 agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
270
271 error = xfs_bwrite(bp);
272 xfs_buf_relse(bp);
273 if (error)
274 goto error0;
275
276 /*
226 * AG inode header block 277 * AG inode header block
227 */ 278 */
228 bp = xfs_buf_get(mp->m_ddev_targp, 279 bp = xfs_growfs_get_hdr_buf(mp,
229 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 280 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
230 XFS_FSS_TO_BB(mp, 1), 0); 281 XFS_FSS_TO_BB(mp, 1), 0,
282 &xfs_agi_buf_ops);
231 if (!bp) { 283 if (!bp) {
232 error = ENOMEM; 284 error = ENOMEM;
233 goto error0; 285 goto error0;
234 } 286 }
287
235 agi = XFS_BUF_TO_AGI(bp); 288 agi = XFS_BUF_TO_AGI(bp);
236 memset(agi, 0, mp->m_sb.sb_sectsize);
237 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); 289 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
238 agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); 290 agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
239 agi->agi_seqno = cpu_to_be32(agno); 291 agi->agi_seqno = cpu_to_be32(agno);
@@ -254,24 +306,22 @@ xfs_growfs_data_private(
254 /* 306 /*
255 * BNO btree root block 307 * BNO btree root block
256 */ 308 */
257 bp = xfs_buf_get(mp->m_ddev_targp, 309 bp = xfs_growfs_get_hdr_buf(mp,
258 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 310 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
259 BTOBB(mp->m_sb.sb_blocksize), 0); 311 BTOBB(mp->m_sb.sb_blocksize), 0,
312 &xfs_allocbt_buf_ops);
313
260 if (!bp) { 314 if (!bp) {
261 error = ENOMEM; 315 error = ENOMEM;
262 goto error0; 316 goto error0;
263 } 317 }
264 block = XFS_BUF_TO_BLOCK(bp); 318
265 memset(block, 0, mp->m_sb.sb_blocksize); 319 xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
266 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); 320 arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
267 block->bb_level = 0;
268 block->bb_numrecs = cpu_to_be16(1);
269 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
270 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
271 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
272 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 321 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
273 arec->ar_blockcount = cpu_to_be32( 322 arec->ar_blockcount = cpu_to_be32(
274 agsize - be32_to_cpu(arec->ar_startblock)); 323 agsize - be32_to_cpu(arec->ar_startblock));
324
275 error = xfs_bwrite(bp); 325 error = xfs_bwrite(bp);
276 xfs_buf_relse(bp); 326 xfs_buf_relse(bp);
277 if (error) 327 if (error)
@@ -280,25 +330,22 @@ xfs_growfs_data_private(
280 /* 330 /*
281 * CNT btree root block 331 * CNT btree root block
282 */ 332 */
283 bp = xfs_buf_get(mp->m_ddev_targp, 333 bp = xfs_growfs_get_hdr_buf(mp,
284 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 334 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
285 BTOBB(mp->m_sb.sb_blocksize), 0); 335 BTOBB(mp->m_sb.sb_blocksize), 0,
336 &xfs_allocbt_buf_ops);
286 if (!bp) { 337 if (!bp) {
287 error = ENOMEM; 338 error = ENOMEM;
288 goto error0; 339 goto error0;
289 } 340 }
290 block = XFS_BUF_TO_BLOCK(bp); 341
291 memset(block, 0, mp->m_sb.sb_blocksize); 342 xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
292 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); 343 arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
293 block->bb_level = 0;
294 block->bb_numrecs = cpu_to_be16(1);
295 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
296 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
297 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
298 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 344 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
299 arec->ar_blockcount = cpu_to_be32( 345 arec->ar_blockcount = cpu_to_be32(
300 agsize - be32_to_cpu(arec->ar_startblock)); 346 agsize - be32_to_cpu(arec->ar_startblock));
301 nfree += be32_to_cpu(arec->ar_blockcount); 347 nfree += be32_to_cpu(arec->ar_blockcount);
348
302 error = xfs_bwrite(bp); 349 error = xfs_bwrite(bp);
303 xfs_buf_relse(bp); 350 xfs_buf_relse(bp);
304 if (error) 351 if (error)
@@ -307,20 +354,17 @@ xfs_growfs_data_private(
307 /* 354 /*
308 * INO btree root block 355 * INO btree root block
309 */ 356 */
310 bp = xfs_buf_get(mp->m_ddev_targp, 357 bp = xfs_growfs_get_hdr_buf(mp,
311 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 358 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
312 BTOBB(mp->m_sb.sb_blocksize), 0); 359 BTOBB(mp->m_sb.sb_blocksize), 0,
360 &xfs_inobt_buf_ops);
313 if (!bp) { 361 if (!bp) {
314 error = ENOMEM; 362 error = ENOMEM;
315 goto error0; 363 goto error0;
316 } 364 }
317 block = XFS_BUF_TO_BLOCK(bp); 365
318 memset(block, 0, mp->m_sb.sb_blocksize); 366 xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
319 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); 367
320 block->bb_level = 0;
321 block->bb_numrecs = 0;
322 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
323 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
324 error = xfs_bwrite(bp); 368 error = xfs_bwrite(bp);
325 xfs_buf_relse(bp); 369 xfs_buf_relse(bp);
326 if (error) 370 if (error)
@@ -399,9 +443,28 @@ xfs_growfs_data_private(
399 443
400 /* update secondary superblocks. */ 444 /* update secondary superblocks. */
401 for (agno = 1; agno < nagcount; agno++) { 445 for (agno = 1; agno < nagcount; agno++) {
402 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 446 error = 0;
447 /*
448 * new secondary superblocks need to be zeroed, not read from
449 * disk, as the contents of the new area we are growing into are
450 * completely unknown.
451 */
452 if (agno < oagcount) {
453 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
454 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
455 XFS_FSS_TO_BB(mp, 1), 0, &bp,
456 &xfs_sb_buf_ops);
457 } else {
458 bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
403 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 459 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
404 XFS_FSS_TO_BB(mp, 1), 0, &bp); 460 XFS_FSS_TO_BB(mp, 1), 0);
461 if (bp) {
462 bp->b_ops = &xfs_sb_buf_ops;
463 xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
464 } else
465 error = ENOMEM;
466 }
467
405 if (error) { 468 if (error) {
406 xfs_warn(mp, 469 xfs_warn(mp,
407 "error %d reading secondary superblock for ag %d", 470 "error %d reading secondary superblock for ag %d",
@@ -409,6 +472,7 @@ xfs_growfs_data_private(
409 break; 472 break;
410 } 473 }
411 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); 474 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
475
412 /* 476 /*
413 * If we get an error writing out the alternate superblocks, 477 * If we get an error writing out the alternate superblocks,
414 * just issue a warning and continue. The real work is 478 * just issue a warning and continue. The real work is
@@ -423,7 +487,7 @@ xfs_growfs_data_private(
423 break; /* no point in continuing */ 487 break; /* no point in continuing */
424 } 488 }
425 } 489 }
426 return 0; 490 return error;
427 491
428 error0: 492 error0:
429 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 493 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b9..5399ef222dd7 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,7 +21,8 @@
21/* 21/*
22 * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, 22 * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
23 * other XFS code uses these values. Times are measured in centisecs (i.e. 23 * other XFS code uses these values. Times are measured in centisecs (i.e.
24 * 100ths of a second). 24 * 100ths of a second) with the exception of eofb_timer, which is measured in
25 * seconds.
25 */ 26 */
26xfs_param_t xfs_params = { 27xfs_param_t xfs_params = {
27 /* MIN DFLT MAX */ 28 /* MIN DFLT MAX */
@@ -40,4 +41,5 @@ xfs_param_t xfs_params = {
40 .rotorstep = { 1, 1, 255 }, 41 .rotorstep = { 1, 1, 255 },
41 .inherit_nodfrg = { 0, 1, 1 }, 42 .inherit_nodfrg = { 0, 1, 1 },
42 .fstrm_timer = { 1, 30*100, 3600*100}, 43 .fstrm_timer = { 1, 30*100, 3600*100},
44 .eofb_timer = { 1, 300, 3600*24},
43}; 45};
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 445bf1aef31c..a815412eab80 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,7 +200,8 @@ xfs_ialloc_inode_init(
200 */ 200 */
201 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); 201 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
203 mp->m_bsize * blks_per_cluster, 0); 203 mp->m_bsize * blks_per_cluster,
204 XBF_UNMAPPED);
204 if (!fbuf) 205 if (!fbuf)
205 return ENOMEM; 206 return ENOMEM;
206 /* 207 /*
@@ -210,6 +211,7 @@ xfs_ialloc_inode_init(
210 * to log a whole cluster of inodes instead of all the 211 * to log a whole cluster of inodes instead of all the
211 * individual transactions causing a lot of log traffic. 212 * individual transactions causing a lot of log traffic.
212 */ 213 */
214 fbuf->b_ops = &xfs_inode_buf_ops;
213 xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); 215 xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
214 for (i = 0; i < ninodes; i++) { 216 for (i = 0; i < ninodes; i++) {
215 int ioffset = i << mp->m_sb.sb_inodelog; 217 int ioffset = i << mp->m_sb.sb_inodelog;
@@ -250,6 +252,7 @@ xfs_ialloc_ag_alloc(
250 /* boundary */ 252 /* boundary */
251 struct xfs_perag *pag; 253 struct xfs_perag *pag;
252 254
255 memset(&args, 0, sizeof(args));
253 args.tp = tp; 256 args.tp = tp;
254 args.mp = tp->t_mountp; 257 args.mp = tp->t_mountp;
255 258
@@ -876,9 +879,9 @@ error0:
876 * This function is designed to be called twice if it has to do an allocation 879 * This function is designed to be called twice if it has to do an allocation
877 * to make more free inodes. On the first call, *IO_agbp should be set to NULL. 880 * to make more free inodes. On the first call, *IO_agbp should be set to NULL.
878 * If an inode is available without having to perform an allocation, an inode 881 * If an inode is available without having to perform an allocation, an inode
879 * number is returned. In this case, *IO_agbp would be NULL. If an allocation 882 * number is returned. In this case, *IO_agbp is set to NULL. If an allocation
880 * needes to be done, xfs_dialloc would return the current AGI buffer in 883 * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
881 * *IO_agbp. The caller should then commit the current transaction, allocate a 884 * The caller should then commit the current transaction, allocate a
882 * new transaction, and call xfs_dialloc() again, passing in the previous value 885 * new transaction, and call xfs_dialloc() again, passing in the previous value
883 * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI 886 * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI
884 * buffer is locked across the two calls, the second call is guaranteed to have 887 * buffer is locked across the two calls, the second call is guaranteed to have
@@ -1471,6 +1474,57 @@ xfs_check_agi_unlinked(
1471#define xfs_check_agi_unlinked(agi) 1474#define xfs_check_agi_unlinked(agi)
1472#endif 1475#endif
1473 1476
1477static void
1478xfs_agi_verify(
1479 struct xfs_buf *bp)
1480{
1481 struct xfs_mount *mp = bp->b_target->bt_mount;
1482 struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
1483 int agi_ok;
1484
1485 /*
1486 * Validate the magic number of the agi block.
1487 */
1488 agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
1489 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1490
1491 /*
1492 * During growfs operations the perag is not fully initialised, so it
1493 * cannot be used for any useful checking. growfs allocates these blocks
1494 * through uncached buffers that have no perag attached, so we can
1495 * detect that case and skip the seqno check.
1496 */
1497 if (bp->b_pag)
1498 agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
1499 bp->b_pag->pag_agno;
1500
1501 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1502 XFS_RANDOM_IALLOC_READ_AGI))) {
1503 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
1504 xfs_buf_ioerror(bp, EFSCORRUPTED);
1505 }
1506 xfs_check_agi_unlinked(agi);
1507}
1508
1509static void
1510xfs_agi_read_verify(
1511 struct xfs_buf *bp)
1512{
1513 xfs_agi_verify(bp);
1514}
1515
1516static void
1517xfs_agi_write_verify(
1518 struct xfs_buf *bp)
1519{
1520 xfs_agi_verify(bp);
1521}
1522
1523const struct xfs_buf_ops xfs_agi_buf_ops = {
1524 .verify_read = xfs_agi_read_verify,
1525 .verify_write = xfs_agi_write_verify,
1526};
1527
1474/* 1528/*
1475 * Read in the allocation group header (inode allocation section) 1529 * Read in the allocation group header (inode allocation section)
1476 */ 1530 */
@@ -1481,38 +1535,18 @@ xfs_read_agi(
1481 xfs_agnumber_t agno, /* allocation group number */ 1535 xfs_agnumber_t agno, /* allocation group number */
1482 struct xfs_buf **bpp) /* allocation group hdr buf */ 1536 struct xfs_buf **bpp) /* allocation group hdr buf */
1483{ 1537{
1484 struct xfs_agi *agi; /* allocation group header */
1485 int agi_ok; /* agi is consistent */
1486 int error; 1538 int error;
1487 1539
1488 ASSERT(agno != NULLAGNUMBER); 1540 ASSERT(agno != NULLAGNUMBER);
1489 1541
1490 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 1542 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
1491 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 1543 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
1492 XFS_FSS_TO_BB(mp, 1), 0, bpp); 1544 XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
1493 if (error) 1545 if (error)
1494 return error; 1546 return error;
1495 1547
1496 ASSERT(!xfs_buf_geterror(*bpp)); 1548 ASSERT(!xfs_buf_geterror(*bpp));
1497 agi = XFS_BUF_TO_AGI(*bpp);
1498
1499 /*
1500 * Validate the magic number of the agi block.
1501 */
1502 agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
1503 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
1504 be32_to_cpu(agi->agi_seqno) == agno;
1505 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1506 XFS_RANDOM_IALLOC_READ_AGI))) {
1507 XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
1508 mp, agi);
1509 xfs_trans_brelse(tp, *bpp);
1510 return XFS_ERROR(EFSCORRUPTED);
1511 }
1512
1513 xfs_buf_set_ref(*bpp, XFS_AGI_REF); 1549 xfs_buf_set_ref(*bpp, XFS_AGI_REF);
1514
1515 xfs_check_agi_unlinked(agi);
1516 return 0; 1550 return 0;
1517} 1551}
1518 1552
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 1fd6ea4e9c91..c8da3df271e6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
147/* 147/*
148 * Get the data from the pointed-to record. 148 * Get the data from the pointed-to record.
149 */ 149 */
150extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, 150int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
151 xfs_inobt_rec_incore_t *rec, int *stat); 151 xfs_inobt_rec_incore_t *rec, int *stat);
152 152
153extern const struct xfs_buf_ops xfs_agi_buf_ops;
154
153#endif /* __XFS_IALLOC_H__ */ 155#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 2b8b7a37aa18..bec344b36507 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -33,6 +33,7 @@
33#include "xfs_ialloc.h" 33#include "xfs_ialloc.h"
34#include "xfs_alloc.h" 34#include "xfs_alloc.h"
35#include "xfs_error.h" 35#include "xfs_error.h"
36#include "xfs_trace.h"
36 37
37 38
38STATIC int 39STATIC int
@@ -181,6 +182,59 @@ xfs_inobt_key_diff(
181 cur->bc_rec.i.ir_startino; 182 cur->bc_rec.i.ir_startino;
182} 183}
183 184
185void
186xfs_inobt_verify(
187 struct xfs_buf *bp)
188{
189 struct xfs_mount *mp = bp->b_target->bt_mount;
190 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
191 unsigned int level;
192 int sblock_ok; /* block passes checks */
193
194 /* magic number and level verification */
195 level = be16_to_cpu(block->bb_level);
196 sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
197 level < mp->m_in_maxlevels;
198
199 /* numrecs verification */
200 sblock_ok = sblock_ok &&
201 be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
202
203 /* sibling pointer verification */
204 sblock_ok = sblock_ok &&
205 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
206 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
207 block->bb_u.s.bb_leftsib &&
208 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
209 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
210 block->bb_u.s.bb_rightsib;
211
212 if (!sblock_ok) {
213 trace_xfs_btree_corrupt(bp, _RET_IP_);
214 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
215 xfs_buf_ioerror(bp, EFSCORRUPTED);
216 }
217}
218
219static void
220xfs_inobt_read_verify(
221 struct xfs_buf *bp)
222{
223 xfs_inobt_verify(bp);
224}
225
226static void
227xfs_inobt_write_verify(
228 struct xfs_buf *bp)
229{
230 xfs_inobt_verify(bp);
231}
232
233const struct xfs_buf_ops xfs_inobt_buf_ops = {
234 .verify_read = xfs_inobt_read_verify,
235 .verify_write = xfs_inobt_write_verify,
236};
237
184#ifdef DEBUG 238#ifdef DEBUG
185STATIC int 239STATIC int
186xfs_inobt_keys_inorder( 240xfs_inobt_keys_inorder(
@@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
218 .init_rec_from_cur = xfs_inobt_init_rec_from_cur, 272 .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
219 .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, 273 .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
220 .key_diff = xfs_inobt_key_diff, 274 .key_diff = xfs_inobt_key_diff,
275 .buf_ops = &xfs_inobt_buf_ops,
221#ifdef DEBUG 276#ifdef DEBUG
222 .keys_inorder = xfs_inobt_keys_inorder, 277 .keys_inorder = xfs_inobt_keys_inorder,
223 .recs_inorder = xfs_inobt_recs_inorder, 278 .recs_inorder = xfs_inobt_recs_inorder,
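
xfs_inobt_verify() above applies four bounds checks to a short-form btree block: the magic number, the level against m_in_maxlevels, numrecs against m_inobt_mxr[], and sibling pointers that must be either NULLAGBLOCK or a valid non-zero block within the AG. A compact sketch of the same checks on an invented block layout (demo_sblock; the limits passed in main are arbitrary):

#include <stdint.h>
#include <stdio.h>

#define DEMO_NULLAGBLOCK 0xffffffffu

struct demo_sblock {
	uint32_t magic;
	uint16_t level;
	uint16_t numrecs;
	uint32_t leftsib;
	uint32_t rightsib;
};

static int demo_sblock_ok(const struct demo_sblock *b,
			  uint32_t want_magic, uint16_t maxlevels,
			  uint16_t maxrecs, uint32_t agblocks)
{
	/* magic number and level verification */
	if (b->magic != want_magic || b->level >= maxlevels)
		return 0;
	/* numrecs verification */
	if (b->numrecs > maxrecs)
		return 0;
	/* siblings must be NULLAGBLOCK or inside the AG, and never block 0 */
	if (b->leftsib != DEMO_NULLAGBLOCK && b->leftsib >= agblocks)
		return 0;
	if (b->rightsib != DEMO_NULLAGBLOCK && b->rightsib >= agblocks)
		return 0;
	return b->leftsib != 0 && b->rightsib != 0;
}

int main(void)
{
	struct demo_sblock b = {
		.magic = 0x49414254, .level = 1, .numrecs = 10,
		.leftsib = DEMO_NULLAGBLOCK, .rightsib = 42,
	};

	printf("block %s\n",
	       demo_sblock_ok(&b, 0x49414254, 3, 64, 1000) ? "ok" : "corrupt");
	return 0;
}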
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index f782ad0c4769..25c0239a8eab 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
109 struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); 109 struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
110extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); 110extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
111 111
112extern const struct xfs_buf_ops xfs_inobt_buf_ops;
113
112#endif /* __XFS_IALLOC_BTREE_H__ */ 114#endif /* __XFS_IALLOC_BTREE_H__ */
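
The .buf_ops hook added to xfs_inobt_ops above lets generic btree code hand the right verifier to every block read for that btree type. A toy model of the indirection (demo_* names are invented): the per-type ops struct carries a pointer to the buffer ops, and a generic read helper threads it through.

#include <stdio.h>

struct demo_buf {
	int magic;
	int error;
};

struct demo_buf_ops {
	void (*verify_read)(struct demo_buf *bp);
};

/* per-btree-type operations, carrying the buffer verifier */
struct demo_btree_ops {
	const char *name;
	const struct demo_buf_ops *buf_ops;
};

static void demo_inobt_read_verify(struct demo_buf *bp)
{
	if (bp->magic != 0x49414254)	/* "IABT" */
		bp->error = 1;
}

static const struct demo_buf_ops demo_inobt_buf_ops = {
	.verify_read = demo_inobt_read_verify,
};

static const struct demo_btree_ops demo_inobt_ops = {
	.name = "inobt",
	.buf_ops = &demo_inobt_buf_ops,
};

/* generic code: reads a block and runs whatever verifier the type supplies */
static int demo_btree_read_block(const struct demo_btree_ops *ops,
				 struct demo_buf *bp)
{
	if (ops->buf_ops && ops->buf_ops->verify_read)
		ops->buf_ops->verify_read(bp);
	return bp->error;
}

int main(void)
{
	struct demo_buf good = { .magic = 0x49414254 };
	struct demo_buf bad = { .magic = 0 };

	printf("%s good block: %d\n", demo_inobt_ops.name,
	       demo_btree_read_block(&demo_inobt_ops, &good));
	printf("%s bad block: %d\n", demo_inobt_ops.name,
	       demo_btree_read_block(&demo_inobt_ops, &bad));
	return 0;
}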
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c
index 9500caf15acf..96e344e3e927 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_icache.c
@@ -19,6 +19,7 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_log_priv.h"
22#include "xfs_inum.h" 23#include "xfs_inum.h"
23#include "xfs_trans.h" 24#include "xfs_trans.h"
24#include "xfs_trans_priv.h" 25#include "xfs_trans_priv.h"
@@ -35,11 +36,425 @@
35#include "xfs_quota.h" 36#include "xfs_quota.h"
36#include "xfs_trace.h" 37#include "xfs_trace.h"
37#include "xfs_fsops.h" 38#include "xfs_fsops.h"
39#include "xfs_icache.h"
38 40
39#include <linux/kthread.h> 41#include <linux/kthread.h>
40#include <linux/freezer.h> 42#include <linux/freezer.h>
41 43
42struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ 44STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
45 struct xfs_perag *pag, struct xfs_inode *ip);
46
47/*
48 * Allocate and initialise an xfs_inode.
49 */
50STATIC struct xfs_inode *
51xfs_inode_alloc(
52 struct xfs_mount *mp,
53 xfs_ino_t ino)
54{
55 struct xfs_inode *ip;
56
57 /*
58 * if this didn't occur in transactions, we could use
59 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
60 * code up to do this anyway.
61 */
62 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
63 if (!ip)
64 return NULL;
65 if (inode_init_always(mp->m_super, VFS_I(ip))) {
66 kmem_zone_free(xfs_inode_zone, ip);
67 return NULL;
68 }
69
70 ASSERT(atomic_read(&ip->i_pincount) == 0);
71 ASSERT(!spin_is_locked(&ip->i_flags_lock));
72 ASSERT(!xfs_isiflocked(ip));
73 ASSERT(ip->i_ino == 0);
74
75 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
76
77 /* initialise the xfs inode */
78 ip->i_ino = ino;
79 ip->i_mount = mp;
80 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
81 ip->i_afp = NULL;
82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83 ip->i_flags = 0;
84 ip->i_delayed_blks = 0;
85 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
86
87 return ip;
88}
89
90STATIC void
91xfs_inode_free_callback(
92 struct rcu_head *head)
93{
94 struct inode *inode = container_of(head, struct inode, i_rcu);
95 struct xfs_inode *ip = XFS_I(inode);
96
97 kmem_zone_free(xfs_inode_zone, ip);
98}
99
100STATIC void
101xfs_inode_free(
102 struct xfs_inode *ip)
103{
104 switch (ip->i_d.di_mode & S_IFMT) {
105 case S_IFREG:
106 case S_IFDIR:
107 case S_IFLNK:
108 xfs_idestroy_fork(ip, XFS_DATA_FORK);
109 break;
110 }
111
112 if (ip->i_afp)
113 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
114
115 if (ip->i_itemp) {
116 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
117 xfs_inode_item_destroy(ip);
118 ip->i_itemp = NULL;
119 }
120
121 /* asserts to verify all state is correct here */
122 ASSERT(atomic_read(&ip->i_pincount) == 0);
123 ASSERT(!spin_is_locked(&ip->i_flags_lock));
124 ASSERT(!xfs_isiflocked(ip));
125
126 /*
127 * Because we use RCU freeing we need to ensure the inode always
128 * appears to be reclaimed with an invalid inode number when in the
129 * free state. The ip->i_flags_lock provides the barrier against lookup
130 * races.
131 */
132 spin_lock(&ip->i_flags_lock);
133 ip->i_flags = XFS_IRECLAIM;
134 ip->i_ino = 0;
135 spin_unlock(&ip->i_flags_lock);
136
137 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138}
139
140/*
 141 * Check the validity of the inode we just found in the cache
142 */
143static int
144xfs_iget_cache_hit(
145 struct xfs_perag *pag,
146 struct xfs_inode *ip,
147 xfs_ino_t ino,
148 int flags,
149 int lock_flags) __releases(RCU)
150{
151 struct inode *inode = VFS_I(ip);
152 struct xfs_mount *mp = ip->i_mount;
153 int error;
154
155 /*
156 * check for re-use of an inode within an RCU grace period due to the
157 * radix tree nodes not being updated yet. We monitor for this by
158 * setting the inode number to zero before freeing the inode structure.
159 * If the inode has been reallocated and set up, then the inode number
160 * will not match, so check for that, too.
161 */
162 spin_lock(&ip->i_flags_lock);
163 if (ip->i_ino != ino) {
164 trace_xfs_iget_skip(ip);
165 XFS_STATS_INC(xs_ig_frecycle);
166 error = EAGAIN;
167 goto out_error;
168 }
169
170
171 /*
172 * If we are racing with another cache hit that is currently
173 * instantiating this inode or currently recycling it out of
 174	 * reclaimable state, wait for the initialisation to complete
175 * before continuing.
176 *
177 * XXX(hch): eventually we should do something equivalent to
178 * wait_on_inode to wait for these flags to be cleared
179 * instead of polling for it.
180 */
181 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
182 trace_xfs_iget_skip(ip);
183 XFS_STATS_INC(xs_ig_frecycle);
184 error = EAGAIN;
185 goto out_error;
186 }
187
188 /*
189 * If lookup is racing with unlink return an error immediately.
190 */
191 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
192 error = ENOENT;
193 goto out_error;
194 }
195
196 /*
197 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
 198	 * Need to carefully get it back into a usable state.
199 */
200 if (ip->i_flags & XFS_IRECLAIMABLE) {
201 trace_xfs_iget_reclaim(ip);
202
203 /*
204 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
205 * from stomping over us while we recycle the inode. We can't
206 * clear the radix tree reclaimable tag yet as it requires
207 * pag_ici_lock to be held exclusive.
208 */
209 ip->i_flags |= XFS_IRECLAIM;
210
211 spin_unlock(&ip->i_flags_lock);
212 rcu_read_unlock();
213
214 error = -inode_init_always(mp->m_super, inode);
215 if (error) {
216 /*
217 * Re-initializing the inode failed, and we are in deep
218 * trouble. Try to re-add it to the reclaim list.
219 */
220 rcu_read_lock();
221 spin_lock(&ip->i_flags_lock);
222
223 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
224 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
225 trace_xfs_iget_reclaim_fail(ip);
226 goto out_error;
227 }
228
229 spin_lock(&pag->pag_ici_lock);
230 spin_lock(&ip->i_flags_lock);
231
232 /*
233 * Clear the per-lifetime state in the inode as we are now
234 * effectively a new inode and need to return to the initial
235 * state before reuse occurs.
236 */
237 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
238 ip->i_flags |= XFS_INEW;
239 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
240 inode->i_state = I_NEW;
241
242 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
243 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
244
245 spin_unlock(&ip->i_flags_lock);
246 spin_unlock(&pag->pag_ici_lock);
247 } else {
248 /* If the VFS inode is being torn down, pause and try again. */
249 if (!igrab(inode)) {
250 trace_xfs_iget_skip(ip);
251 error = EAGAIN;
252 goto out_error;
253 }
254
255 /* We've got a live one. */
256 spin_unlock(&ip->i_flags_lock);
257 rcu_read_unlock();
258 trace_xfs_iget_hit(ip);
259 }
260
261 if (lock_flags != 0)
262 xfs_ilock(ip, lock_flags);
263
264 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
265 XFS_STATS_INC(xs_ig_found);
266
267 return 0;
268
269out_error:
270 spin_unlock(&ip->i_flags_lock);
271 rcu_read_unlock();
272 return error;
273}
274
275
276static int
277xfs_iget_cache_miss(
278 struct xfs_mount *mp,
279 struct xfs_perag *pag,
280 xfs_trans_t *tp,
281 xfs_ino_t ino,
282 struct xfs_inode **ipp,
283 int flags,
284 int lock_flags)
285{
286 struct xfs_inode *ip;
287 int error;
288 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
289 int iflags;
290
291 ip = xfs_inode_alloc(mp, ino);
292 if (!ip)
293 return ENOMEM;
294
295 error = xfs_iread(mp, tp, ip, flags);
296 if (error)
297 goto out_destroy;
298
299 trace_xfs_iget_miss(ip);
300
301 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
302 error = ENOENT;
303 goto out_destroy;
304 }
305
306 /*
307 * Preload the radix tree so we can insert safely under the
308 * write spinlock. Note that we cannot sleep inside the preload
309 * region. Since we can be called from transaction context, don't
310 * recurse into the file system.
311 */
312 if (radix_tree_preload(GFP_NOFS)) {
313 error = EAGAIN;
314 goto out_destroy;
315 }
316
317 /*
318 * Because the inode hasn't been added to the radix-tree yet it can't
319 * be found by another thread, so we can do the non-sleeping lock here.
320 */
321 if (lock_flags) {
322 if (!xfs_ilock_nowait(ip, lock_flags))
323 BUG();
324 }
325
326 /*
327 * These values must be set before inserting the inode into the radix
328 * tree as the moment it is inserted a concurrent lookup (allowed by the
329 * RCU locking mechanism) can find it and that lookup must see that this
330 * is an inode currently under construction (i.e. that XFS_INEW is set).
331 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
332 * memory barrier that ensures this detection works correctly at lookup
333 * time.
334 */
335 iflags = XFS_INEW;
336 if (flags & XFS_IGET_DONTCACHE)
337 iflags |= XFS_IDONTCACHE;
338 ip->i_udquot = ip->i_gdquot = NULL;
339 xfs_iflags_set(ip, iflags);
340
341 /* insert the new inode */
342 spin_lock(&pag->pag_ici_lock);
343 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
344 if (unlikely(error)) {
345 WARN_ON(error != -EEXIST);
346 XFS_STATS_INC(xs_ig_dup);
347 error = EAGAIN;
348 goto out_preload_end;
349 }
350 spin_unlock(&pag->pag_ici_lock);
351 radix_tree_preload_end();
352
353 *ipp = ip;
354 return 0;
355
356out_preload_end:
357 spin_unlock(&pag->pag_ici_lock);
358 radix_tree_preload_end();
359 if (lock_flags)
360 xfs_iunlock(ip, lock_flags);
361out_destroy:
362 __destroy_inode(VFS_I(ip));
363 xfs_inode_free(ip);
364 return error;
365}
366
367/*
368 * Look up an inode by number in the given file system.
369 * The inode is looked up in the cache held in each AG.
370 * If the inode is found in the cache, initialise the vfs inode
371 * if necessary.
372 *
373 * If it is not in core, read it in from the file system's device,
374 * add it to the cache and initialise the vfs inode.
375 *
376 * The inode is locked according to the value of the lock_flags parameter.
377 * This flag parameter indicates how and if the inode's IO lock and inode lock
378 * should be taken.
379 *
380 * mp -- the mount point structure for the current file system. It points
381 * to the inode hash table.
382 * tp -- a pointer to the current transaction if there is one. This is
383 * simply passed through to the xfs_iread() call.
384 * ino -- the number of the inode desired. This is the unique identifier
385 * within the file system for the inode being requested.
386 * lock_flags -- flags indicating how to lock the inode. See the comment
387 * for xfs_ilock() for a list of valid values.
388 */
389int
390xfs_iget(
391 xfs_mount_t *mp,
392 xfs_trans_t *tp,
393 xfs_ino_t ino,
394 uint flags,
395 uint lock_flags,
396 xfs_inode_t **ipp)
397{
398 xfs_inode_t *ip;
399 int error;
400 xfs_perag_t *pag;
401 xfs_agino_t agino;
402
403 /*
404 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
405 * doesn't get freed while it's being referenced during a
406 * radix tree traversal here. It assumes this function
 407	 * acquires only the ILOCK (and therefore it has no need to
408 * involve the IOLOCK in this synchronization).
409 */
410 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
411
412 /* reject inode numbers outside existing AGs */
413 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
414 return EINVAL;
415
416 /* get the perag structure and ensure that it's inode capable */
417 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
418 agino = XFS_INO_TO_AGINO(mp, ino);
419
420again:
421 error = 0;
422 rcu_read_lock();
423 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
424
425 if (ip) {
426 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
427 if (error)
428 goto out_error_or_again;
429 } else {
430 rcu_read_unlock();
431 XFS_STATS_INC(xs_ig_missed);
432
433 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
434 flags, lock_flags);
435 if (error)
436 goto out_error_or_again;
437 }
438 xfs_perag_put(pag);
439
440 *ipp = ip;
441
442 /*
443 * If we have a real type for an on-disk inode, we can set ops(&unlock)
444 * now. If it's a new inode being created, xfs_ialloc will handle it.
445 */
446 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
447 xfs_setup_inode(ip);
448 return 0;
449
450out_error_or_again:
451 if (error == EAGAIN) {
452 delay(1);
453 goto again;
454 }
455 xfs_perag_put(pag);
456 return error;
457}
43 458
44/* 459/*
45 * The inode lookup is done in batches to keep the amount of lock traffic and 460 * The inode lookup is done in batches to keep the amount of lock traffic and
@@ -101,8 +516,11 @@ xfs_inode_ag_walk(
101 struct xfs_mount *mp, 516 struct xfs_mount *mp,
102 struct xfs_perag *pag, 517 struct xfs_perag *pag,
103 int (*execute)(struct xfs_inode *ip, 518 int (*execute)(struct xfs_inode *ip,
104 struct xfs_perag *pag, int flags), 519 struct xfs_perag *pag, int flags,
105 int flags) 520 void *args),
521 int flags,
522 void *args,
523 int tag)
106{ 524{
107 uint32_t first_index; 525 uint32_t first_index;
108 int last_error = 0; 526 int last_error = 0;
@@ -121,9 +539,17 @@ restart:
121 int i; 539 int i;
122 540
123 rcu_read_lock(); 541 rcu_read_lock();
124 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 542
543 if (tag == -1)
544 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
125 (void **)batch, first_index, 545 (void **)batch, first_index,
126 XFS_LOOKUP_BATCH); 546 XFS_LOOKUP_BATCH);
547 else
548 nr_found = radix_tree_gang_lookup_tag(
549 &pag->pag_ici_root,
550 (void **) batch, first_index,
551 XFS_LOOKUP_BATCH, tag);
552
127 if (!nr_found) { 553 if (!nr_found) {
128 rcu_read_unlock(); 554 rcu_read_unlock();
129 break; 555 break;
@@ -164,7 +590,7 @@ restart:
164 for (i = 0; i < nr_found; i++) { 590 for (i = 0; i < nr_found; i++) {
165 if (!batch[i]) 591 if (!batch[i])
166 continue; 592 continue;
167 error = execute(batch[i], pag, flags); 593 error = execute(batch[i], pag, flags, args);
168 IRELE(batch[i]); 594 IRELE(batch[i]);
169 if (error == EAGAIN) { 595 if (error == EAGAIN) {
170 skipped++; 596 skipped++;
@@ -189,12 +615,40 @@ restart:
189 return last_error; 615 return last_error;
190} 616}
191 617
618/*
619 * Background scanning to trim post-EOF preallocated space. This is queued
620 * based on the 'background_prealloc_discard_period' tunable (5m by default).
621 */
622STATIC void
623xfs_queue_eofblocks(
624 struct xfs_mount *mp)
625{
626 rcu_read_lock();
627 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
628 queue_delayed_work(mp->m_eofblocks_workqueue,
629 &mp->m_eofblocks_work,
630 msecs_to_jiffies(xfs_eofb_secs * 1000));
631 rcu_read_unlock();
632}
633
634void
635xfs_eofblocks_worker(
636 struct work_struct *work)
637{
638 struct xfs_mount *mp = container_of(to_delayed_work(work),
639 struct xfs_mount, m_eofblocks_work);
640 xfs_icache_free_eofblocks(mp, NULL);
641 xfs_queue_eofblocks(mp);
642}
643
192int 644int
193xfs_inode_ag_iterator( 645xfs_inode_ag_iterator(
194 struct xfs_mount *mp, 646 struct xfs_mount *mp,
195 int (*execute)(struct xfs_inode *ip, 647 int (*execute)(struct xfs_inode *ip,
196 struct xfs_perag *pag, int flags), 648 struct xfs_perag *pag, int flags,
197 int flags) 649 void *args),
650 int flags,
651 void *args)
198{ 652{
199 struct xfs_perag *pag; 653 struct xfs_perag *pag;
200 int error = 0; 654 int error = 0;
@@ -204,7 +658,7 @@ xfs_inode_ag_iterator(
204 ag = 0; 658 ag = 0;
205 while ((pag = xfs_perag_get(mp, ag))) { 659 while ((pag = xfs_perag_get(mp, ag))) {
206 ag = pag->pag_agno + 1; 660 ag = pag->pag_agno + 1;
207 error = xfs_inode_ag_walk(mp, pag, execute, flags); 661 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
208 xfs_perag_put(pag); 662 xfs_perag_put(pag);
209 if (error) { 663 if (error) {
210 last_error = error; 664 last_error = error;
@@ -215,224 +669,50 @@ xfs_inode_ag_iterator(
215 return XFS_ERROR(last_error); 669 return XFS_ERROR(last_error);
216} 670}
217 671
218STATIC int
219xfs_sync_inode_data(
220 struct xfs_inode *ip,
221 struct xfs_perag *pag,
222 int flags)
223{
224 struct inode *inode = VFS_I(ip);
225 struct address_space *mapping = inode->i_mapping;
226 int error = 0;
227
228 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
229 return 0;
230
231 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
232 if (flags & SYNC_TRYLOCK)
233 return 0;
234 xfs_ilock(ip, XFS_IOLOCK_SHARED);
235 }
236
237 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
238 0 : XBF_ASYNC, FI_NONE);
239 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
240 return error;
241}
242
243/*
244 * Write out pagecache data for the whole filesystem.
245 */
246STATIC int
247xfs_sync_data(
248 struct xfs_mount *mp,
249 int flags)
250{
251 int error;
252
253 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
254
255 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
256 if (error)
257 return XFS_ERROR(error);
258
259 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
260 return 0;
261}
262
263STATIC int
264xfs_sync_fsdata(
265 struct xfs_mount *mp)
266{
267 struct xfs_buf *bp;
268 int error;
269
270 /*
271 * If the buffer is pinned then push on the log so we won't get stuck
272 * waiting in the write for someone, maybe ourselves, to flush the log.
273 *
274 * Even though we just pushed the log above, we did not have the
275 * superblock buffer locked at that point so it can become pinned in
276 * between there and here.
277 */
278 bp = xfs_getsb(mp, 0);
279 if (xfs_buf_ispinned(bp))
280 xfs_log_force(mp, 0);
281 error = xfs_bwrite(bp);
282 xfs_buf_relse(bp);
283 return error;
284}
285
286/*
287 * When remounting a filesystem read-only or freezing the filesystem, we have
288 * two phases to execute. This first phase is syncing the data before we
289 * quiesce the filesystem, and the second is flushing all the inodes out after
290 * we've waited for all the transactions created by the first phase to
291 * complete. The second phase ensures that the inodes are written to their
292 * location on disk rather than just existing in transactions in the log. This
293 * means after a quiesce there is no log replay required to write the inodes to
294 * disk (this is the main difference between a sync and a quiesce).
295 */
296/*
297 * First stage of freeze - no writers will make progress now we are here,
298 * so we flush delwri and delalloc buffers here, then wait for all I/O to
299 * complete. Data is frozen at that point. Metadata is not frozen,
300 * transactions can still occur here so don't bother emptying the AIL
301 * because it'll just get dirty again.
302 */
303int 672int
304xfs_quiesce_data( 673xfs_inode_ag_iterator_tag(
305 struct xfs_mount *mp) 674 struct xfs_mount *mp,
306{ 675 int (*execute)(struct xfs_inode *ip,
307 int error, error2 = 0; 676 struct xfs_perag *pag, int flags,
308 677 void *args),
309 /* force out the log */ 678 int flags,
310 xfs_log_force(mp, XFS_LOG_SYNC); 679 void *args,
311 680 int tag)
312 /* write superblock and hoover up shutdown errors */
313 error = xfs_sync_fsdata(mp);
314
315 /* mark the log as covered if needed */
316 if (xfs_log_need_covered(mp))
317 error2 = xfs_fs_log_dummy(mp);
318
319 return error ? error : error2;
320}
321
322/*
323 * Second stage of a quiesce. The data is already synced, now we have to take
324 * care of the metadata. New transactions are already blocked, so we need to
325 * wait for any remaining transactions to drain out before proceeding.
326 */
327void
328xfs_quiesce_attr(
329 struct xfs_mount *mp)
330{
331 int error = 0;
332
333 /* wait for all modifications to complete */
334 while (atomic_read(&mp->m_active_trans) > 0)
335 delay(100);
336
337 /* reclaim inodes to do any IO before the freeze completes */
338 xfs_reclaim_inodes(mp, 0);
339 xfs_reclaim_inodes(mp, SYNC_WAIT);
340
341 /* flush all pending changes from the AIL */
342 xfs_ail_push_all_sync(mp->m_ail);
343
344 /*
345 * Just warn here till VFS can correctly support
346 * read-only remount without racing.
347 */
348 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
349
350 /* Push the superblock and write an unmount record */
351 error = xfs_log_sbcount(mp);
352 if (error)
353 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
354 "Frozen image may not be consistent.");
355 xfs_log_unmount_write(mp);
356
357 /*
358 * At this point we might have modified the superblock again and thus
359 * added an item to the AIL, thus flush it again.
360 */
361 xfs_ail_push_all_sync(mp->m_ail);
362
363 /*
364 * The superblock buffer is uncached and xfsaild_push() will lock and
365 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
366 * here but a lock on the superblock buffer will block until iodone()
367 * has completed.
368 */
369 xfs_buf_lock(mp->m_sb_bp);
370 xfs_buf_unlock(mp->m_sb_bp);
371}
372
373static void
374xfs_syncd_queue_sync(
375 struct xfs_mount *mp)
376{
377 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
378 msecs_to_jiffies(xfs_syncd_centisecs * 10));
379}
380
381/*
382 * Every sync period we need to unpin all items, reclaim inodes and sync
383 * disk quotas. We might need to cover the log to indicate that the
384 * filesystem is idle and not frozen.
385 */
386STATIC void
387xfs_sync_worker(
388 struct work_struct *work)
389{ 681{
390 struct xfs_mount *mp = container_of(to_delayed_work(work), 682 struct xfs_perag *pag;
391 struct xfs_mount, m_sync_work); 683 int error = 0;
392 int error; 684 int last_error = 0;
393 685 xfs_agnumber_t ag;
394 /*
395 * We shouldn't write/force the log if we are in the mount/unmount
396 * process or on a read only filesystem. The workqueue still needs to be
397 * active in both cases, however, because it is used for inode reclaim
398 * during these times. Use the MS_ACTIVE flag to avoid doing anything
399 * during mount. Doing work during unmount is avoided by calling
400 * cancel_delayed_work_sync on this work queue before tearing down
401 * the ail and the log in xfs_log_unmount.
402 */
403 if (!(mp->m_super->s_flags & MS_ACTIVE) &&
404 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
405 /* dgc: errors ignored here */
406 if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
407 xfs_log_need_covered(mp))
408 error = xfs_fs_log_dummy(mp);
409 else
410 xfs_log_force(mp, 0);
411 686
412 /* start pushing all the metadata that is currently 687 ag = 0;
413 * dirty */ 688 while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
414 xfs_ail_push_all(mp->m_ail); 689 ag = pag->pag_agno + 1;
690 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
691 xfs_perag_put(pag);
692 if (error) {
693 last_error = error;
694 if (error == EFSCORRUPTED)
695 break;
696 }
415 } 697 }
416 698 return XFS_ERROR(last_error);
417 /* queue us up again */
418 xfs_syncd_queue_sync(mp);
419} 699}
420 700
421/* 701/*
422 * Queue a new inode reclaim pass if there are reclaimable inodes and there 702 * Queue a new inode reclaim pass if there are reclaimable inodes and there
423 * isn't a reclaim pass already in progress. By default it runs every 5s based 703 * isn't a reclaim pass already in progress. By default it runs every 5s based
 424 * on the xfs syncd work default of 30s. Perhaps this should have its own 704 * on the xfs periodic sync default of 30s. Perhaps this should have its own
425 * tunable, but that can be done if this method proves to be ineffective or too 705 * tunable, but that can be done if this method proves to be ineffective or too
426 * aggressive. 706 * aggressive.
427 */ 707 */
428static void 708static void
429xfs_syncd_queue_reclaim( 709xfs_reclaim_work_queue(
430 struct xfs_mount *mp) 710 struct xfs_mount *mp)
431{ 711{
432 712
433 rcu_read_lock(); 713 rcu_read_lock();
434 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 714 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
435 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, 715 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
436 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 716 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
437 } 717 }
438 rcu_read_unlock(); 718 rcu_read_unlock();
@@ -445,7 +725,7 @@ xfs_syncd_queue_reclaim(
445 * goes low. It scans as quickly as possible avoiding locked inodes or those 725 * goes low. It scans as quickly as possible avoiding locked inodes or those
446 * already being flushed, and once done schedules a future pass. 726 * already being flushed, and once done schedules a future pass.
447 */ 727 */
448STATIC void 728void
449xfs_reclaim_worker( 729xfs_reclaim_worker(
450 struct work_struct *work) 730 struct work_struct *work)
451{ 731{
@@ -453,65 +733,10 @@ xfs_reclaim_worker(
453 struct xfs_mount, m_reclaim_work); 733 struct xfs_mount, m_reclaim_work);
454 734
455 xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 735 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
456 xfs_syncd_queue_reclaim(mp); 736 xfs_reclaim_work_queue(mp);
457} 737}
458 738
459/* 739static void
460 * Flush delayed allocate data, attempting to free up reserved space
461 * from existing allocations. At this point a new allocation attempt
462 * has failed with ENOSPC and we are in the process of scratching our
463 * heads, looking about for more room.
464 *
465 * Queue a new data flush if there isn't one already in progress and
466 * wait for completion of the flush. This means that we only ever have one
467 * inode flush in progress no matter how many ENOSPC events are occurring and
468 * so will prevent the system from bogging down due to every concurrent
469 * ENOSPC event scanning all the active inodes in the system for writeback.
470 */
471void
472xfs_flush_inodes(
473 struct xfs_inode *ip)
474{
475 struct xfs_mount *mp = ip->i_mount;
476
477 queue_work(xfs_syncd_wq, &mp->m_flush_work);
478 flush_work(&mp->m_flush_work);
479}
480
481STATIC void
482xfs_flush_worker(
483 struct work_struct *work)
484{
485 struct xfs_mount *mp = container_of(work,
486 struct xfs_mount, m_flush_work);
487
488 xfs_sync_data(mp, SYNC_TRYLOCK);
489 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
490}
491
492int
493xfs_syncd_init(
494 struct xfs_mount *mp)
495{
496 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
497 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
498 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
499
500 xfs_syncd_queue_sync(mp);
501
502 return 0;
503}
504
505void
506xfs_syncd_stop(
507 struct xfs_mount *mp)
508{
509 cancel_delayed_work_sync(&mp->m_sync_work);
510 cancel_delayed_work_sync(&mp->m_reclaim_work);
511 cancel_work_sync(&mp->m_flush_work);
512}
513
514void
515__xfs_inode_set_reclaim_tag( 740__xfs_inode_set_reclaim_tag(
516 struct xfs_perag *pag, 741 struct xfs_perag *pag,
517 struct xfs_inode *ip) 742 struct xfs_inode *ip)
@@ -529,7 +754,7 @@ __xfs_inode_set_reclaim_tag(
529 spin_unlock(&ip->i_mount->m_perag_lock); 754 spin_unlock(&ip->i_mount->m_perag_lock);
530 755
531 /* schedule periodic background inode reclaim */ 756 /* schedule periodic background inode reclaim */
532 xfs_syncd_queue_reclaim(ip->i_mount); 757 xfs_reclaim_work_queue(ip->i_mount);
533 758
534 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 759 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
535 -1, _RET_IP_); 760 -1, _RET_IP_);
@@ -577,7 +802,7 @@ __xfs_inode_clear_reclaim(
577 } 802 }
578} 803}
579 804
580void 805STATIC void
581__xfs_inode_clear_reclaim_tag( 806__xfs_inode_clear_reclaim_tag(
582 xfs_mount_t *mp, 807 xfs_mount_t *mp,
583 xfs_perag_t *pag, 808 xfs_perag_t *pag,
@@ -787,9 +1012,9 @@ out:
787 /* 1012 /*
788 * We could return EAGAIN here to make reclaim rescan the inode tree in 1013 * We could return EAGAIN here to make reclaim rescan the inode tree in
789 * a short while. However, this just burns CPU time scanning the tree 1014 * a short while. However, this just burns CPU time scanning the tree
790 * waiting for IO to complete and xfssyncd never goes back to the idle 1015 * waiting for IO to complete and the reclaim work never goes back to
791 * state. Instead, return 0 to let the next scheduled background reclaim 1016 * the idle state. Instead, return 0 to let the next scheduled
792 * attempt to reclaim the inode again. 1017 * background reclaim attempt to reclaim the inode again.
793 */ 1018 */
794 return 0; 1019 return 0;
795} 1020}
@@ -800,7 +1025,7 @@ out:
 800 * then a shutdown during a filesystem unmount reclaim walk would leak all the 1025 * then a shutdown during a filesystem unmount reclaim walk would leak all the
 801 * unreclaimed inodes. 1026 * unreclaimed inodes.
802 */ 1027 */
803int 1028STATIC int
804xfs_reclaim_inodes_ag( 1029xfs_reclaim_inodes_ag(
805 struct xfs_mount *mp, 1030 struct xfs_mount *mp,
806 int flags, 1031 int flags,
@@ -945,7 +1170,7 @@ xfs_reclaim_inodes_nr(
945 int nr_to_scan) 1170 int nr_to_scan)
946{ 1171{
947 /* kick background reclaimer and push the AIL */ 1172 /* kick background reclaimer and push the AIL */
948 xfs_syncd_queue_reclaim(mp); 1173 xfs_reclaim_work_queue(mp);
949 xfs_ail_push_all(mp->m_ail); 1174 xfs_ail_push_all(mp->m_ail);
950 1175
951 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 1176 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
@@ -971,3 +1196,146 @@ xfs_reclaim_inodes_count(
971 return reclaimable; 1196 return reclaimable;
972} 1197}
973 1198
1199STATIC int
1200xfs_inode_match_id(
1201 struct xfs_inode *ip,
1202 struct xfs_eofblocks *eofb)
1203{
1204 if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
1205 ip->i_d.di_uid != eofb->eof_uid)
1206 return 0;
1207
1208 if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
1209 ip->i_d.di_gid != eofb->eof_gid)
1210 return 0;
1211
1212 if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
1213 xfs_get_projid(ip) != eofb->eof_prid)
1214 return 0;
1215
1216 return 1;
1217}
1218
1219STATIC int
1220xfs_inode_free_eofblocks(
1221 struct xfs_inode *ip,
1222 struct xfs_perag *pag,
1223 int flags,
1224 void *args)
1225{
1226 int ret;
1227 struct xfs_eofblocks *eofb = args;
1228
1229 if (!xfs_can_free_eofblocks(ip, false)) {
1230 /* inode could be preallocated or append-only */
1231 trace_xfs_inode_free_eofblocks_invalid(ip);
1232 xfs_inode_clear_eofblocks_tag(ip);
1233 return 0;
1234 }
1235
1236 /*
1237 * If the mapping is dirty the operation can block and wait for some
1238 * time. Unless we are waiting, skip it.
1239 */
1240 if (!(flags & SYNC_WAIT) &&
1241 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1242 return 0;
1243
1244 if (eofb) {
1245 if (!xfs_inode_match_id(ip, eofb))
1246 return 0;
1247
1248 /* skip the inode if the file size is too small */
1249 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1250 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1251 return 0;
1252 }
1253
1254 ret = xfs_free_eofblocks(ip->i_mount, ip, true);
1255
1256 /* don't revisit the inode if we're not waiting */
1257 if (ret == EAGAIN && !(flags & SYNC_WAIT))
1258 ret = 0;
1259
1260 return ret;
1261}
1262
1263int
1264xfs_icache_free_eofblocks(
1265 struct xfs_mount *mp,
1266 struct xfs_eofblocks *eofb)
1267{
1268 int flags = SYNC_TRYLOCK;
1269
1270 if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
1271 flags = SYNC_WAIT;
1272
1273 return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
1274 eofb, XFS_ICI_EOFBLOCKS_TAG);
1275}
1276
1277void
1278xfs_inode_set_eofblocks_tag(
1279 xfs_inode_t *ip)
1280{
1281 struct xfs_mount *mp = ip->i_mount;
1282 struct xfs_perag *pag;
1283 int tagged;
1284
1285 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1286 spin_lock(&pag->pag_ici_lock);
1287 trace_xfs_inode_set_eofblocks_tag(ip);
1288
1289 tagged = radix_tree_tagged(&pag->pag_ici_root,
1290 XFS_ICI_EOFBLOCKS_TAG);
1291 radix_tree_tag_set(&pag->pag_ici_root,
1292 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1293 XFS_ICI_EOFBLOCKS_TAG);
1294 if (!tagged) {
1295 /* propagate the eofblocks tag up into the perag radix tree */
1296 spin_lock(&ip->i_mount->m_perag_lock);
1297 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
1298 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1299 XFS_ICI_EOFBLOCKS_TAG);
1300 spin_unlock(&ip->i_mount->m_perag_lock);
1301
1302 /* kick off background trimming */
1303 xfs_queue_eofblocks(ip->i_mount);
1304
1305 trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
1306 -1, _RET_IP_);
1307 }
1308
1309 spin_unlock(&pag->pag_ici_lock);
1310 xfs_perag_put(pag);
1311}
1312
1313void
1314xfs_inode_clear_eofblocks_tag(
1315 xfs_inode_t *ip)
1316{
1317 struct xfs_mount *mp = ip->i_mount;
1318 struct xfs_perag *pag;
1319
1320 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1321 spin_lock(&pag->pag_ici_lock);
1322 trace_xfs_inode_clear_eofblocks_tag(ip);
1323
1324 radix_tree_tag_clear(&pag->pag_ici_root,
1325 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1326 XFS_ICI_EOFBLOCKS_TAG);
1327 if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
1328 /* clear the eofblocks tag from the perag radix tree */
1329 spin_lock(&ip->i_mount->m_perag_lock);
1330 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
1331 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1332 XFS_ICI_EOFBLOCKS_TAG);
1333 spin_unlock(&ip->i_mount->m_perag_lock);
1334 trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
1335 -1, _RET_IP_);
1336 }
1337
1338 spin_unlock(&pag->pag_ici_lock);
1339 xfs_perag_put(pag);
1340}
1341
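
A recurring change in the xfs_icache.c diff above is the extra void *args parameter threaded from xfs_inode_ag_iterator_tag() through xfs_inode_ag_walk() into each execute callback; this is how xfs_icache_free_eofblocks() passes its struct xfs_eofblocks filter down to xfs_inode_free_eofblocks(). A toy model of that callback-with-context shape follows (demo_* names are illustrative only, and the per-AG structure is collapsed into a flat array):

#include <stdio.h>

struct demo_inode { unsigned uid; unsigned blocks; };
struct demo_filter { unsigned uid; };		/* stands in for xfs_eofblocks */

static int demo_free_eofblocks(struct demo_inode *ip, int flags, void *args)
{
	struct demo_filter *f = args;

	if (f && ip->uid != f->uid)
		return 0;			/* filtered out, not an error */
	ip->blocks = 0;				/* "trim" the inode */
	return 0;
}

/* generic walker: knows nothing about the filter, just forwards args */
static int demo_ag_walk(struct demo_inode *inodes, int n,
			int (*execute)(struct demo_inode *, int, void *),
			int flags, void *args)
{
	int i, error, last_error = 0;

	for (i = 0; i < n; i++) {
		error = execute(&inodes[i], flags, args);
		if (error)
			last_error = error;
	}
	return last_error;
}

int main(void)
{
	struct demo_inode inodes[] = { { 100, 8 }, { 200, 4 } };
	struct demo_filter f = { .uid = 100 };

	demo_ag_walk(inodes, 2, demo_free_eofblocks, 0, &f);
	printf("inode0 blocks=%u inode1 blocks=%u\n",
	       inodes[0].blocks, inodes[1].blocks);
	return 0;
}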
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h
index 941202e7ac6e..e0f138c70a2f 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_icache.h
@@ -24,28 +24,30 @@ struct xfs_perag;
24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
26 26
27extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ 27int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
28 uint flags, uint lock_flags, xfs_inode_t **ipp);
28 29
29int xfs_syncd_init(struct xfs_mount *mp); 30void xfs_reclaim_worker(struct work_struct *work);
30void xfs_syncd_stop(struct xfs_mount *mp);
31
32int xfs_quiesce_data(struct xfs_mount *mp);
33void xfs_quiesce_attr(struct xfs_mount *mp);
34
35void xfs_flush_inodes(struct xfs_inode *ip);
36 31
37int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 32int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38int xfs_reclaim_inodes_count(struct xfs_mount *mp); 33int xfs_reclaim_inodes_count(struct xfs_mount *mp);
39void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 34void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
40 35
41void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 36void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
42void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); 37
43void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 38void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
44 struct xfs_inode *ip); 39void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
41void xfs_eofblocks_worker(struct work_struct *);
45 42
46int xfs_sync_inode_grab(struct xfs_inode *ip); 43int xfs_sync_inode_grab(struct xfs_inode *ip);
47int xfs_inode_ag_iterator(struct xfs_mount *mp, 44int xfs_inode_ag_iterator(struct xfs_mount *mp,
48 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 45 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
49 int flags); 46 int flags, void *args),
47 int flags, void *args);
48int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
49 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
50 int flags, void *args),
51 int flags, void *args, int tag);
50 52
51#endif 53#endif
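
xfs_inode_set_eofblocks_tag()/xfs_inode_clear_eofblocks_tag() in the diff above keep two radix trees in sync: tagging the first inode in an AG propagates XFS_ICI_EOFBLOCKS_TAG up into the per-mount perag tree, and clearing the last one removes it, so xfs_queue_eofblocks() can test a single radix_tree_tagged() call instead of scanning every AG. A sketch of that summary-tag idea, with plain arrays standing in for the radix trees (sizes and names invented):

#include <stdio.h>

#define DEMO_AGS		4
#define DEMO_INODES_PER_AG	8

static unsigned char ag_tag[DEMO_AGS][DEMO_INODES_PER_AG];	/* per-AG tree */
static unsigned char mount_tag[DEMO_AGS];			/* per-mount tree */

static void demo_set_eofblocks_tag(int ag, int ino)
{
	int i, tagged = 0;

	for (i = 0; i < DEMO_INODES_PER_AG; i++)
		tagged |= ag_tag[ag][i];
	ag_tag[ag][ino] = 1;
	if (!tagged)
		mount_tag[ag] = 1;	/* propagate: first tagged inode in AG */
}

static void demo_clear_eofblocks_tag(int ag, int ino)
{
	int i, tagged = 0;

	ag_tag[ag][ino] = 0;
	for (i = 0; i < DEMO_INODES_PER_AG; i++)
		tagged |= ag_tag[ag][i];
	if (!tagged)
		mount_tag[ag] = 0;	/* last tag in the AG went away */
}

/* cheap test the background worker would use before queueing itself */
static int demo_mount_tagged(void)
{
	int ag;

	for (ag = 0; ag < DEMO_AGS; ag++)
		if (mount_tag[ag])
			return 1;
	return 0;
}

int main(void)
{
	demo_set_eofblocks_tag(2, 5);
	printf("queue worker? %s\n", demo_mount_tagged() ? "yes" : "no");
	demo_clear_eofblocks_tag(2, 5);
	printf("queue worker? %s\n", demo_mount_tagged() ? "yes" : "no");
	return 0;
}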
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index 784a803383ec..000000000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,705 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_acl.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_alloc_btree.h"
30#include "xfs_ialloc_btree.h"
31#include "xfs_dinode.h"
32#include "xfs_inode.h"
33#include "xfs_btree.h"
34#include "xfs_ialloc.h"
35#include "xfs_quota.h"
36#include "xfs_utils.h"
37#include "xfs_trans_priv.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
40#include "xfs_trace.h"
41
42
43/*
44 * Allocate and initialise an xfs_inode.
45 */
46STATIC struct xfs_inode *
47xfs_inode_alloc(
48 struct xfs_mount *mp,
49 xfs_ino_t ino)
50{
51 struct xfs_inode *ip;
52
53 /*
54 * if this didn't occur in transactions, we could use
55 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
56 * code up to do this anyway.
57 */
58 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
59 if (!ip)
60 return NULL;
61 if (inode_init_always(mp->m_super, VFS_I(ip))) {
62 kmem_zone_free(xfs_inode_zone, ip);
63 return NULL;
64 }
65
66 ASSERT(atomic_read(&ip->i_pincount) == 0);
67 ASSERT(!spin_is_locked(&ip->i_flags_lock));
68 ASSERT(!xfs_isiflocked(ip));
69 ASSERT(ip->i_ino == 0);
70
71 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
72
73 /* initialise the xfs inode */
74 ip->i_ino = ino;
75 ip->i_mount = mp;
76 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
77 ip->i_afp = NULL;
78 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
79 ip->i_flags = 0;
80 ip->i_delayed_blks = 0;
81 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
82
83 return ip;
84}
85
86STATIC void
87xfs_inode_free_callback(
88 struct rcu_head *head)
89{
90 struct inode *inode = container_of(head, struct inode, i_rcu);
91 struct xfs_inode *ip = XFS_I(inode);
92
93 kmem_zone_free(xfs_inode_zone, ip);
94}
95
96void
97xfs_inode_free(
98 struct xfs_inode *ip)
99{
100 switch (ip->i_d.di_mode & S_IFMT) {
101 case S_IFREG:
102 case S_IFDIR:
103 case S_IFLNK:
104 xfs_idestroy_fork(ip, XFS_DATA_FORK);
105 break;
106 }
107
108 if (ip->i_afp)
109 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
110
111 if (ip->i_itemp) {
112 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
113 xfs_inode_item_destroy(ip);
114 ip->i_itemp = NULL;
115 }
116
117 /* asserts to verify all state is correct here */
118 ASSERT(atomic_read(&ip->i_pincount) == 0);
119 ASSERT(!spin_is_locked(&ip->i_flags_lock));
120 ASSERT(!xfs_isiflocked(ip));
121
122 /*
123 * Because we use RCU freeing we need to ensure the inode always
124 * appears to be reclaimed with an invalid inode number when in the
125 * free state. The ip->i_flags_lock provides the barrier against lookup
126 * races.
127 */
128 spin_lock(&ip->i_flags_lock);
129 ip->i_flags = XFS_IRECLAIM;
130 ip->i_ino = 0;
131 spin_unlock(&ip->i_flags_lock);
132
133 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
134}
135
136/*
 137 * Check the validity of the inode we just found in the cache
138 */
139static int
140xfs_iget_cache_hit(
141 struct xfs_perag *pag,
142 struct xfs_inode *ip,
143 xfs_ino_t ino,
144 int flags,
145 int lock_flags) __releases(RCU)
146{
147 struct inode *inode = VFS_I(ip);
148 struct xfs_mount *mp = ip->i_mount;
149 int error;
150
151 /*
152 * check for re-use of an inode within an RCU grace period due to the
153 * radix tree nodes not being updated yet. We monitor for this by
154 * setting the inode number to zero before freeing the inode structure.
155 * If the inode has been reallocated and set up, then the inode number
156 * will not match, so check for that, too.
157 */
158 spin_lock(&ip->i_flags_lock);
159 if (ip->i_ino != ino) {
160 trace_xfs_iget_skip(ip);
161 XFS_STATS_INC(xs_ig_frecycle);
162 error = EAGAIN;
163 goto out_error;
164 }
165
166
167 /*
168 * If we are racing with another cache hit that is currently
169 * instantiating this inode or currently recycling it out of
 170	 * reclaimable state, wait for the initialisation to complete
171 * before continuing.
172 *
173 * XXX(hch): eventually we should do something equivalent to
174 * wait_on_inode to wait for these flags to be cleared
175 * instead of polling for it.
176 */
177 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
178 trace_xfs_iget_skip(ip);
179 XFS_STATS_INC(xs_ig_frecycle);
180 error = EAGAIN;
181 goto out_error;
182 }
183
184 /*
185 * If lookup is racing with unlink return an error immediately.
186 */
187 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
188 error = ENOENT;
189 goto out_error;
190 }
191
192 /*
193 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
 194	 * Need to carefully get it back into a usable state.
195 */
196 if (ip->i_flags & XFS_IRECLAIMABLE) {
197 trace_xfs_iget_reclaim(ip);
198
199 /*
200 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
201 * from stomping over us while we recycle the inode. We can't
202 * clear the radix tree reclaimable tag yet as it requires
203 * pag_ici_lock to be held exclusive.
204 */
205 ip->i_flags |= XFS_IRECLAIM;
206
207 spin_unlock(&ip->i_flags_lock);
208 rcu_read_unlock();
209
210 error = -inode_init_always(mp->m_super, inode);
211 if (error) {
212 /*
213 * Re-initializing the inode failed, and we are in deep
214 * trouble. Try to re-add it to the reclaim list.
215 */
216 rcu_read_lock();
217 spin_lock(&ip->i_flags_lock);
218
219 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
220 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
221 trace_xfs_iget_reclaim_fail(ip);
222 goto out_error;
223 }
224
225 spin_lock(&pag->pag_ici_lock);
226 spin_lock(&ip->i_flags_lock);
227
228 /*
229 * Clear the per-lifetime state in the inode as we are now
230 * effectively a new inode and need to return to the initial
231 * state before reuse occurs.
232 */
233 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
234 ip->i_flags |= XFS_INEW;
235 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
236 inode->i_state = I_NEW;
237
238 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
239 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
240
241 spin_unlock(&ip->i_flags_lock);
242 spin_unlock(&pag->pag_ici_lock);
243 } else {
244 /* If the VFS inode is being torn down, pause and try again. */
245 if (!igrab(inode)) {
246 trace_xfs_iget_skip(ip);
247 error = EAGAIN;
248 goto out_error;
249 }
250
251 /* We've got a live one. */
252 spin_unlock(&ip->i_flags_lock);
253 rcu_read_unlock();
254 trace_xfs_iget_hit(ip);
255 }
256
257 if (lock_flags != 0)
258 xfs_ilock(ip, lock_flags);
259
260 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
261 XFS_STATS_INC(xs_ig_found);
262
263 return 0;
264
265out_error:
266 spin_unlock(&ip->i_flags_lock);
267 rcu_read_unlock();
268 return error;
269}
270
271
272static int
273xfs_iget_cache_miss(
274 struct xfs_mount *mp,
275 struct xfs_perag *pag,
276 xfs_trans_t *tp,
277 xfs_ino_t ino,
278 struct xfs_inode **ipp,
279 int flags,
280 int lock_flags)
281{
282 struct xfs_inode *ip;
283 int error;
284 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
285 int iflags;
286
287 ip = xfs_inode_alloc(mp, ino);
288 if (!ip)
289 return ENOMEM;
290
291 error = xfs_iread(mp, tp, ip, flags);
292 if (error)
293 goto out_destroy;
294
295 trace_xfs_iget_miss(ip);
296
297 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
298 error = ENOENT;
299 goto out_destroy;
300 }
301
302 /*
303 * Preload the radix tree so we can insert safely under the
304 * write spinlock. Note that we cannot sleep inside the preload
305 * region. Since we can be called from transaction context, don't
306 * recurse into the file system.
307 */
308 if (radix_tree_preload(GFP_NOFS)) {
309 error = EAGAIN;
310 goto out_destroy;
311 }
312
313 /*
314 * Because the inode hasn't been added to the radix-tree yet it can't
315 * be found by another thread, so we can do the non-sleeping lock here.
316 */
317 if (lock_flags) {
318 if (!xfs_ilock_nowait(ip, lock_flags))
319 BUG();
320 }
321
322 /*
323 * These values must be set before inserting the inode into the radix
324 * tree as the moment it is inserted a concurrent lookup (allowed by the
325 * RCU locking mechanism) can find it and that lookup must see that this
326 * is an inode currently under construction (i.e. that XFS_INEW is set).
327 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
328 * memory barrier that ensures this detection works correctly at lookup
329 * time.
330 */
331 iflags = XFS_INEW;
332 if (flags & XFS_IGET_DONTCACHE)
333 iflags |= XFS_IDONTCACHE;
334 ip->i_udquot = ip->i_gdquot = NULL;
335 xfs_iflags_set(ip, iflags);
336
337 /* insert the new inode */
338 spin_lock(&pag->pag_ici_lock);
339 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
340 if (unlikely(error)) {
341 WARN_ON(error != -EEXIST);
342 XFS_STATS_INC(xs_ig_dup);
343 error = EAGAIN;
344 goto out_preload_end;
345 }
346 spin_unlock(&pag->pag_ici_lock);
347 radix_tree_preload_end();
348
349 *ipp = ip;
350 return 0;
351
352out_preload_end:
353 spin_unlock(&pag->pag_ici_lock);
354 radix_tree_preload_end();
355 if (lock_flags)
356 xfs_iunlock(ip, lock_flags);
357out_destroy:
358 __destroy_inode(VFS_I(ip));
359 xfs_inode_free(ip);
360 return error;
361}
362
363/*
364 * Look up an inode by number in the given file system.
365 * The inode is looked up in the cache held in each AG.
366 * If the inode is found in the cache, initialise the vfs inode
367 * if necessary.
368 *
369 * If it is not in core, read it in from the file system's device,
370 * add it to the cache and initialise the vfs inode.
371 *
372 * The inode is locked according to the value of the lock_flags parameter.
373 * This flag parameter indicates how and if the inode's IO lock and inode lock
374 * should be taken.
375 *
376 * mp -- the mount point structure for the current file system. It points
377 * to the inode hash table.
378 * tp -- a pointer to the current transaction if there is one. This is
379 * simply passed through to the xfs_iread() call.
380 * ino -- the number of the inode desired. This is the unique identifier
381 * within the file system for the inode being requested.
382 * lock_flags -- flags indicating how to lock the inode. See the comment
383 * for xfs_ilock() for a list of valid values.
384 */
385int
386xfs_iget(
387 xfs_mount_t *mp,
388 xfs_trans_t *tp,
389 xfs_ino_t ino,
390 uint flags,
391 uint lock_flags,
392 xfs_inode_t **ipp)
393{
394 xfs_inode_t *ip;
395 int error;
396 xfs_perag_t *pag;
397 xfs_agino_t agino;
398
399 /*
400 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
401 * doesn't get freed while it's being referenced during a
402 * radix tree traversal here. It assumes this function
 403	 * acquires only the ILOCK (and therefore it has no need to
404 * involve the IOLOCK in this synchronization).
405 */
406 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
407
408 /* reject inode numbers outside existing AGs */
409 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
410 return EINVAL;
411
412 /* get the perag structure and ensure that it's inode capable */
413 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
414 agino = XFS_INO_TO_AGINO(mp, ino);
415
416again:
417 error = 0;
418 rcu_read_lock();
419 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
420
421 if (ip) {
422 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
423 if (error)
424 goto out_error_or_again;
425 } else {
426 rcu_read_unlock();
427 XFS_STATS_INC(xs_ig_missed);
428
429 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
430 flags, lock_flags);
431 if (error)
432 goto out_error_or_again;
433 }
434 xfs_perag_put(pag);
435
436 *ipp = ip;
437
438 /*
439 * If we have a real type for an on-disk inode, we can set ops(&unlock)
440 * now. If it's a new inode being created, xfs_ialloc will handle it.
441 */
442 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
443 xfs_setup_inode(ip);
444 return 0;
445
446out_error_or_again:
447 if (error == EAGAIN) {
448 delay(1);
449 goto again;
450 }
451 xfs_perag_put(pag);
452 return error;
453}
454
455/*
456 * This is a wrapper routine around the xfs_ilock() routine
457 * used to centralize some grungy code. It is used in places
458 * that wish to lock the inode solely for reading the extents.
459 * The reason these places can't just call xfs_ilock(SHARED)
 460 * is that the inode lock also guards the bringing in of the
461 * extents from disk for a file in b-tree format. If the inode
462 * is in b-tree format, then we need to lock the inode exclusively
463 * until the extents are read in. Locking it exclusively all
464 * the time would limit our parallelism unnecessarily, though.
465 * What we do instead is check to see if the extents have been
466 * read in yet, and only lock the inode exclusively if they
467 * have not.
468 *
469 * The function returns a value which should be given to the
470 * corresponding xfs_iunlock_map_shared(). This value is
471 * the mode in which the lock was actually taken.
472 */
473uint
474xfs_ilock_map_shared(
475 xfs_inode_t *ip)
476{
477 uint lock_mode;
478
479 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
480 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
481 lock_mode = XFS_ILOCK_EXCL;
482 } else {
483 lock_mode = XFS_ILOCK_SHARED;
484 }
485
486 xfs_ilock(ip, lock_mode);
487
488 return lock_mode;
489}
490
491/*
492 * This is simply the unlock routine to go with xfs_ilock_map_shared().
493 * All it does is call xfs_iunlock() with the given lock_mode.
494 */
495void
496xfs_iunlock_map_shared(
497 xfs_inode_t *ip,
498 unsigned int lock_mode)
499{
500 xfs_iunlock(ip, lock_mode);
501}
502
503/*
504 * The xfs inode contains 2 locks: a multi-reader lock called the
505 * i_iolock and a multi-reader lock called the i_lock. This routine
506 * allows either or both of the locks to be obtained.
507 *
508 * The 2 locks should always be ordered so that the IO lock is
509 * obtained first in order to prevent deadlock.
510 *
511 * ip -- the inode being locked
512 * lock_flags -- this parameter indicates the inode's locks
513 * to be locked. It can be:
514 * XFS_IOLOCK_SHARED,
515 * XFS_IOLOCK_EXCL,
516 * XFS_ILOCK_SHARED,
517 * XFS_ILOCK_EXCL,
518 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
519 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
520 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
521 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
522 */
523void
524xfs_ilock(
525 xfs_inode_t *ip,
526 uint lock_flags)
527{
528 /*
529 * You can't set both SHARED and EXCL for the same lock,
530 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
531 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
532 */
533 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
534 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
535 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
536 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
537 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
538
539 if (lock_flags & XFS_IOLOCK_EXCL)
540 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
541 else if (lock_flags & XFS_IOLOCK_SHARED)
542 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
543
544 if (lock_flags & XFS_ILOCK_EXCL)
545 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
546 else if (lock_flags & XFS_ILOCK_SHARED)
547 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
548
549 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
550}
551
552/*
553 * This is just like xfs_ilock(), except that the caller
554 * is guaranteed not to sleep. It returns 1 if it gets
555 * the requested locks and 0 otherwise. If the IO lock is
556 * obtained but the inode lock cannot be, then the IO lock
557 * is dropped before returning.
558 *
559 * ip -- the inode being locked
 560 *	lock_flags -- this parameter indicates the inode's locks to be
 561 *	 locked. See the comment for xfs_ilock() for a list
562 * of valid values.
563 */
564int
565xfs_ilock_nowait(
566 xfs_inode_t *ip,
567 uint lock_flags)
568{
569 /*
570 * You can't set both SHARED and EXCL for the same lock,
571 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
572 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
573 */
574 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
575 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
576 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
577 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
578 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
579
580 if (lock_flags & XFS_IOLOCK_EXCL) {
581 if (!mrtryupdate(&ip->i_iolock))
582 goto out;
583 } else if (lock_flags & XFS_IOLOCK_SHARED) {
584 if (!mrtryaccess(&ip->i_iolock))
585 goto out;
586 }
587 if (lock_flags & XFS_ILOCK_EXCL) {
588 if (!mrtryupdate(&ip->i_lock))
589 goto out_undo_iolock;
590 } else if (lock_flags & XFS_ILOCK_SHARED) {
591 if (!mrtryaccess(&ip->i_lock))
592 goto out_undo_iolock;
593 }
594 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
595 return 1;
596
597 out_undo_iolock:
598 if (lock_flags & XFS_IOLOCK_EXCL)
599 mrunlock_excl(&ip->i_iolock);
600 else if (lock_flags & XFS_IOLOCK_SHARED)
601 mrunlock_shared(&ip->i_iolock);
602 out:
603 return 0;
604}
605
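/*
 * A fragment-only sketch of the usual caller pattern for the nowait
 * variant: try for the locks from a context that must not sleep and
 * back off when they cannot be had. The positive EAGAIN follows the
 * internal XFS error convention; the work is a placeholder.
 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED))
		return EAGAIN;	/* caller retries later */
	/* ... non-blocking work under both shared locks ... */
	xfs_iunlock(ip, XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED);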
606/*
607 * xfs_iunlock() is used to drop the inode locks acquired with
608 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
609 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
610 * that we know which locks to drop.
611 *
612 * ip -- the inode being unlocked
 613 * lock_flags -- this parameter indicates the inode's locks to be
 614 * unlocked. See the comment for xfs_ilock() for a list
615 * of valid values for this parameter.
616 *
617 */
618void
619xfs_iunlock(
620 xfs_inode_t *ip,
621 uint lock_flags)
622{
623 /*
624 * You can't set both SHARED and EXCL for the same lock,
625 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
626 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
627 */
628 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
629 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
630 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
631 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
632 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
633 ASSERT(lock_flags != 0);
634
635 if (lock_flags & XFS_IOLOCK_EXCL)
636 mrunlock_excl(&ip->i_iolock);
637 else if (lock_flags & XFS_IOLOCK_SHARED)
638 mrunlock_shared(&ip->i_iolock);
639
640 if (lock_flags & XFS_ILOCK_EXCL)
641 mrunlock_excl(&ip->i_lock);
642 else if (lock_flags & XFS_ILOCK_SHARED)
643 mrunlock_shared(&ip->i_lock);
644
645 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
646}
647
648/*
 649 * Give up write locks. The I/O lock cannot be held nested
650 * if it is being demoted.
651 */
652void
653xfs_ilock_demote(
654 xfs_inode_t *ip,
655 uint lock_flags)
656{
657 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
658 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
659
660 if (lock_flags & XFS_ILOCK_EXCL)
661 mrdemote(&ip->i_lock);
662 if (lock_flags & XFS_IOLOCK_EXCL)
663 mrdemote(&ip->i_iolock);
664
665 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
666}
667
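/*
 * A fragment-only sketch of the demotion pattern: take the inode lock
 * exclusively for a short setup phase, then demote to shared so that
 * concurrent readers can proceed during the longer read-only phase.
 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	/* ... exclusive setup, e.g. reading extents into memory ... */
	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
	/* ... continue under what is now XFS_ILOCK_SHARED ... */
	xfs_iunlock(ip, XFS_ILOCK_SHARED);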
668#ifdef DEBUG
669int
670xfs_isilocked(
671 xfs_inode_t *ip,
672 uint lock_flags)
673{
674 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
675 if (!(lock_flags & XFS_ILOCK_SHARED))
676 return !!ip->i_lock.mr_writer;
677 return rwsem_is_locked(&ip->i_lock.mr_lock);
678 }
679
680 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
681 if (!(lock_flags & XFS_IOLOCK_SHARED))
682 return !!ip->i_iolock.mr_writer;
683 return rwsem_is_locked(&ip->i_iolock.mr_lock);
684 }
685
686 ASSERT(0);
687 return 0;
688}
689#endif
690
691void
692__xfs_iflock(
693 struct xfs_inode *ip)
694{
695 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
696 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
697
698 do {
699 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
700 if (xfs_isiflocked(ip))
701 io_schedule();
702 } while (!xfs_iflock_nowait(ip));
703
704 finish_wait(wq, &wait.wait);
705}
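/*
 * The loop above sleeps on __XFS_IFLOCK_BIT until the flush lock is
 * released. For context, the matching wake-up side lives in
 * xfs_inode.h and looks roughly like the sketch below (reproduced from
 * memory, not part of this patch): clear the flag bit, then wake any
 * waiters queued on the same bit waitqueue.
 */
static inline void xfs_ifunlock(struct xfs_inode *ip)
{
	xfs_iflags_clear(ip, XFS_IFLOCK);
	wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
}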
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2778258fcfa2..66282dcb821b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
45#include "xfs_filestream.h" 45#include "xfs_filestream.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_trace.h" 47#include "xfs_trace.h"
48#include "xfs_icache.h"
48 49
49kmem_zone_t *xfs_ifork_zone; 50kmem_zone_t *xfs_ifork_zone;
50kmem_zone_t *xfs_inode_zone; 51kmem_zone_t *xfs_inode_zone;
@@ -74,6 +75,256 @@ xfs_get_extsz_hint(
74 return 0; 75 return 0;
75} 76}
76 77
78/*
79 * This is a wrapper routine around the xfs_ilock() routine used to centralize
80 * some grungy code. It is used in places that wish to lock the inode solely
81 * for reading the extents. The reason these places can't just call
 82 * xfs_ilock(SHARED) is that the inode lock also guards the reading in of the
83 * extents from disk for a file in b-tree format. If the inode is in b-tree
84 * format, then we need to lock the inode exclusively until the extents are read
85 * in. Locking it exclusively all the time would limit our parallelism
86 * unnecessarily, though. What we do instead is check to see if the extents
87 * have been read in yet, and only lock the inode exclusively if they have not.
88 *
89 * The function returns a value which should be given to the corresponding
90 * xfs_iunlock_map_shared(). This value is the mode in which the lock was
91 * actually taken.
92 */
93uint
94xfs_ilock_map_shared(
95 xfs_inode_t *ip)
96{
97 uint lock_mode;
98
99 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
100 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
101 lock_mode = XFS_ILOCK_EXCL;
102 } else {
103 lock_mode = XFS_ILOCK_SHARED;
104 }
105
106 xfs_ilock(ip, lock_mode);
107
108 return lock_mode;
109}
110
111/*
112 * This is simply the unlock routine to go with xfs_ilock_map_shared().
113 * All it does is call xfs_iunlock() with the given lock_mode.
114 */
115void
116xfs_iunlock_map_shared(
117 xfs_inode_t *ip,
118 unsigned int lock_mode)
119{
120 xfs_iunlock(ip, lock_mode);
121}
122
123/*
124 * The xfs inode contains 2 locks: a multi-reader lock called the
125 * i_iolock and a multi-reader lock called the i_lock. This routine
126 * allows either or both of the locks to be obtained.
127 *
128 * The 2 locks should always be ordered so that the IO lock is
129 * obtained first in order to prevent deadlock.
130 *
131 * ip -- the inode being locked
132 * lock_flags -- this parameter indicates the inode's locks
133 * to be locked. It can be:
134 * XFS_IOLOCK_SHARED,
135 * XFS_IOLOCK_EXCL,
136 * XFS_ILOCK_SHARED,
137 * XFS_ILOCK_EXCL,
138 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
139 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
140 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
141 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
142 */
143void
144xfs_ilock(
145 xfs_inode_t *ip,
146 uint lock_flags)
147{
148 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
149
150 /*
151 * You can't set both SHARED and EXCL for the same lock,
152 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
153 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
154 */
155 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
156 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
157 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
158 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
159 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
160
161 if (lock_flags & XFS_IOLOCK_EXCL)
162 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
163 else if (lock_flags & XFS_IOLOCK_SHARED)
164 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
165
166 if (lock_flags & XFS_ILOCK_EXCL)
167 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
168 else if (lock_flags & XFS_ILOCK_SHARED)
169 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
170}
171
172/*
173 * This is just like xfs_ilock(), except that the caller
174 * is guaranteed not to sleep. It returns 1 if it gets
175 * the requested locks and 0 otherwise. If the IO lock is
176 * obtained but the inode lock cannot be, then the IO lock
177 * is dropped before returning.
178 *
179 * ip -- the inode being locked
 180 * lock_flags -- this parameter indicates the inode's locks to be
 181 * locked. See the comment for xfs_ilock() for a list
182 * of valid values.
183 */
184int
185xfs_ilock_nowait(
186 xfs_inode_t *ip,
187 uint lock_flags)
188{
189 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
190
191 /*
192 * You can't set both SHARED and EXCL for the same lock,
193 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
194 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
195 */
196 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
197 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
198 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
199 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
200 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
201
202 if (lock_flags & XFS_IOLOCK_EXCL) {
203 if (!mrtryupdate(&ip->i_iolock))
204 goto out;
205 } else if (lock_flags & XFS_IOLOCK_SHARED) {
206 if (!mrtryaccess(&ip->i_iolock))
207 goto out;
208 }
209 if (lock_flags & XFS_ILOCK_EXCL) {
210 if (!mrtryupdate(&ip->i_lock))
211 goto out_undo_iolock;
212 } else if (lock_flags & XFS_ILOCK_SHARED) {
213 if (!mrtryaccess(&ip->i_lock))
214 goto out_undo_iolock;
215 }
216 return 1;
217
218 out_undo_iolock:
219 if (lock_flags & XFS_IOLOCK_EXCL)
220 mrunlock_excl(&ip->i_iolock);
221 else if (lock_flags & XFS_IOLOCK_SHARED)
222 mrunlock_shared(&ip->i_iolock);
223 out:
224 return 0;
225}
226
227/*
228 * xfs_iunlock() is used to drop the inode locks acquired with
229 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
230 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
231 * that we know which locks to drop.
232 *
233 * ip -- the inode being unlocked
 234 * lock_flags -- this parameter indicates the inode's locks to be
 235 * unlocked. See the comment for xfs_ilock() for a list
236 * of valid values for this parameter.
237 *
238 */
239void
240xfs_iunlock(
241 xfs_inode_t *ip,
242 uint lock_flags)
243{
244 /*
245 * You can't set both SHARED and EXCL for the same lock,
246 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
247 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
248 */
249 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
250 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
251 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
252 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
253 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
254 ASSERT(lock_flags != 0);
255
256 if (lock_flags & XFS_IOLOCK_EXCL)
257 mrunlock_excl(&ip->i_iolock);
258 else if (lock_flags & XFS_IOLOCK_SHARED)
259 mrunlock_shared(&ip->i_iolock);
260
261 if (lock_flags & XFS_ILOCK_EXCL)
262 mrunlock_excl(&ip->i_lock);
263 else if (lock_flags & XFS_ILOCK_SHARED)
264 mrunlock_shared(&ip->i_lock);
265
266 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
267}
268
269/*
 270 * Give up write locks. The I/O lock cannot be held nested
271 * if it is being demoted.
272 */
273void
274xfs_ilock_demote(
275 xfs_inode_t *ip,
276 uint lock_flags)
277{
278 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
279 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
280
281 if (lock_flags & XFS_ILOCK_EXCL)
282 mrdemote(&ip->i_lock);
283 if (lock_flags & XFS_IOLOCK_EXCL)
284 mrdemote(&ip->i_iolock);
285
286 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
287}
288
289#ifdef DEBUG
290int
291xfs_isilocked(
292 xfs_inode_t *ip,
293 uint lock_flags)
294{
295 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
296 if (!(lock_flags & XFS_ILOCK_SHARED))
297 return !!ip->i_lock.mr_writer;
298 return rwsem_is_locked(&ip->i_lock.mr_lock);
299 }
300
301 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
302 if (!(lock_flags & XFS_IOLOCK_SHARED))
303 return !!ip->i_iolock.mr_writer;
304 return rwsem_is_locked(&ip->i_iolock.mr_lock);
305 }
306
307 ASSERT(0);
308 return 0;
309}
310#endif
311
312void
313__xfs_iflock(
314 struct xfs_inode *ip)
315{
316 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
317 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
318
319 do {
320 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
321 if (xfs_isiflocked(ip))
322 io_schedule();
323 } while (!xfs_iflock_nowait(ip));
324
325 finish_wait(wq, &wait.wait);
326}
327
77#ifdef DEBUG 328#ifdef DEBUG
78/* 329/*
79 * Make sure that the extents in the given memory buffer 330 * Make sure that the extents in the given memory buffer
@@ -131,6 +382,65 @@ xfs_inobp_check(
131} 382}
132#endif 383#endif
133 384
385static void
386xfs_inode_buf_verify(
387 struct xfs_buf *bp)
388{
389 struct xfs_mount *mp = bp->b_target->bt_mount;
390 int i;
391 int ni;
392
393 /*
394 * Validate the magic number and version of every inode in the buffer
395 */
396 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
397 for (i = 0; i < ni; i++) {
398 int di_ok;
399 xfs_dinode_t *dip;
400
401 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
402 (i << mp->m_sb.sb_inodelog));
403 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
404 XFS_DINODE_GOOD_VERSION(dip->di_version);
405 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
406 XFS_ERRTAG_ITOBP_INOTOBP,
407 XFS_RANDOM_ITOBP_INOTOBP))) {
408 xfs_buf_ioerror(bp, EFSCORRUPTED);
409 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
410 mp, dip);
411#ifdef DEBUG
412 xfs_emerg(mp,
413 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
414 (unsigned long long)bp->b_bn, i,
415 be16_to_cpu(dip->di_magic));
416 ASSERT(0);
417#endif
418 }
419 }
420 xfs_inobp_check(mp, bp);
421}
422
423
424static void
425xfs_inode_buf_read_verify(
426 struct xfs_buf *bp)
427{
428 xfs_inode_buf_verify(bp);
429}
430
431static void
432xfs_inode_buf_write_verify(
433 struct xfs_buf *bp)
434{
435 xfs_inode_buf_verify(bp);
436}
437
438const struct xfs_buf_ops xfs_inode_buf_ops = {
439 .verify_read = xfs_inode_buf_read_verify,
440 .verify_write = xfs_inode_buf_write_verify,
441};
442
443
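/*
 * With the ops table above in place, readers attach the verifier at
 * buffer submission time. A sketch of the call, mirroring the
 * xfs_imap_to_bp() change later in this patch:
 */
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
				   (int)imap->im_len, buf_flags, &bp,
				   &xfs_inode_buf_ops);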
134/* 444/*
135 * This routine is called to map an inode to the buffer containing the on-disk 445 * This routine is called to map an inode to the buffer containing the on-disk
136 * version of the inode. It returns a pointer to the buffer containing the 446 * version of the inode. It returns a pointer to the buffer containing the
@@ -145,71 +455,33 @@ xfs_imap_to_bp(
145 struct xfs_mount *mp, 455 struct xfs_mount *mp,
146 struct xfs_trans *tp, 456 struct xfs_trans *tp,
147 struct xfs_imap *imap, 457 struct xfs_imap *imap,
148 struct xfs_dinode **dipp, 458 struct xfs_dinode **dipp,
149 struct xfs_buf **bpp, 459 struct xfs_buf **bpp,
150 uint buf_flags, 460 uint buf_flags,
151 uint iget_flags) 461 uint iget_flags)
152{ 462{
153 struct xfs_buf *bp; 463 struct xfs_buf *bp;
154 int error; 464 int error;
155 int i;
156 int ni;
157 465
158 buf_flags |= XBF_UNMAPPED; 466 buf_flags |= XBF_UNMAPPED;
159 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 467 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
160 (int)imap->im_len, buf_flags, &bp); 468 (int)imap->im_len, buf_flags, &bp,
469 &xfs_inode_buf_ops);
161 if (error) { 470 if (error) {
162 if (error != EAGAIN) { 471 if (error == EAGAIN) {
163 xfs_warn(mp,
164 "%s: xfs_trans_read_buf() returned error %d.",
165 __func__, error);
166 } else {
167 ASSERT(buf_flags & XBF_TRYLOCK); 472 ASSERT(buf_flags & XBF_TRYLOCK);
473 return error;
168 } 474 }
169 return error;
170 }
171
172 /*
173 * Validate the magic number and version of every inode in the buffer
174 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
175 */
176#ifdef DEBUG
177 ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
178#else /* usual case */
179 ni = 1;
180#endif
181 475
182 for (i = 0; i < ni; i++) { 476 if (error == EFSCORRUPTED &&
183 int di_ok; 477 (iget_flags & XFS_IGET_UNTRUSTED))
184 xfs_dinode_t *dip; 478 return XFS_ERROR(EINVAL);
185 479
186 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 480 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
187 (i << mp->m_sb.sb_inodelog)); 481 __func__, error);
188 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 482 return error;
189 XFS_DINODE_GOOD_VERSION(dip->di_version);
190 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
191 XFS_ERRTAG_ITOBP_INOTOBP,
192 XFS_RANDOM_ITOBP_INOTOBP))) {
193 if (iget_flags & XFS_IGET_UNTRUSTED) {
194 xfs_trans_brelse(tp, bp);
195 return XFS_ERROR(EINVAL);
196 }
197 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
198 mp, dip);
199#ifdef DEBUG
200 xfs_emerg(mp,
201 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
202 (unsigned long long)imap->im_blkno, i,
203 be16_to_cpu(dip->di_magic));
204 ASSERT(0);
205#endif
206 xfs_trans_brelse(tp, bp);
207 return XFS_ERROR(EFSCORRUPTED);
208 }
209 } 483 }
210 484
211 xfs_inobp_check(mp, bp);
212
213 *bpp = bp; 485 *bpp = bp;
214 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); 486 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
215 return 0; 487 return 0;
@@ -853,16 +1125,16 @@ xfs_iread_extents(
853 * set according to the contents of the given cred structure. 1125 * set according to the contents of the given cred structure.
854 * 1126 *
855 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() 1127 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
856 * has a free inode available, call xfs_iget() 1128 * has a free inode available, call xfs_iget() to obtain the in-core
857 * to obtain the in-core version of the allocated inode. Finally, 1129 * version of the allocated inode. Finally, fill in the inode and
858 * fill in the inode and log its initial contents. In this case, 1130 * log its initial contents. In this case, ialloc_context would be
859 * ialloc_context would be set to NULL and call_again set to false. 1131 * set to NULL.
860 * 1132 *
861 * If xfs_dialloc() does not have an available inode, 1133 * If xfs_dialloc() does not have an available inode, it will replenish
862 * it will replenish its supply by doing an allocation. Since we can 1134 * its supply by doing an allocation. Since we can only do one
863 * only do one allocation within a transaction without deadlocks, we 1135 * allocation within a transaction without deadlocks, we must commit
864 * must commit the current transaction before returning the inode itself. 1136 * the current transaction before returning the inode itself.
865 * In this case, therefore, we will set call_again to true and return. 1137 * In this case, therefore, we will set ialloc_context and return.
866 * The caller should then commit the current transaction, start a new 1138 * The caller should then commit the current transaction, start a new
867 * transaction, and call xfs_ialloc() again to actually get the inode. 1139 * transaction, and call xfs_ialloc() again to actually get the inode.
868 * 1140 *
@@ -1509,10 +1781,23 @@ xfs_ifree_cluster(
1509 * to mark all the active inodes on the buffer stale. 1781 * to mark all the active inodes on the buffer stale.
1510 */ 1782 */
1511 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 1783 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1512 mp->m_bsize * blks_per_cluster, 0); 1784 mp->m_bsize * blks_per_cluster,
1785 XBF_UNMAPPED);
1513 1786
1514 if (!bp) 1787 if (!bp)
1515 return ENOMEM; 1788 return ENOMEM;
1789
1790 /*
1791 * This buffer may not have been correctly initialised as we
1792 * didn't read it from disk. That's not important because we are
 1793 * only using it to mark the buffer as stale in the log, and to
 1794 * attach stale cached inodes on it. That means it will never be
 1795 * dispatched for IO. If it is, we want to know about it, and we
 1796 * want it to fail. We can achieve this by adding a write
1797 * verifier to the buffer.
1798 */
1799 bp->b_ops = &xfs_inode_buf_ops;
1800
1516 /* 1801 /*
1517 * Walk the inodes already attached to the buffer and mark them 1802 * Walk the inodes already attached to the buffer and mark them
1518 * stale. These will all have the flush locks held, so an 1803 * stale. These will all have the flush locks held, so an
@@ -3660,3 +3945,40 @@ xfs_iext_irec_update_extoffs(
3660 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; 3945 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
3661 } 3946 }
3662} 3947}
3948
3949/*
 3950 * Test whether it is appropriate to check an inode for, and free, post-EOF
3951 * blocks. The 'force' parameter determines whether we should also consider
3952 * regular files that are marked preallocated or append-only.
3953 */
3954bool
3955xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
3956{
3957 /* prealloc/delalloc exists only on regular files */
3958 if (!S_ISREG(ip->i_d.di_mode))
3959 return false;
3960
3961 /*
3962 * Zero sized files with no cached pages and delalloc blocks will not
3963 * have speculative prealloc/delalloc blocks to remove.
3964 */
3965 if (VFS_I(ip)->i_size == 0 &&
3966 VN_CACHED(VFS_I(ip)) == 0 &&
3967 ip->i_delayed_blks == 0)
3968 return false;
3969
3970 /* If we haven't read in the extent list, then don't do it now. */
3971 if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
3972 return false;
3973
3974 /*
3975 * Do not free real preallocated or append-only files unless the file
3976 * has delalloc blocks and we are forced to remove them.
3977 */
3978 if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
3979 if (!force || ip->i_delayed_blks == 0)
3980 return false;
3981
3982 return true;
3983}
3984
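/*
 * A sketch of how a caller gates the actual removal of post-EOF blocks
 * on the test above. xfs_free_eofblocks() and its third (need_iolock)
 * argument are assumed here and are not part of this hunk.
 */
	if (xfs_can_free_eofblocks(ip, false))
		error = xfs_free_eofblocks(mp, ip, true);	/* assumed helper */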
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 94b32f906e79..22baf6ea4fac 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
496 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ 496 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
497 ((pip)->i_d.di_mode & S_ISGID)) 497 ((pip)->i_d.di_mode & S_ISGID))
498 498
499
499/* 500/*
500 * xfs_iget.c prototypes. 501 * xfs_inode.c prototypes.
501 */ 502 */
502int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
503 uint, uint, xfs_inode_t **);
504void xfs_ilock(xfs_inode_t *, uint); 503void xfs_ilock(xfs_inode_t *, uint);
505int xfs_ilock_nowait(xfs_inode_t *, uint); 504int xfs_ilock_nowait(xfs_inode_t *, uint);
506void xfs_iunlock(xfs_inode_t *, uint); 505void xfs_iunlock(xfs_inode_t *, uint);
@@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
508int xfs_isilocked(xfs_inode_t *, uint); 507int xfs_isilocked(xfs_inode_t *, uint);
509uint xfs_ilock_map_shared(xfs_inode_t *); 508uint xfs_ilock_map_shared(xfs_inode_t *);
510void xfs_iunlock_map_shared(xfs_inode_t *, uint); 509void xfs_iunlock_map_shared(xfs_inode_t *, uint);
511void xfs_inode_free(struct xfs_inode *ip);
512
513/*
514 * xfs_inode.c prototypes.
515 */
516int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, 510int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
517 xfs_nlink_t, xfs_dev_t, prid_t, int, 511 xfs_nlink_t, xfs_dev_t, prid_t, int,
518 struct xfs_buf **, xfs_inode_t **); 512 struct xfs_buf **, xfs_inode_t **);
@@ -591,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *);
591void xfs_iext_irec_compact_pages(xfs_ifork_t *); 585void xfs_iext_irec_compact_pages(xfs_ifork_t *);
592void xfs_iext_irec_compact_full(xfs_ifork_t *); 586void xfs_iext_irec_compact_full(xfs_ifork_t *);
593void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); 587void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
588bool xfs_can_free_eofblocks(struct xfs_inode *, bool);
594 589
595#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 590#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
596 591
@@ -603,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
603extern struct kmem_zone *xfs_ifork_zone; 598extern struct kmem_zone *xfs_ifork_zone;
604extern struct kmem_zone *xfs_inode_zone; 599extern struct kmem_zone *xfs_inode_zone;
605extern struct kmem_zone *xfs_ili_zone; 600extern struct kmem_zone *xfs_ili_zone;
601extern const struct xfs_buf_ops xfs_inode_buf_ops;
606 602
607#endif /* __XFS_INODE_H__ */ 603#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8305f2ac6773..c1c3ef88a260 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
42#include "xfs_inode_item.h" 42#include "xfs_inode_item.h"
43#include "xfs_export.h" 43#include "xfs_export.h"
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45#include "xfs_icache.h"
45 46
46#include <linux/capability.h> 47#include <linux/capability.h>
47#include <linux/dcache.h> 48#include <linux/dcache.h>
@@ -70,7 +71,7 @@ xfs_find_handle(
70 int hsize; 71 int hsize;
71 xfs_handle_t handle; 72 xfs_handle_t handle;
72 struct inode *inode; 73 struct inode *inode;
73 struct fd f; 74 struct fd f = {0};
74 struct path path; 75 struct path path;
75 int error; 76 int error;
76 struct xfs_inode *ip; 77 struct xfs_inode *ip;
@@ -1602,6 +1603,26 @@ xfs_file_ioctl(
1602 error = xfs_errortag_clearall(mp, 1); 1603 error = xfs_errortag_clearall(mp, 1);
1603 return -error; 1604 return -error;
1604 1605
1606 case XFS_IOC_FREE_EOFBLOCKS: {
1607 struct xfs_eofblocks eofb;
1608
1609 if (copy_from_user(&eofb, arg, sizeof(eofb)))
1610 return -XFS_ERROR(EFAULT);
1611
1612 if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
1613 return -XFS_ERROR(EINVAL);
1614
1615 if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
1616 return -XFS_ERROR(EINVAL);
1617
1618 if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
1619 memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
1620 return -XFS_ERROR(EINVAL);
1621
1622 error = xfs_icache_free_eofblocks(mp, &eofb);
1623 return -error;
1624 }
1625
1605 default: 1626 default:
1606 return -ENOTTY; 1627 return -ENOTTY;
1607 } 1628 }
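/*
 * A userspace sketch of driving the new ioctl. The header path and the
 * full struct layout are assumptions; only the fields validated above
 * (eof_version, eof_flags, pad32, pad64) are relied upon. All padding
 * must be zero or the kernel returns EINVAL.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>		/* assumed location of the XFS ABI header */

static int free_eofblocks(int fd)
{
	struct xfs_eofblocks eofb;

	memset(&eofb, 0, sizeof(eofb));	/* zeroes pad32/pad64 as required */
	eofb.eof_version = XFS_EOFBLOCKS_VERSION;
	eofb.eof_flags = 0;		/* no filters: scan everything */

	return ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb);
}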
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 973dff6ad935..add06b4e9a63 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -41,6 +41,7 @@
41#include "xfs_utils.h" 41#include "xfs_utils.h"
42#include "xfs_iomap.h" 42#include "xfs_iomap.h"
43#include "xfs_trace.h" 43#include "xfs_trace.h"
44#include "xfs_icache.h"
44 45
45 46
46#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 47#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -373,7 +374,7 @@ xfs_iomap_write_delay(
373 xfs_extlen_t extsz; 374 xfs_extlen_t extsz;
374 int nimaps; 375 int nimaps;
375 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 376 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
376 int prealloc, flushed = 0; 377 int prealloc;
377 int error; 378 int error;
378 379
379 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 380 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,31 +435,29 @@ retry:
434 } 435 }
435 436
436 /* 437 /*
437 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For 438 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
438 * ENOSPC, * flush all other inodes with delalloc blocks to free up
439 * some of the excess reserved metadata space. For both cases, retry
440 * without EOF preallocation. 439 * without EOF preallocation.
441 */ 440 */
442 if (nimaps == 0) { 441 if (nimaps == 0) {
443 trace_xfs_delalloc_enospc(ip, offset, count); 442 trace_xfs_delalloc_enospc(ip, offset, count);
444 if (flushed) 443 if (prealloc) {
445 return XFS_ERROR(error ? error : ENOSPC); 444 prealloc = 0;
446 445 error = 0;
447 if (error == ENOSPC) { 446 goto retry;
448 xfs_iunlock(ip, XFS_ILOCK_EXCL);
449 xfs_flush_inodes(ip);
450 xfs_ilock(ip, XFS_ILOCK_EXCL);
451 } 447 }
452 448 return XFS_ERROR(error ? error : ENOSPC);
453 flushed = 1;
454 error = 0;
455 prealloc = 0;
456 goto retry;
457 } 449 }
458 450
459 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 451 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
460 return xfs_alert_fsblock_zero(ip, &imap[0]); 452 return xfs_alert_fsblock_zero(ip, &imap[0]);
461 453
454 /*
455 * Tag the inode as speculatively preallocated so we can reclaim this
456 * space on demand, if necessary.
457 */
458 if (prealloc)
459 xfs_inode_set_eofblocks_tag(ip);
460
462 *ret_imap = imap[0]; 461 *ret_imap = imap[0];
463 return 0; 462 return 0;
464} 463}
@@ -584,7 +583,9 @@ xfs_iomap_write_allocate(
584 * pointer that the caller gave to us. 583 * pointer that the caller gave to us.
585 */ 584 */
586 error = xfs_bmapi_write(tp, ip, map_start_fsb, 585 error = xfs_bmapi_write(tp, ip, map_start_fsb,
587 count_fsb, 0, &first_block, 1, 586 count_fsb,
587 XFS_BMAPI_STACK_SWITCH,
588 &first_block, 1,
588 imap, &nimaps, &free_list); 589 imap, &nimaps, &free_list);
589 if (error) 590 if (error)
590 goto trans_cancel; 591 goto trans_cancel;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 4e00cf091d2c..d82efaa2ac73 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
38#include "xfs_vnodeops.h" 38#include "xfs_vnodeops.h"
39#include "xfs_inode_item.h" 39#include "xfs_inode_item.h"
40#include "xfs_trace.h" 40#include "xfs_trace.h"
41#include "xfs_icache.h"
41 42
42#include <linux/capability.h> 43#include <linux/capability.h>
43#include <linux/xattr.h> 44#include <linux/xattr.h>
@@ -779,8 +780,8 @@ xfs_setattr_size(
779 * care about here. 780 * care about here.
780 */ 781 */
781 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { 782 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
782 error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, 783 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
783 FI_NONE); 784 ip->i_d.di_size, newsize);
784 if (error) 785 if (error)
785 goto out_unlock; 786 goto out_unlock;
786 } 787 }
@@ -854,6 +855,9 @@ xfs_setattr_size(
854 * and do not wait the usual (long) time for writeout. 855 * and do not wait the usual (long) time for writeout.
855 */ 856 */
856 xfs_iflags_set(ip, XFS_ITRUNCATED); 857 xfs_iflags_set(ip, XFS_ITRUNCATED);
858
859 /* A truncate down always removes post-EOF blocks. */
860 xfs_inode_clear_eofblocks_tag(ip);
857 } 861 }
858 862
859 if (mask & ATTR_CTIME) { 863 if (mask & ATTR_CTIME) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 01d10a66e302..2ea7d402188d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -34,6 +34,7 @@
34#include "xfs_error.h" 34#include "xfs_error.h"
35#include "xfs_btree.h" 35#include "xfs_btree.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_icache.h"
37 38
38STATIC int 39STATIC int
39xfs_internal_inum( 40xfs_internal_inum(
@@ -395,7 +396,8 @@ xfs_bulkstat(
395 if (xfs_inobt_maskn(chunkidx, nicluster) 396 if (xfs_inobt_maskn(chunkidx, nicluster)
396 & ~r.ir_free) 397 & ~r.ir_free)
397 xfs_btree_reada_bufs(mp, agno, 398 xfs_btree_reada_bufs(mp, agno,
398 agbno, nbcluster); 399 agbno, nbcluster,
400 &xfs_inode_buf_ops);
399 } 401 }
400 irbp->ir_startino = r.ir_startino; 402 irbp->ir_startino = r.ir_startino;
401 irbp->ir_freecount = r.ir_freecount; 403 irbp->ir_freecount = r.ir_freecount;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 828662f70d64..fe7e4df85a7b 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -44,6 +44,7 @@
44#include <linux/kernel.h> 44#include <linux/kernel.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/crc32c.h>
47#include <linux/module.h> 48#include <linux/module.h>
48#include <linux/mutex.h> 49#include <linux/mutex.h>
49#include <linux/file.h> 50#include <linux/file.h>
@@ -118,6 +119,7 @@
118#define xfs_rotorstep xfs_params.rotorstep.val 119#define xfs_rotorstep xfs_params.rotorstep.val
119#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val 120#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
120#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val 121#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
122#define xfs_eofb_secs xfs_params.eofb_timer.val
121 123
122#define current_cpu() (raw_smp_processor_id()) 124#define current_cpu() (raw_smp_processor_id())
123#define current_pid() (current->pid) 125#define current_pid() (current->pid)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7f4f9370d0e7..46bd9d52ab51 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,8 @@
34#include "xfs_dinode.h" 34#include "xfs_dinode.h"
35#include "xfs_inode.h" 35#include "xfs_inode.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
38#include "xfs_cksum.h"
37 39
38kmem_zone_t *xfs_log_ticket_zone; 40kmem_zone_t *xfs_log_ticket_zone;
39 41
@@ -458,7 +460,8 @@ xfs_log_reserve(
458 tic->t_trans_type = t_type; 460 tic->t_trans_type = t_type;
459 *ticp = tic; 461 *ticp = tic;
460 462
461 xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); 463 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
464 : tic->t_unit_res);
462 465
463 trace_xfs_log_reserve(log, tic); 466 trace_xfs_log_reserve(log, tic);
464 467
@@ -679,25 +682,29 @@ out:
679} 682}
680 683
681/* 684/*
682 * Finish the recovery of the file system. This is separate from 685 * Finish the recovery of the file system. This is separate from the
683 * the xfs_log_mount() call, because it depends on the code in 686 * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
684 * xfs_mountfs() to read in the root and real-time bitmap inodes 687 * in the root and real-time bitmap inodes between calling xfs_log_mount() and
685 * between calling xfs_log_mount() and here. 688 * here.
686 * 689 *
687 * mp - ubiquitous xfs mount point structure 690 * If we finish recovery successfully, start the background log work. If we are
691 * not doing recovery, then we have a RO filesystem and we don't need to start
692 * it.
688 */ 693 */
689int 694int
690xfs_log_mount_finish(xfs_mount_t *mp) 695xfs_log_mount_finish(xfs_mount_t *mp)
691{ 696{
692 int error; 697 int error = 0;
693 698
694 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 699 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
695 error = xlog_recover_finish(mp->m_log); 700 error = xlog_recover_finish(mp->m_log);
696 else { 701 if (!error)
697 error = 0; 702 xfs_log_work_queue(mp);
703 } else {
698 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 704 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
699 } 705 }
700 706
707
701 return error; 708 return error;
702} 709}
703 710
@@ -850,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp)
850} /* xfs_log_unmount_write */ 857} /* xfs_log_unmount_write */
851 858
852/* 859/*
853 * Deallocate log structures for unmount/relocation. 860 * Empty the log for unmount/freeze.
861 *
862 * To do this, we first need to shut down the background log work so it is not
863 * trying to cover the log as we clean up. We then need to unpin all objects in
864 * the log so we can then flush them out. Once they have completed their IO and
865 * run the callbacks removing themselves from the AIL, we can write the unmount
866 * record.
867 */
868void
869xfs_log_quiesce(
870 struct xfs_mount *mp)
871{
872 cancel_delayed_work_sync(&mp->m_log->l_work);
873 xfs_log_force(mp, XFS_LOG_SYNC);
874
875 /*
876 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
877 * will push it, xfs_wait_buftarg() will not wait for it. Further,
878 * xfs_buf_iowait() cannot be used because it was pushed with the
879 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
880 * the IO to complete.
881 */
882 xfs_ail_push_all_sync(mp->m_ail);
883 xfs_wait_buftarg(mp->m_ddev_targp);
884 xfs_buf_lock(mp->m_sb_bp);
885 xfs_buf_unlock(mp->m_sb_bp);
886
887 xfs_log_unmount_write(mp);
888}
889
890/*
891 * Shut down and release the AIL and Log.
854 * 892 *
855 * We need to stop the aild from running before we destroy 893 * During unmount, we need to ensure we flush all the dirty metadata objects
856 * and deallocate the log as the aild references the log. 894 * from the AIL so that the log is empty before we write the unmount record to
895 * the log. Once this is done, we can tear down the AIL and the log.
857 */ 896 */
858void 897void
859xfs_log_unmount(xfs_mount_t *mp) 898xfs_log_unmount(
899 struct xfs_mount *mp)
860{ 900{
861 cancel_delayed_work_sync(&mp->m_sync_work); 901 xfs_log_quiesce(mp);
902
862 xfs_trans_ail_destroy(mp); 903 xfs_trans_ail_destroy(mp);
863 xlog_dealloc_log(mp->m_log); 904 xlog_dealloc_log(mp->m_log);
864} 905}
@@ -1090,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp)
1090 * with it being freed after writing the unmount record to the 1131 * with it being freed after writing the unmount record to the
1091 * log. 1132 * log.
1092 */ 1133 */
1093 1134}
1094} /* xlog_iodone */
1095 1135
1096/* 1136/*
1097 * Return size of each in-core log record buffer. 1137 * Return size of each in-core log record buffer.
@@ -1161,6 +1201,40 @@ done:
1161} /* xlog_get_iclog_buffer_size */ 1201} /* xlog_get_iclog_buffer_size */
1162 1202
1163 1203
1204void
1205xfs_log_work_queue(
1206 struct xfs_mount *mp)
1207{
1208 queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
1209 msecs_to_jiffies(xfs_syncd_centisecs * 10));
1210}
1211
1212/*
1213 * Every sync period we need to unpin all items in the AIL and push them to
1214 * disk. If there is nothing dirty, then we might need to cover the log to
1215 * indicate that the filesystem is idle.
1216 */
1217void
1218xfs_log_worker(
1219 struct work_struct *work)
1220{
1221 struct xlog *log = container_of(to_delayed_work(work),
1222 struct xlog, l_work);
1223 struct xfs_mount *mp = log->l_mp;
1224
1225 /* dgc: errors ignored - not fatal and nowhere to report them */
1226 if (xfs_log_need_covered(mp))
1227 xfs_fs_log_dummy(mp);
1228 else
1229 xfs_log_force(mp, 0);
1230
1231 /* start pushing all the metadata that is currently dirty */
1232 xfs_ail_push_all(mp->m_ail);
1233
1234 /* queue us up again */
1235 xfs_log_work_queue(mp);
1236}
1237
1164/* 1238/*
1165 * This routine initializes some of the log structure for a given mount point. 1239 * This routine initializes some of the log structure for a given mount point.
1166 * Its primary purpose is to fill in enough, so recovery can occur. However, 1240 * Its primary purpose is to fill in enough, so recovery can occur. However,
@@ -1195,6 +1269,7 @@ xlog_alloc_log(
1195 log->l_logBBsize = num_bblks; 1269 log->l_logBBsize = num_bblks;
1196 log->l_covered_state = XLOG_STATE_COVER_IDLE; 1270 log->l_covered_state = XLOG_STATE_COVER_IDLE;
1197 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1271 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1272 INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
1198 1273
1199 log->l_prev_block = -1; 1274 log->l_prev_block = -1;
1200 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1275 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -1417,6 +1492,84 @@ xlog_grant_push_ail(
1417} 1492}
1418 1493
1419/* 1494/*
1495 * Stamp cycle number in every block
1496 */
1497STATIC void
1498xlog_pack_data(
1499 struct xlog *log,
1500 struct xlog_in_core *iclog,
1501 int roundoff)
1502{
1503 int i, j, k;
1504 int size = iclog->ic_offset + roundoff;
1505 __be32 cycle_lsn;
1506 xfs_caddr_t dp;
1507
1508 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
1509
1510 dp = iclog->ic_datap;
1511 for (i = 0; i < BTOBB(size); i++) {
1512 if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
1513 break;
1514 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
1515 *(__be32 *)dp = cycle_lsn;
1516 dp += BBSIZE;
1517 }
1518
1519 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1520 xlog_in_core_2_t *xhdr = iclog->ic_data;
1521
1522 for ( ; i < BTOBB(size); i++) {
1523 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1524 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1525 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
1526 *(__be32 *)dp = cycle_lsn;
1527 dp += BBSIZE;
1528 }
1529
1530 for (i = 1; i < log->l_iclog_heads; i++)
1531 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
1532 }
1533}
1534
1535/*
1536 * Calculate the checksum for a log buffer.
1537 *
1538 * This is a little more complicated than it should be because the various
1539 * headers and the actual data are non-contiguous.
1540 */
1541__le32
1542xlog_cksum(
1543 struct xlog *log,
1544 struct xlog_rec_header *rhead,
1545 char *dp,
1546 int size)
1547{
1548 __uint32_t crc;
1549
1550 /* first generate the crc for the record header ... */
1551 crc = xfs_start_cksum((char *)rhead,
1552 sizeof(struct xlog_rec_header),
1553 offsetof(struct xlog_rec_header, h_crc));
1554
1555 /* ... then for additional cycle data for v2 logs ... */
1556 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1557 union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
1558 int i;
1559
1560 for (i = 1; i < log->l_iclog_heads; i++) {
1561 crc = crc32c(crc, &xhdr[i].hic_xheader,
1562 sizeof(struct xlog_rec_ext_header));
1563 }
1564 }
1565
1566 /* ... and finally for the payload */
1567 crc = crc32c(crc, dp, size);
1568
1569 return xfs_end_cksum(crc);
1570}
1571
1572/*
1420 * The bdstrat callback function for log bufs. This gives us a central 1573 * The bdstrat callback function for log bufs. This gives us a central
1421 * place to trap bufs in case we get hit by a log I/O error and need to 1574 * place to trap bufs in case we get hit by a log I/O error and need to
1422 * shutdown. Actually, in practice, even when we didn't get a log error, 1575 * shutdown. Actually, in practice, even when we didn't get a log error,
@@ -1476,7 +1629,6 @@ xlog_sync(
1476 struct xlog *log, 1629 struct xlog *log,
1477 struct xlog_in_core *iclog) 1630 struct xlog_in_core *iclog)
1478{ 1631{
1479 xfs_caddr_t dptr; /* pointer to byte sized element */
1480 xfs_buf_t *bp; 1632 xfs_buf_t *bp;
1481 int i; 1633 int i;
1482 uint count; /* byte count of bwrite */ 1634 uint count; /* byte count of bwrite */
@@ -1485,6 +1637,7 @@ xlog_sync(
1485 int split = 0; /* split write into two regions */ 1637 int split = 0; /* split write into two regions */
1486 int error; 1638 int error;
1487 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); 1639 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
1640 int size;
1488 1641
1489 XFS_STATS_INC(xs_log_writes); 1642 XFS_STATS_INC(xs_log_writes);
1490 ASSERT(atomic_read(&iclog->ic_refcnt) == 0); 1643 ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1515,13 +1668,10 @@ xlog_sync(
1515 xlog_pack_data(log, iclog, roundoff); 1668 xlog_pack_data(log, iclog, roundoff);
1516 1669
1517 /* real byte length */ 1670 /* real byte length */
1518 if (v2) { 1671 size = iclog->ic_offset;
1519 iclog->ic_header.h_len = 1672 if (v2)
1520 cpu_to_be32(iclog->ic_offset + roundoff); 1673 size += roundoff;
1521 } else { 1674 iclog->ic_header.h_len = cpu_to_be32(size);
1522 iclog->ic_header.h_len =
1523 cpu_to_be32(iclog->ic_offset);
1524 }
1525 1675
1526 bp = iclog->ic_bp; 1676 bp = iclog->ic_bp;
1527 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); 1677 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
@@ -1530,12 +1680,36 @@ xlog_sync(
1530 1680
1531 /* Do we need to split this write into 2 parts? */ 1681 /* Do we need to split this write into 2 parts? */
1532 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { 1682 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
1683 char *dptr;
1684
1533 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); 1685 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
1534 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); 1686 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
1535 iclog->ic_bwritecnt = 2; /* split into 2 writes */ 1687 iclog->ic_bwritecnt = 2;
1688
1689 /*
1690 * Bump the cycle numbers at the start of each block in the
1691 * part of the iclog that ends up in the buffer that gets
1692 * written to the start of the log.
1693 *
1694 * Watch out for the header magic number case, though.
1695 */
1696 dptr = (char *)&iclog->ic_header + count;
1697 for (i = 0; i < split; i += BBSIZE) {
1698 __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
1699 if (++cycle == XLOG_HEADER_MAGIC_NUM)
1700 cycle++;
1701 *(__be32 *)dptr = cpu_to_be32(cycle);
1702
1703 dptr += BBSIZE;
1704 }
1536 } else { 1705 } else {
1537 iclog->ic_bwritecnt = 1; 1706 iclog->ic_bwritecnt = 1;
1538 } 1707 }
1708
 1709 /* calculate the checksum */
1710 iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
1711 iclog->ic_datap, size);
1712
1539 bp->b_io_length = BTOBB(count); 1713 bp->b_io_length = BTOBB(count);
1540 bp->b_fspriv = iclog; 1714 bp->b_fspriv = iclog;
1541 XFS_BUF_ZEROFLAGS(bp); 1715 XFS_BUF_ZEROFLAGS(bp);
@@ -1589,19 +1763,6 @@ xlog_sync(
1589 bp->b_flags |= XBF_SYNCIO; 1763 bp->b_flags |= XBF_SYNCIO;
1590 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1764 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1591 bp->b_flags |= XBF_FUA; 1765 bp->b_flags |= XBF_FUA;
1592 dptr = bp->b_addr;
1593 /*
1594 * Bump the cycle numbers at the start of each block
1595 * since this part of the buffer is at the start of
1596 * a new cycle. Watch out for the header magic number
1597 * case, though.
1598 */
1599 for (i = 0; i < split; i += BBSIZE) {
1600 be32_add_cpu((__be32 *)dptr, 1);
1601 if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
1602 be32_add_cpu((__be32 *)dptr, 1);
1603 dptr += BBSIZE;
1604 }
1605 1766
1606 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1767 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1607 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1768 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1618,7 +1779,6 @@ xlog_sync(
1618 return 0; 1779 return 0;
1619} /* xlog_sync */ 1780} /* xlog_sync */
1620 1781
1621
1622/* 1782/*
1623 * Deallocate a log structure 1783 * Deallocate a log structure
1624 */ 1784 */
@@ -2387,14 +2547,27 @@ xlog_state_do_callback(
2387 2547
2388 2548
2389 /* 2549 /*
 2390 * update the last_sync_lsn before we drop the 2550 * Completion of an iclog IO does not imply that
2551 * a transaction has completed, as transactions
2552 * can be large enough to span many iclogs. We
2553 * cannot change the tail of the log half way
2554 * through a transaction as this may be the only
 2555 * transaction in the log and moving the tail to
2556 * point to the middle of it will prevent
2557 * recovery from finding the start of the
2558 * transaction. Hence we should only update the
2559 * last_sync_lsn if this iclog contains
2560 * transaction completion callbacks on it.
2561 *
2562 * We have to do this before we drop the
2391 * icloglock to ensure we are the only one that 2563 * icloglock to ensure we are the only one that
2392 * can update it. 2564 * can update it.
2393 */ 2565 */
2394 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), 2566 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2395 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2567 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2396 atomic64_set(&log->l_last_sync_lsn, 2568 if (iclog->ic_callback)
2397 be64_to_cpu(iclog->ic_header.h_lsn)); 2569 atomic64_set(&log->l_last_sync_lsn,
2570 be64_to_cpu(iclog->ic_header.h_lsn));
2398 2571
2399 } else 2572 } else
2400 ioerrors++; 2573 ioerrors++;
@@ -3700,3 +3873,4 @@ xlog_iclogs_empty(
3700 } while (iclog != log->l_iclog); 3873 } while (iclog != log->l_iclog);
3701 return 1; 3874 return 1;
3702} 3875}
3876
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 748d312850e2..5caee96059df 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
181 xfs_lsn_t *commit_lsn, int flags); 181 xfs_lsn_t *commit_lsn, int flags);
182bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 182bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
183 183
184void xfs_log_work_queue(struct xfs_mount *mp);
185void xfs_log_worker(struct work_struct *work);
186void xfs_log_quiesce(struct xfs_mount *mp);
187
184#endif 188#endif
185#endif /* __XFS_LOG_H__ */ 189#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 18a801d76a42..16d8d12ea3b4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i)
139/* 139/*
140 * Flags for log structure 140 * Flags for log structure
141 */ 141 */
142#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */
143#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ 142#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 143#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 144#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
@@ -291,7 +290,7 @@ typedef struct xlog_rec_header {
291 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ 290 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
292 __be64 h_lsn; /* lsn of this LR : 8 */ 291 __be64 h_lsn; /* lsn of this LR : 8 */
293 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ 292 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
294 __be32 h_chksum; /* may not be used; non-zero if used : 4 */ 293 __le32 h_crc; /* crc of log record : 4 */
295 __be32 h_prev_block; /* block number to previous LR : 4 */ 294 __be32 h_prev_block; /* block number to previous LR : 4 */
296 __be32 h_num_logops; /* number of log operations in this LR : 4 */ 295 __be32 h_num_logops; /* number of log operations in this LR : 4 */
297 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; 296 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
@@ -495,6 +494,7 @@ struct xlog {
495 struct xfs_buf *l_xbuf; /* extra buffer for log 494 struct xfs_buf *l_xbuf; /* extra buffer for log
496 * wrapping */ 495 * wrapping */
497 struct xfs_buftarg *l_targ; /* buftarg of log */ 496 struct xfs_buftarg *l_targ; /* buftarg of log */
497 struct delayed_work l_work; /* background flush work */
498 uint l_flags; 498 uint l_flags;
499 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 499 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
500 struct list_head *l_buf_cancel_table; 500 struct list_head *l_buf_cancel_table;
@@ -554,11 +554,9 @@ xlog_recover(
554extern int 554extern int
555xlog_recover_finish( 555xlog_recover_finish(
556 struct xlog *log); 556 struct xlog *log);
557extern void 557
558xlog_pack_data( 558extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
559 struct xlog *log, 559 char *dp, int size);
560 struct xlog_in_core *iclog,
561 int);
562 560
563extern kmem_zone_t *xfs_log_ticket_zone; 561extern kmem_zone_t *xfs_log_ticket_zone;
564struct xlog_ticket * 562struct xlog_ticket *
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5da3ace352bf..96fcbb85ff83 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -41,7 +41,9 @@
41#include "xfs_trans_priv.h" 41#include "xfs_trans_priv.h"
42#include "xfs_quota.h" 42#include "xfs_quota.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
44#include "xfs_cksum.h"
44#include "xfs_trace.h" 45#include "xfs_trace.h"
46#include "xfs_icache.h"
45 47
46STATIC int 48STATIC int
47xlog_find_zeroed( 49xlog_find_zeroed(
@@ -2143,7 +2145,7 @@ xlog_recover_buffer_pass2(
2143 buf_flags |= XBF_UNMAPPED; 2145 buf_flags |= XBF_UNMAPPED;
2144 2146
2145 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2147 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2146 buf_flags); 2148 buf_flags, NULL);
2147 if (!bp) 2149 if (!bp)
2148 return XFS_ERROR(ENOMEM); 2150 return XFS_ERROR(ENOMEM);
2149 error = bp->b_error; 2151 error = bp->b_error;
@@ -2236,7 +2238,8 @@ xlog_recover_inode_pass2(
2236 } 2238 }
2237 trace_xfs_log_recover_inode_recover(log, in_f); 2239 trace_xfs_log_recover_inode_recover(log, in_f);
2238 2240
2239 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); 2241 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2242 NULL);
2240 if (!bp) { 2243 if (!bp) {
2241 error = ENOMEM; 2244 error = ENOMEM;
2242 goto error; 2245 goto error;
@@ -2547,7 +2550,8 @@ xlog_recover_dquot_pass2(
2547 ASSERT(dq_f->qlf_len == 1); 2550 ASSERT(dq_f->qlf_len == 1);
2548 2551
2549 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, 2552 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2550 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); 2553 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
2554 NULL);
2551 if (error) 2555 if (error)
2552 return error; 2556 return error;
2553 2557
@@ -3213,80 +3217,58 @@ xlog_recover_process_iunlinks(
3213 mp->m_dmevmask = mp_dmevmask; 3217 mp->m_dmevmask = mp_dmevmask;
3214} 3218}
3215 3219
3216
3217#ifdef DEBUG
3218STATIC void
3219xlog_pack_data_checksum(
3220 struct xlog *log,
3221 struct xlog_in_core *iclog,
3222 int size)
3223{
3224 int i;
3225 __be32 *up;
3226 uint chksum = 0;
3227
3228 up = (__be32 *)iclog->ic_datap;
3229 /* divide length by 4 to get # words */
3230 for (i = 0; i < (size >> 2); i++) {
3231 chksum ^= be32_to_cpu(*up);
3232 up++;
3233 }
3234 iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3235}
3236#else
3237#define xlog_pack_data_checksum(log, iclog, size)
3238#endif
3239
3240/* 3220/*
3241 * Stamp cycle number in every block 3221 * Upack the log buffer data and crc check it. If the check fails, issue a
3222 * warning if and only if the CRC in the header is non-zero. This makes the
3223 * check an advisory warning, and the zero CRC check will prevent failure
3224 * warnings from being emitted when upgrading the kernel from one that does not
3225 * add CRCs by default.
3226 *
3227 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
 3228 * corruption failure.
3242 */ 3229 */
3243void 3230STATIC int
3244xlog_pack_data( 3231xlog_unpack_data_crc(
3245 struct xlog *log, 3232 struct xlog_rec_header *rhead,
3246 struct xlog_in_core *iclog, 3233 xfs_caddr_t dp,
3247 int roundoff) 3234 struct xlog *log)
3248{ 3235{
3249 int i, j, k; 3236 __le32 crc;
3250 int size = iclog->ic_offset + roundoff; 3237
3251 __be32 cycle_lsn; 3238 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
3252 xfs_caddr_t dp; 3239 if (crc != rhead->h_crc) {
3253 3240 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
3254 xlog_pack_data_checksum(log, iclog, size); 3241 xfs_alert(log->l_mp,
3255 3242 "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
3256 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 3243 le32_to_cpu(rhead->h_crc),
3257 3244 le32_to_cpu(crc));
3258 dp = iclog->ic_datap; 3245 xfs_hex_dump(dp, 32);
3259 for (i = 0; i < BTOBB(size) &&
3260 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3261 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3262 *(__be32 *)dp = cycle_lsn;
3263 dp += BBSIZE;
3264 }
3265
3266 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3267 xlog_in_core_2_t *xhdr = iclog->ic_data;
3268
3269 for ( ; i < BTOBB(size); i++) {
3270 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3271 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3272 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3273 *(__be32 *)dp = cycle_lsn;
3274 dp += BBSIZE;
3275 } 3246 }
3276 3247
3277 for (i = 1; i < log->l_iclog_heads; i++) { 3248 /*
3278 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 3249 * If we've detected a log record corruption, then we can't
3279 } 3250 * recover past this point. Abort recovery if we are enforcing
3251 * CRC protection by punting an error back up the stack.
3252 */
3253 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
3254 return EFSCORRUPTED;
3280 } 3255 }
3256
3257 return 0;
3281} 3258}
3282 3259
3283STATIC void 3260STATIC int
3284xlog_unpack_data( 3261xlog_unpack_data(
3285 struct xlog_rec_header *rhead, 3262 struct xlog_rec_header *rhead,
3286 xfs_caddr_t dp, 3263 xfs_caddr_t dp,
3287 struct xlog *log) 3264 struct xlog *log)
3288{ 3265{
3289 int i, j, k; 3266 int i, j, k;
3267 int error;
3268
3269 error = xlog_unpack_data_crc(rhead, dp, log);
3270 if (error)
3271 return error;
3290 3272
3291 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3273 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3292 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3274 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3303,6 +3285,8 @@ xlog_unpack_data(
3303 dp += BBSIZE; 3285 dp += BBSIZE;
3304 } 3286 }
3305 } 3287 }
3288
3289 return 0;
3306} 3290}
3307 3291
3308STATIC int 3292STATIC int
@@ -3434,9 +3418,13 @@ xlog_do_recovery_pass(
3434 if (error) 3418 if (error)
3435 goto bread_err2; 3419 goto bread_err2;
3436 3420
3437 xlog_unpack_data(rhead, offset, log); 3421 error = xlog_unpack_data(rhead, offset, log);
3438 if ((error = xlog_recover_process_data(log, 3422 if (error)
3439 rhash, rhead, offset, pass))) 3423 goto bread_err2;
3424
3425 error = xlog_recover_process_data(log,
3426 rhash, rhead, offset, pass);
3427 if (error)
3440 goto bread_err2; 3428 goto bread_err2;
3441 blk_no += bblks + hblks; 3429 blk_no += bblks + hblks;
3442 } 3430 }
@@ -3541,14 +3529,19 @@ xlog_do_recovery_pass(
3541 * - order is important. 3529 * - order is important.
3542 */ 3530 */
3543 error = xlog_bread_offset(log, 0, 3531 error = xlog_bread_offset(log, 0,
3544 bblks - split_bblks, hbp, 3532 bblks - split_bblks, dbp,
3545 offset + BBTOB(split_bblks)); 3533 offset + BBTOB(split_bblks));
3546 if (error) 3534 if (error)
3547 goto bread_err2; 3535 goto bread_err2;
3548 } 3536 }
3549 xlog_unpack_data(rhead, offset, log); 3537
3550 if ((error = xlog_recover_process_data(log, rhash, 3538 error = xlog_unpack_data(rhead, offset, log);
3551 rhead, offset, pass))) 3539 if (error)
3540 goto bread_err2;
3541
3542 error = xlog_recover_process_data(log, rhash,
3543 rhead, offset, pass);
3544 if (error)
3552 goto bread_err2; 3545 goto bread_err2;
3553 blk_no += bblks; 3546 blk_no += bblks;
3554 } 3547 }
@@ -3573,9 +3566,13 @@ xlog_do_recovery_pass(
3573 if (error) 3566 if (error)
3574 goto bread_err2; 3567 goto bread_err2;
3575 3568
3576 xlog_unpack_data(rhead, offset, log); 3569 error = xlog_unpack_data(rhead, offset, log);
3577 if ((error = xlog_recover_process_data(log, rhash, 3570 if (error)
3578 rhead, offset, pass))) 3571 goto bread_err2;
3572
3573 error = xlog_recover_process_data(log, rhash,
3574 rhead, offset, pass);
3575 if (error)
3579 goto bread_err2; 3576 goto bread_err2;
3580 blk_no += bblks + hblks; 3577 blk_no += bblks + hblks;
3581 } 3578 }
@@ -3689,13 +3686,14 @@ xlog_do_recover(
3689 3686
3690 /* 3687 /*
3691 * Now that we've finished replaying all buffer and inode 3688 * Now that we've finished replaying all buffer and inode
3692 * updates, re-read in the superblock. 3689 * updates, re-read in the superblock and reverify it.
3693 */ 3690 */
3694 bp = xfs_getsb(log->l_mp, 0); 3691 bp = xfs_getsb(log->l_mp, 0);
3695 XFS_BUF_UNDONE(bp); 3692 XFS_BUF_UNDONE(bp);
3696 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3693 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3697 XFS_BUF_READ(bp); 3694 XFS_BUF_READ(bp);
3698 XFS_BUF_UNASYNC(bp); 3695 XFS_BUF_UNASYNC(bp);
3696 bp->b_ops = &xfs_sb_buf_ops;
3699 xfsbdstrat(log->l_mp, bp); 3697 xfsbdstrat(log->l_mp, bp);
3700 error = xfs_buf_iowait(bp); 3698 error = xfs_buf_iowait(bp);
3701 if (error) { 3699 if (error) {
@@ -3707,7 +3705,7 @@ xlog_do_recover(
3707 3705
3708 /* Convert superblock from on-disk format */ 3706 /* Convert superblock from on-disk format */
3709 sbp = &log->l_mp->m_sb; 3707 sbp = &log->l_mp->m_sb;
3710 xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); 3708 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3711 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3709 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3712 ASSERT(xfs_sb_good_version(sbp)); 3710 ASSERT(xfs_sb_good_version(sbp));
3713 xfs_buf_relse(bp); 3711 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b2bd3a0e6376..da508463ff10 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
42#include "xfs_fsops.h" 42#include "xfs_fsops.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45#include "xfs_icache.h"
45 46
46 47
47#ifdef HAVE_PERCPU_SB 48#ifdef HAVE_PERCPU_SB
@@ -303,9 +304,8 @@ STATIC int
303xfs_mount_validate_sb( 304xfs_mount_validate_sb(
304 xfs_mount_t *mp, 305 xfs_mount_t *mp,
305 xfs_sb_t *sbp, 306 xfs_sb_t *sbp,
306 int flags) 307 bool check_inprogress)
307{ 308{
308 int loud = !(flags & XFS_MFSI_QUIET);
309 309
310 /* 310 /*
311 * If the log device and data device have the 311 * If the log device and data device have the
@@ -315,21 +315,18 @@ xfs_mount_validate_sb(
315 * a volume filesystem in a non-volume manner. 315 * a volume filesystem in a non-volume manner.
316 */ 316 */
317 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 317 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
318 if (loud) 318 xfs_warn(mp, "bad magic number");
319 xfs_warn(mp, "bad magic number");
320 return XFS_ERROR(EWRONGFS); 319 return XFS_ERROR(EWRONGFS);
321 } 320 }
322 321
323 if (!xfs_sb_good_version(sbp)) { 322 if (!xfs_sb_good_version(sbp)) {
324 if (loud) 323 xfs_warn(mp, "bad version");
325 xfs_warn(mp, "bad version");
326 return XFS_ERROR(EWRONGFS); 324 return XFS_ERROR(EWRONGFS);
327 } 325 }
328 326
329 if (unlikely( 327 if (unlikely(
330 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 328 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
331 if (loud) 329 xfs_warn(mp,
332 xfs_warn(mp,
333 "filesystem is marked as having an external log; " 330 "filesystem is marked as having an external log; "
334 "specify logdev on the mount command line."); 331 "specify logdev on the mount command line.");
335 return XFS_ERROR(EINVAL); 332 return XFS_ERROR(EINVAL);
@@ -337,8 +334,7 @@ xfs_mount_validate_sb(
337 334
338 if (unlikely( 335 if (unlikely(
339 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 336 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
340 if (loud) 337 xfs_warn(mp,
341 xfs_warn(mp,
342 "filesystem is marked as having an internal log; " 338 "filesystem is marked as having an internal log; "
343 "do not specify logdev on the mount command line."); 339 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 340 return XFS_ERROR(EINVAL);
@@ -372,8 +368,7 @@ xfs_mount_validate_sb(
372 sbp->sb_dblocks == 0 || 368 sbp->sb_dblocks == 0 ||
373 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || 369 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
374 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { 370 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
375 if (loud) 371 XFS_CORRUPTION_ERROR("SB sanity check failed",
376 XFS_CORRUPTION_ERROR("SB sanity check failed",
377 XFS_ERRLEVEL_LOW, mp, sbp); 372 XFS_ERRLEVEL_LOW, mp, sbp);
378 return XFS_ERROR(EFSCORRUPTED); 373 return XFS_ERROR(EFSCORRUPTED);
379 } 374 }
@@ -382,12 +377,10 @@ xfs_mount_validate_sb(
382 * Until this is fixed only page-sized or smaller data blocks work. 377 * Until this is fixed only page-sized or smaller data blocks work.
383 */ 378 */
384 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 379 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
385 if (loud) { 380 xfs_warn(mp,
386 xfs_warn(mp,
387 "File system with blocksize %d bytes. " 381 "File system with blocksize %d bytes. "
388 "Only pagesize (%ld) or less will currently work.", 382 "Only pagesize (%ld) or less will currently work.",
389 sbp->sb_blocksize, PAGE_SIZE); 383 sbp->sb_blocksize, PAGE_SIZE);
390 }
391 return XFS_ERROR(ENOSYS); 384 return XFS_ERROR(ENOSYS);
392 } 385 }
393 386
@@ -401,23 +394,20 @@ xfs_mount_validate_sb(
401 case 2048: 394 case 2048:
402 break; 395 break;
403 default: 396 default:
404 if (loud) 397 xfs_warn(mp, "inode size of %d bytes not supported",
405 xfs_warn(mp, "inode size of %d bytes not supported",
406 sbp->sb_inodesize); 398 sbp->sb_inodesize);
407 return XFS_ERROR(ENOSYS); 399 return XFS_ERROR(ENOSYS);
408 } 400 }
409 401
410 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 402 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
411 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 403 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
412 if (loud) 404 xfs_warn(mp,
413 xfs_warn(mp,
414 "file system too large to be mounted on this system."); 405 "file system too large to be mounted on this system.");
415 return XFS_ERROR(EFBIG); 406 return XFS_ERROR(EFBIG);
416 } 407 }
417 408
418 if (unlikely(sbp->sb_inprogress)) { 409 if (check_inprogress && sbp->sb_inprogress) {
419 if (loud) 410 xfs_warn(mp, "Offline file system operation in progress!");
420 xfs_warn(mp, "file system busy");
421 return XFS_ERROR(EFSCORRUPTED); 411 return XFS_ERROR(EFSCORRUPTED);
422 } 412 }
423 413
@@ -425,9 +415,7 @@ xfs_mount_validate_sb(
425 * Version 1 directory format has never worked on Linux. 415 * Version 1 directory format has never worked on Linux.
426 */ 416 */
427 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 417 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
428 if (loud) 418 xfs_warn(mp, "file system using version 1 directory format");
429 xfs_warn(mp,
430 "file system using version 1 directory format");
431 return XFS_ERROR(ENOSYS); 419 return XFS_ERROR(ENOSYS);
432 } 420 }
433 421
@@ -520,11 +508,9 @@ out_unwind:
520 508
521void 509void
522xfs_sb_from_disk( 510xfs_sb_from_disk(
523 struct xfs_mount *mp, 511 struct xfs_sb *to,
524 xfs_dsb_t *from) 512 xfs_dsb_t *from)
525{ 513{
526 struct xfs_sb *to = &mp->m_sb;
527
528 to->sb_magicnum = be32_to_cpu(from->sb_magicnum); 514 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
529 to->sb_blocksize = be32_to_cpu(from->sb_blocksize); 515 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
530 to->sb_dblocks = be64_to_cpu(from->sb_dblocks); 516 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -626,6 +612,72 @@ xfs_sb_to_disk(
626 } 612 }
627} 613}
628 614
615static void
616xfs_sb_verify(
617 struct xfs_buf *bp)
618{
619 struct xfs_mount *mp = bp->b_target->bt_mount;
620 struct xfs_sb sb;
621 int error;
622
623 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
624
625 /*
626 * Only check the in progress field for the primary superblock as
627 * mkfs.xfs doesn't clear it from secondary superblocks.
628 */
629 error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
630 if (error)
631 xfs_buf_ioerror(bp, error);
632}
633
634static void
635xfs_sb_read_verify(
636 struct xfs_buf *bp)
637{
638 xfs_sb_verify(bp);
639}
640
641/*
642 * We may be probed for a filesystem match, so we may not want to emit
643 * messages when the superblock buffer is not actually an XFS superblock.
644 * If we find an XFS superblock, then run a normal, noisy mount because we are
645 * really going to mount it and want to know about errors.
646 */
647static void
648xfs_sb_quiet_read_verify(
649 struct xfs_buf *bp)
650{
651 struct xfs_sb sb;
652
653 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
654
655 if (sb.sb_magicnum == XFS_SB_MAGIC) {
656 /* XFS filesystem, verify noisily! */
657 xfs_sb_read_verify(bp);
658 return;
659 }
660 /* quietly fail */
661 xfs_buf_ioerror(bp, EFSCORRUPTED);
662}
663
664static void
665xfs_sb_write_verify(
666 struct xfs_buf *bp)
667{
668 xfs_sb_verify(bp);
669}
670
671const struct xfs_buf_ops xfs_sb_buf_ops = {
672 .verify_read = xfs_sb_read_verify,
673 .verify_write = xfs_sb_write_verify,
674};
675
676static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
677 .verify_read = xfs_sb_quiet_read_verify,
678 .verify_write = xfs_sb_write_verify,
679};
680
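The two ops tables above are the hook points for the verifier infrastructure this series adds: a caller passes an ops table when it issues the read, and verify_read runs at I/O completion, reporting failure through xfs_buf_ioerror(). A minimal sketch of a caller under those assumptions (the function name example_read_primary_sb is illustrative; the buffer-layer calls and error convention are the ones used elsewhere in this diff):

/* Sketch: read the primary superblock with the verifier attached. */
static int
example_read_primary_sb(
	struct xfs_mount	*mp,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*bp;
	int			error;

	bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
				   BTOBB(mp->m_sb.sb_sectsize), 0,
				   &xfs_sb_buf_ops);
	if (!bp)
		return EIO;
	/* xfs_sb_read_verify() flags corruption through b_error */
	if (bp->b_error) {
		error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}
	*bpp = bp;
	return 0;
}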
629/* 681/*
630 * xfs_readsb 682 * xfs_readsb
631 * 683 *
@@ -651,26 +703,27 @@ xfs_readsb(xfs_mount_t *mp, int flags)
651 703
652reread: 704reread:
653 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 705 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
654 BTOBB(sector_size), 0); 706 BTOBB(sector_size), 0,
707 loud ? &xfs_sb_buf_ops
708 : &xfs_sb_quiet_buf_ops);
655 if (!bp) { 709 if (!bp) {
656 if (loud) 710 if (loud)
657 xfs_warn(mp, "SB buffer read failed"); 711 xfs_warn(mp, "SB buffer read failed");
658 return EIO; 712 return EIO;
659 } 713 }
-
-	/*
-	 * Initialize the mount structure from the superblock.
-	 * But first do some basic consistency checking.
-	 */
-	xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
-	error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
-	if (error) {
+	if (bp->b_error) {
+		error = bp->b_error;
 		if (loud)
 			xfs_warn(mp, "SB validate failed");
 		goto release_buf;
 	}
 
 	/*
+	 * Initialize the mount structure from the superblock.
+	 */
+	xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+
+	/*
674 * We must be able to do sector-sized and sector-aligned IO. 727 * We must be able to do sector-sized and sector-aligned IO.
675 */ 728 */
676 if (sector_size > mp->m_sb.sb_sectsize) { 729 if (sector_size > mp->m_sb.sb_sectsize) {
@@ -1001,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1001 } 1054 }
1002 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 1055 bp = xfs_buf_read_uncached(mp->m_ddev_targp,
1003 d - XFS_FSS_TO_BB(mp, 1), 1056 d - XFS_FSS_TO_BB(mp, 1),
1004 XFS_FSS_TO_BB(mp, 1), 0); 1057 XFS_FSS_TO_BB(mp, 1), 0, NULL);
1005 if (!bp) { 1058 if (!bp) {
1006 xfs_warn(mp, "last sector read failed"); 1059 xfs_warn(mp, "last sector read failed");
1007 return EIO; 1060 return EIO;
@@ -1016,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1016 } 1069 }
1017 bp = xfs_buf_read_uncached(mp->m_logdev_targp, 1070 bp = xfs_buf_read_uncached(mp->m_logdev_targp,
1018 d - XFS_FSB_TO_BB(mp, 1), 1071 d - XFS_FSB_TO_BB(mp, 1),
1019 XFS_FSB_TO_BB(mp, 1), 0); 1072 XFS_FSB_TO_BB(mp, 1), 0, NULL);
1020 if (!bp) { 1073 if (!bp) {
1021 xfs_warn(mp, "log device read failed"); 1074 xfs_warn(mp, "log device read failed");
1022 return EIO; 1075 return EIO;
@@ -1427,6 +1480,8 @@ xfs_unmountfs(
1427 __uint64_t resblks; 1480 __uint64_t resblks;
1428 int error; 1481 int error;
1429 1482
1483 cancel_delayed_work_sync(&mp->m_eofblocks_work);
1484
1430 xfs_qm_unmount_quotas(mp); 1485 xfs_qm_unmount_quotas(mp);
1431 xfs_rtunmount_inodes(mp); 1486 xfs_rtunmount_inodes(mp);
1432 IRELE(mp->m_rootip); 1487 IRELE(mp->m_rootip);
@@ -1450,21 +1505,16 @@ xfs_unmountfs(
1450 1505
1451 /* 1506 /*
1452 * And reclaim all inodes. At this point there should be no dirty 1507 * And reclaim all inodes. At this point there should be no dirty
1453 * inode, and none should be pinned or locked, but use synchronous 1508 * inodes and none should be pinned or locked, but use synchronous
1454 * reclaim just to be sure. 1509 * reclaim just to be sure. We can stop background inode reclaim
1510 * here as well if it is still running.
1455 */ 1511 */
1512 cancel_delayed_work_sync(&mp->m_reclaim_work);
1456 xfs_reclaim_inodes(mp, SYNC_WAIT); 1513 xfs_reclaim_inodes(mp, SYNC_WAIT);
1457 1514
1458 xfs_qm_unmount(mp); 1515 xfs_qm_unmount(mp);
1459 1516
1460 /* 1517 /*
1461 * Flush out the log synchronously so that we know for sure
1462 * that nothing is pinned. This is important because bflush()
1463 * will skip pinned buffers.
1464 */
1465 xfs_log_force(mp, XFS_LOG_SYNC);
1466
1467 /*
1468 * Unreserve any blocks we have so that when we unmount we don't account 1518 * Unreserve any blocks we have so that when we unmount we don't account
1469 * the reserved free space as used. This is really only necessary for 1519 * the reserved free space as used. This is really only necessary for
1470 * lazy superblock counting because it trusts the incore superblock 1520 * lazy superblock counting because it trusts the incore superblock
@@ -1489,23 +1539,6 @@ xfs_unmountfs(
1489 xfs_warn(mp, "Unable to update superblock counters. " 1539 xfs_warn(mp, "Unable to update superblock counters. "
1490 "Freespace may not be correct on next mount."); 1540 "Freespace may not be correct on next mount.");
1491 1541
1492 /*
1493 * At this point we might have modified the superblock again and thus
1494 * added an item to the AIL, thus flush it again.
1495 */
1496 xfs_ail_push_all_sync(mp->m_ail);
1497 xfs_wait_buftarg(mp->m_ddev_targp);
1498
1499 /*
1500 * The superblock buffer is uncached and xfsaild_push() will lock and
1501 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
1502 * here but a lock on the superblock buffer will block until iodone()
1503 * has completed.
1504 */
1505 xfs_buf_lock(mp->m_sb_bp);
1506 xfs_buf_unlock(mp->m_sb_bp);
1507
1508 xfs_log_unmount_write(mp);
1509 xfs_log_unmount(mp); 1542 xfs_log_unmount(mp);
1510 xfs_uuid_unmount(mp); 1543 xfs_uuid_unmount(mp);
1511 1544
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e534dc..bab8314507e4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations {
51 51
52#else /* __KERNEL__ */ 52#else /* __KERNEL__ */
53 53
54#include "xfs_sync.h"
55
56struct xlog; 54struct xlog;
57struct xfs_inode; 55struct xfs_inode;
58struct xfs_mru_cache; 56struct xfs_mru_cache;
@@ -197,9 +195,9 @@ typedef struct xfs_mount {
197 struct mutex m_icsb_mutex; /* balancer sync lock */ 195 struct mutex m_icsb_mutex; /* balancer sync lock */
198#endif 196#endif
199 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 197 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
200 struct delayed_work m_sync_work; /* background sync work */
201 struct delayed_work m_reclaim_work; /* background inode reclaim */ 198 struct delayed_work m_reclaim_work; /* background inode reclaim */
202 struct work_struct m_flush_work; /* background inode flush */ 199 struct delayed_work m_eofblocks_work; /* background eof blocks
200 trimming */
203 __int64_t m_update_flags; /* sb flags we need to update 201 __int64_t m_update_flags; /* sb flags we need to update
204 on the next remount,rw */ 202 on the next remount,rw */
205 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 203 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
@@ -209,6 +207,9 @@ typedef struct xfs_mount {
209 struct workqueue_struct *m_data_workqueue; 207 struct workqueue_struct *m_data_workqueue;
210 struct workqueue_struct *m_unwritten_workqueue; 208 struct workqueue_struct *m_unwritten_workqueue;
211 struct workqueue_struct *m_cil_workqueue; 209 struct workqueue_struct *m_cil_workqueue;
210 struct workqueue_struct *m_reclaim_workqueue;
211 struct workqueue_struct *m_log_workqueue;
212 struct workqueue_struct *m_eofblocks_workqueue;
212} xfs_mount_t; 213} xfs_mount_t;
213 214
214/* 215/*
@@ -387,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
387extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 388extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
388extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, 389extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
389 xfs_agnumber_t *); 390 xfs_agnumber_t *);
390extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); 391extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
391extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 392extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
392 393
394extern const struct xfs_buf_ops xfs_sb_buf_ops;
395
393#endif /* __XFS_MOUNT_H__ */ 396#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2e86fa0cfc0d..60eff4763156 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -40,6 +40,7 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h"
43 44
44/* 45/*
45 * The global quota manager. There is only one of these for the entire 46 * The global quota manager. There is only one of these for the entire
@@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs(
891 while (blkcnt--) { 892 while (blkcnt--) {
892 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 893 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
893 XFS_FSB_TO_DADDR(mp, bno), 894 XFS_FSB_TO_DADDR(mp, bno),
894 mp->m_quotainfo->qi_dqchunklen, 0, &bp); 895 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
896 &xfs_dquot_buf_ops);
895 if (error) 897 if (error)
896 break; 898 break;
897 899
@@ -978,7 +980,8 @@ xfs_qm_dqiterate(
978 while (rablkcnt--) { 980 while (rablkcnt--) {
979 xfs_buf_readahead(mp->m_ddev_targp, 981 xfs_buf_readahead(mp->m_ddev_targp,
980 XFS_FSB_TO_DADDR(mp, rablkno), 982 XFS_FSB_TO_DADDR(mp, rablkno),
981 mp->m_quotainfo->qi_dqchunklen); 983 mp->m_quotainfo->qi_dqchunklen,
984 NULL);
982 rablkno++; 985 rablkno++;
983 } 986 }
984 } 987 }
@@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one(
1453 int error; 1456 int error;
1454 1457
1455 if (!xfs_dqlock_nowait(dqp)) 1458 if (!xfs_dqlock_nowait(dqp))
1456 goto out_busy; 1459 goto out_move_tail;
1457 1460
1458 /* 1461 /*
1459 * This dquot has acquired a reference in the meantime remove it from 1462 * This dquot has acquired a reference in the meantime remove it from
@@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one(
1476 * getting flushed to disk, we don't want to reclaim it. 1479 * getting flushed to disk, we don't want to reclaim it.
1477 */ 1480 */
1478 if (!xfs_dqflock_nowait(dqp)) 1481 if (!xfs_dqflock_nowait(dqp))
1479 goto out_busy; 1482 goto out_unlock_move_tail;
1480 1483
1481 if (XFS_DQ_IS_DIRTY(dqp)) { 1484 if (XFS_DQ_IS_DIRTY(dqp)) {
1482 struct xfs_buf *bp = NULL; 1485 struct xfs_buf *bp = NULL;
@@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one(
1487 if (error) { 1490 if (error) {
1488 xfs_warn(mp, "%s: dquot %p flush failed", 1491 xfs_warn(mp, "%s: dquot %p flush failed",
1489 __func__, dqp); 1492 __func__, dqp);
1490 goto out_busy; 1493 goto out_unlock_move_tail;
1491 } 1494 }
1492 1495
1493 xfs_buf_delwri_queue(bp, buffer_list); 1496 xfs_buf_delwri_queue(bp, buffer_list);
@@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one(
1496 * Give the dquot another try on the freelist, as the 1499 * Give the dquot another try on the freelist, as the
1497 * flushing will take some time. 1500 * flushing will take some time.
1498 */ 1501 */
1499 goto out_busy; 1502 goto out_unlock_move_tail;
1500 } 1503 }
1501 xfs_dqfunlock(dqp); 1504 xfs_dqfunlock(dqp);
1502 1505
@@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one(
1515 XFS_STATS_INC(xs_qm_dqreclaims); 1518 XFS_STATS_INC(xs_qm_dqreclaims);
1516 return; 1519 return;
1517 1520
1518out_busy:
1519 xfs_dqunlock(dqp);
1520
1521 /* 1521 /*
1522 * Move the dquot to the tail of the list so that we don't spin on it. 1522 * Move the dquot to the tail of the list so that we don't spin on it.
1523 */ 1523 */
1524out_unlock_move_tail:
1525 xfs_dqunlock(dqp);
1526out_move_tail:
1524 list_move_tail(&dqp->q_lru, &qi->qi_lru_list); 1527 list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
1525
1526 trace_xfs_dqreclaim_busy(dqp); 1528 trace_xfs_dqreclaim_busy(dqp);
1527 XFS_STATS_INC(xs_qm_dqreclaim_misses); 1529 XFS_STATS_INC(xs_qm_dqreclaim_misses);
1528} 1530}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 858a3b186110..5f53e75409b8 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -40,6 +40,7 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h"
43 44
44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 45STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 46STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
@@ -845,7 +846,8 @@ STATIC int
845xfs_dqrele_inode( 846xfs_dqrele_inode(
846 struct xfs_inode *ip, 847 struct xfs_inode *ip,
847 struct xfs_perag *pag, 848 struct xfs_perag *pag,
848 int flags) 849 int flags,
850 void *args)
849{ 851{
850 /* skip quota inodes */ 852 /* skip quota inodes */
851 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 853 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
@@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes(
881 uint flags) 883 uint flags)
882{ 884{
883 ASSERT(mp->m_quotainfo); 885 ASSERT(mp->m_quotainfo);
884 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); 886 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
885} 887}
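The new void *args parameter lets xfs_inode_ag_iterator() thread caller context through to each per-inode callback; xfs_qm_dqrele_all_inodes() has nothing to pass and supplies NULL. The callback shape implied by this change looks like the following (the typedef name is illustrative):

/* Per-inode walker callback shape after this change. */
typedef int (*example_inode_walk_fn)(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags,
	void			*args);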
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca28a4ba4b54..98dc670d3ee0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -38,6 +38,7 @@
38#include "xfs_utils.h" 38#include "xfs_utils.h"
39#include "xfs_trace.h" 39#include "xfs_trace.h"
40#include "xfs_buf.h" 40#include "xfs_buf.h"
41#include "xfs_icache.h"
41 42
42 43
43/* 44/*
@@ -869,7 +870,7 @@ xfs_rtbuf_get(
869 ASSERT(map.br_startblock != NULLFSBLOCK); 870 ASSERT(map.br_startblock != NULLFSBLOCK);
870 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 871 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
871 XFS_FSB_TO_DADDR(mp, map.br_startblock), 872 XFS_FSB_TO_DADDR(mp, map.br_startblock),
872 mp->m_bsize, 0, &bp); 873 mp->m_bsize, 0, &bp, NULL);
873 if (error) 874 if (error)
874 return error; 875 return error;
875 ASSERT(!xfs_buf_geterror(bp)); 876 ASSERT(!xfs_buf_geterror(bp));
@@ -1872,9 +1873,14 @@ xfs_growfs_rt(
1872 */ 1873 */
1873 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 1874 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
1874 XFS_FSB_TO_BB(mp, nrblocks - 1), 1875 XFS_FSB_TO_BB(mp, nrblocks - 1),
1875 XFS_FSB_TO_BB(mp, 1), 0); 1876 XFS_FSB_TO_BB(mp, 1), 0, NULL);
1876 if (!bp) 1877 if (!bp)
1877 return EIO; 1878 return EIO;
1879 if (bp->b_error) {
1880 error = bp->b_error;
1881 xfs_buf_relse(bp);
1882 return error;
1883 }
1878 xfs_buf_relse(bp); 1884 xfs_buf_relse(bp);
1879 1885
1880 /* 1886 /*
@@ -2219,9 +2225,11 @@ xfs_rtmount_init(
2219 } 2225 }
2220 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 2226 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
2221 d - XFS_FSB_TO_BB(mp, 1), 2227 d - XFS_FSB_TO_BB(mp, 1),
2222 XFS_FSB_TO_BB(mp, 1), 0); 2228 XFS_FSB_TO_BB(mp, 1), 0, NULL);
2223 if (!bp) { 2229 if (!bp || bp->b_error) {
2224 xfs_warn(mp, "realtime device size check failed"); 2230 xfs_warn(mp, "realtime device size check failed");
2231 if (bp)
2232 xfs_buf_relse(bp);
2225 return EIO; 2233 return EIO;
2226 } 2234 }
2227 xfs_buf_relse(bp); 2235 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index f429d9d5d325..a05b45175fb0 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -81,6 +81,7 @@ struct xfs_mount;
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ 83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
84#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
84 85
85#define XFS_SB_VERSION2_OKREALFBITS \ 86#define XFS_SB_VERSION2_OKREALFBITS \
86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 87 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); 504 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504} 505}
505 506
507static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
508{
509 return (xfs_sb_version_hasmorebits(sbp) &&
510 (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
511}
512
506/* 513/*
507 * end of superblock version macros 514 * end of superblock version macros
508 */ 515 */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26a09bd7f975..ab8839b26272 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
49#include "xfs_extfree_item.h" 49#include "xfs_extfree_item.h"
50#include "xfs_mru_cache.h" 50#include "xfs_mru_cache.h"
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
52#include "xfs_sync.h" 52#include "xfs_icache.h"
53#include "xfs_trace.h" 53#include "xfs_trace.h"
54 54
55#include <linux/namei.h> 55#include <linux/namei.h>
@@ -863,8 +863,30 @@ xfs_init_mount_workqueues(
863 WQ_MEM_RECLAIM, 0, mp->m_fsname); 863 WQ_MEM_RECLAIM, 0, mp->m_fsname);
864 if (!mp->m_cil_workqueue) 864 if (!mp->m_cil_workqueue)
865 goto out_destroy_unwritten; 865 goto out_destroy_unwritten;
866
867 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
868 WQ_NON_REENTRANT, 0, mp->m_fsname);
869 if (!mp->m_reclaim_workqueue)
870 goto out_destroy_cil;
871
872 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
873 WQ_NON_REENTRANT, 0, mp->m_fsname);
874 if (!mp->m_log_workqueue)
875 goto out_destroy_reclaim;
876
877 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
878 WQ_NON_REENTRANT, 0, mp->m_fsname);
879 if (!mp->m_eofblocks_workqueue)
880 goto out_destroy_log;
881
866 return 0; 882 return 0;
867 883
884out_destroy_log:
885 destroy_workqueue(mp->m_log_workqueue);
886out_destroy_reclaim:
887 destroy_workqueue(mp->m_reclaim_workqueue);
888out_destroy_cil:
889 destroy_workqueue(mp->m_cil_workqueue);
868out_destroy_unwritten: 890out_destroy_unwritten:
869 destroy_workqueue(mp->m_unwritten_workqueue); 891 destroy_workqueue(mp->m_unwritten_workqueue);
870out_destroy_data_iodone_queue: 892out_destroy_data_iodone_queue:
@@ -877,11 +899,32 @@ STATIC void
877xfs_destroy_mount_workqueues( 899xfs_destroy_mount_workqueues(
878 struct xfs_mount *mp) 900 struct xfs_mount *mp)
879{ 901{
902 destroy_workqueue(mp->m_eofblocks_workqueue);
903 destroy_workqueue(mp->m_log_workqueue);
904 destroy_workqueue(mp->m_reclaim_workqueue);
880 destroy_workqueue(mp->m_cil_workqueue); 905 destroy_workqueue(mp->m_cil_workqueue);
881 destroy_workqueue(mp->m_data_workqueue); 906 destroy_workqueue(mp->m_data_workqueue);
882 destroy_workqueue(mp->m_unwritten_workqueue); 907 destroy_workqueue(mp->m_unwritten_workqueue);
883} 908}
884 909
910/*
911 * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
912 * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
913 * for IO to complete so that we effectively throttle multiple callers to the
914 * rate at which IO is completing.
915 */
916void
917xfs_flush_inodes(
918 struct xfs_mount *mp)
919{
920 struct super_block *sb = mp->m_super;
921
922 if (down_read_trylock(&sb->s_umount)) {
923 sync_inodes_sb(sb);
924 up_read(&sb->s_umount);
925 }
926}
927
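xfs_flush_inodes() is the new ENOSPC back-off for transaction reservation; the xfs_create() hunk later in this diff shows the intended call shape, condensed here (reservation values elided):

/* Condensed from the xfs_create() hunk below: retry the reservation on ENOSPC. */
error = xfs_trans_reserve(tp, resblks, log_res, 0,
			  XFS_TRANS_PERM_LOG_RES, log_count);
if (error == ENOSPC) {
	/* flush outstanding delalloc blocks and retry */
	xfs_flush_inodes(mp);
	error = xfs_trans_reserve(tp, resblks, log_res, 0,
				  XFS_TRANS_PERM_LOG_RES, log_count);
}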
885/* Catch misguided souls that try to use this interface on XFS */ 928/* Catch misguided souls that try to use this interface on XFS */
886STATIC struct inode * 929STATIC struct inode *
887xfs_fs_alloc_inode( 930xfs_fs_alloc_inode(
@@ -1006,9 +1049,8 @@ xfs_fs_put_super(
1006 struct xfs_mount *mp = XFS_M(sb); 1049 struct xfs_mount *mp = XFS_M(sb);
1007 1050
1008 xfs_filestream_unmount(mp); 1051 xfs_filestream_unmount(mp);
1009 cancel_delayed_work_sync(&mp->m_sync_work);
1010 xfs_unmountfs(mp); 1052 xfs_unmountfs(mp);
1011 xfs_syncd_stop(mp); 1053
1012 xfs_freesb(mp); 1054 xfs_freesb(mp);
1013 xfs_icsb_destroy_counters(mp); 1055 xfs_icsb_destroy_counters(mp);
1014 xfs_destroy_mount_workqueues(mp); 1056 xfs_destroy_mount_workqueues(mp);
@@ -1023,7 +1065,6 @@ xfs_fs_sync_fs(
1023 int wait) 1065 int wait)
1024{ 1066{
1025 struct xfs_mount *mp = XFS_M(sb); 1067 struct xfs_mount *mp = XFS_M(sb);
1026 int error;
1027 1068
1028 /* 1069 /*
1029 * Doing anything during the async pass would be counterproductive. 1070 * Doing anything during the async pass would be counterproductive.
@@ -1031,17 +1072,14 @@ xfs_fs_sync_fs(
1031 if (!wait) 1072 if (!wait)
1032 return 0; 1073 return 0;
1033 1074
1034 error = xfs_quiesce_data(mp); 1075 xfs_log_force(mp, XFS_LOG_SYNC);
1035 if (error)
1036 return -error;
1037
1038 if (laptop_mode) { 1076 if (laptop_mode) {
1039 /* 1077 /*
1040 * The disk must be active because we're syncing. 1078 * The disk must be active because we're syncing.
1041 * We schedule xfssyncd now (now that the disk is 1079 * We schedule log work now (now that the disk is
1042 * active) instead of later (when it might not be). 1080 * active) instead of later (when it might not be).
1043 */ 1081 */
1044 flush_delayed_work(&mp->m_sync_work); 1082 flush_delayed_work(&mp->m_log->l_work);
1045 } 1083 }
1046 1084
1047 return 0; 1085 return 0;
@@ -1118,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp)
1118 xfs_reserve_blocks(mp, &resblks, NULL); 1156 xfs_reserve_blocks(mp, &resblks, NULL);
1119} 1157}
1120 1158
1159/*
1160 * Trigger writeback of all the dirty metadata in the file system.
1161 *
1162 * This ensures that the metadata is written to their location on disk rather
1163 * than just existing in transactions in the log. This means after a quiesce
1164 * there is no log replay required to write the inodes to disk - this is the
1165 * primary difference between a sync and a quiesce.
1166 *
1167 * Note: xfs_log_quiesce() stops background log work - the callers must ensure
1168 * it is started again when appropriate.
1169 */
1170void
1171xfs_quiesce_attr(
1172 struct xfs_mount *mp)
1173{
1174 int error = 0;
1175
1176 /* wait for all modifications to complete */
1177 while (atomic_read(&mp->m_active_trans) > 0)
1178 delay(100);
1179
1180 /* force the log to unpin objects from the now complete transactions */
1181 xfs_log_force(mp, XFS_LOG_SYNC);
1182
1183 /* reclaim inodes to do any IO before the freeze completes */
1184 xfs_reclaim_inodes(mp, 0);
1185 xfs_reclaim_inodes(mp, SYNC_WAIT);
1186
1187 /* Push the superblock and write an unmount record */
1188 error = xfs_log_sbcount(mp);
1189 if (error)
1190 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
1191 "Frozen image may not be consistent.");
1192 /*
1193 * Just warn here till VFS can correctly support
1194 * read-only remount without racing.
1195 */
1196 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
1197
1198 xfs_log_quiesce(mp);
1199}
1200
1121STATIC int 1201STATIC int
1122xfs_fs_remount( 1202xfs_fs_remount(
1123 struct super_block *sb, 1203 struct super_block *sb,
@@ -1198,20 +1278,18 @@ xfs_fs_remount(
1198 * value if it is non-zero, otherwise go with the default. 1278 * value if it is non-zero, otherwise go with the default.
1199 */ 1279 */
1200 xfs_restore_resvblks(mp); 1280 xfs_restore_resvblks(mp);
1281 xfs_log_work_queue(mp);
1201 } 1282 }
1202 1283
1203 /* rw -> ro */ 1284 /* rw -> ro */
1204 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1285 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1205 /* 1286 /*
1206 * After we have synced the data but before we sync the 1287 * Before we sync the metadata, we need to free up the reserve
1207 * metadata, we need to free up the reserve block pool so that 1288 * block pool so that the used block count in the superblock on
1208 * the used block count in the superblock on disk is correct at 1289 * disk is correct at the end of the remount. Stash the current
1209 * the end of the remount. Stash the current reserve pool size 1290 * reserve pool size so that if we get remounted rw, we can
1210 * so that if we get remounted rw, we can return it to the same 1291 * return it to the same size.
1211 * size.
1212 */ 1292 */
1213
1214 xfs_quiesce_data(mp);
1215 xfs_save_resvblks(mp); 1293 xfs_save_resvblks(mp);
1216 xfs_quiesce_attr(mp); 1294 xfs_quiesce_attr(mp);
1217 mp->m_flags |= XFS_MOUNT_RDONLY; 1295 mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -1243,6 +1321,7 @@ xfs_fs_unfreeze(
1243 struct xfs_mount *mp = XFS_M(sb); 1321 struct xfs_mount *mp = XFS_M(sb);
1244 1322
1245 xfs_restore_resvblks(mp); 1323 xfs_restore_resvblks(mp);
1324 xfs_log_work_queue(mp);
1246 return 0; 1325 return 0;
1247} 1326}
1248 1327
@@ -1321,6 +1400,8 @@ xfs_fs_fill_super(
1321 spin_lock_init(&mp->m_sb_lock); 1400 spin_lock_init(&mp->m_sb_lock);
1322 mutex_init(&mp->m_growlock); 1401 mutex_init(&mp->m_growlock);
1323 atomic_set(&mp->m_active_trans, 0); 1402 atomic_set(&mp->m_active_trans, 0);
1403 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
1404 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
1324 1405
1325 mp->m_super = sb; 1406 mp->m_super = sb;
1326 sb->s_fs_info = mp; 1407 sb->s_fs_info = mp;
@@ -1371,10 +1452,6 @@ xfs_fs_fill_super(
1371 /* 1452 /*
1372 * we must configure the block size in the superblock before we run the 1453 * we must configure the block size in the superblock before we run the
1373 * full mount process as the mount process can lookup and cache inodes. 1454 * full mount process as the mount process can lookup and cache inodes.
1374 * For the same reason we must also initialise the syncd and register
1375 * the inode cache shrinker so that inodes can be reclaimed during
1376 * operations like a quotacheck that iterate all inodes in the
1377 * filesystem.
1378 */ 1455 */
1379 sb->s_magic = XFS_SB_MAGIC; 1456 sb->s_magic = XFS_SB_MAGIC;
1380 sb->s_blocksize = mp->m_sb.sb_blocksize; 1457 sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1384,13 +1461,9 @@ xfs_fs_fill_super(
1384 sb->s_time_gran = 1; 1461 sb->s_time_gran = 1;
1385 set_posix_acl_flag(sb); 1462 set_posix_acl_flag(sb);
1386 1463
1387 error = xfs_syncd_init(mp);
1388 if (error)
1389 goto out_filestream_unmount;
1390
1391 error = xfs_mountfs(mp); 1464 error = xfs_mountfs(mp);
1392 if (error) 1465 if (error)
1393 goto out_syncd_stop; 1466 goto out_filestream_unmount;
1394 1467
1395 root = igrab(VFS_I(mp->m_rootip)); 1468 root = igrab(VFS_I(mp->m_rootip));
1396 if (!root) { 1469 if (!root) {
@@ -1408,8 +1481,7 @@ xfs_fs_fill_super(
1408 } 1481 }
1409 1482
1410 return 0; 1483 return 0;
1411 out_syncd_stop: 1484
1412 xfs_syncd_stop(mp);
1413 out_filestream_unmount: 1485 out_filestream_unmount:
1414 xfs_filestream_unmount(mp); 1486 xfs_filestream_unmount(mp);
1415 out_free_sb: 1487 out_free_sb:
@@ -1429,7 +1501,6 @@ out_destroy_workqueues:
1429 out_unmount: 1501 out_unmount:
1430 xfs_filestream_unmount(mp); 1502 xfs_filestream_unmount(mp);
1431 xfs_unmountfs(mp); 1503 xfs_unmountfs(mp);
1432 xfs_syncd_stop(mp);
1433 goto out_free_sb; 1504 goto out_free_sb;
1434} 1505}
1435 1506
@@ -1625,16 +1696,6 @@ STATIC int __init
1625xfs_init_workqueues(void) 1696xfs_init_workqueues(void)
1626{ 1697{
1627 /* 1698 /*
1628 * We never want the same work item to run twice; reclaiming inodes
1629 * or idling the log is not going to get any faster with multiple CPUs
1630 * competing for resources. Use the default large max_active value
1631 * so that even lots of filesystems can perform these tasks in parallel.
1632 */
1633 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
1634 if (!xfs_syncd_wq)
1635 return -ENOMEM;
1636
1637 /*
1638 * The allocation workqueue can be used in memory reclaim situations 1699 * The allocation workqueue can be used in memory reclaim situations
1639 * (writepage path), and parallelism is only limited by the number of 1700 * (writepage path), and parallelism is only limited by the number of
1640 * AGs in all the filesystems mounted. Hence use the default large 1701 * AGs in all the filesystems mounted. Hence use the default large
@@ -1642,20 +1703,15 @@ xfs_init_workqueues(void)
1642 */ 1703 */
1643 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); 1704 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
1644 if (!xfs_alloc_wq) 1705 if (!xfs_alloc_wq)
1645 goto out_destroy_syncd; 1706 return -ENOMEM;
1646 1707
1647 return 0; 1708 return 0;
1648
1649out_destroy_syncd:
1650 destroy_workqueue(xfs_syncd_wq);
1651 return -ENOMEM;
1652} 1709}
1653 1710
1654STATIC void 1711STATIC void
1655xfs_destroy_workqueues(void) 1712xfs_destroy_workqueues(void)
1656{ 1713{
1657 destroy_workqueue(xfs_alloc_wq); 1714 destroy_workqueue(xfs_alloc_wq);
1658 destroy_workqueue(xfs_syncd_wq);
1659} 1715}
1660 1716
1661STATIC int __init 1717STATIC int __init
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 9de4a920ba05..bbe3d15a7904 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -74,6 +74,7 @@ struct block_device;
74 74
75extern __uint64_t xfs_max_file_offset(unsigned int); 75extern __uint64_t xfs_max_file_offset(unsigned int);
76 76
77extern void xfs_flush_inodes(struct xfs_mount *mp);
77extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 78extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
78extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); 79extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
79extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); 80extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa438..2801b5ce6cdb 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -202,6 +202,15 @@ static ctl_table xfs_table[] = {
202 .extra1 = &xfs_params.fstrm_timer.min, 202 .extra1 = &xfs_params.fstrm_timer.min,
203 .extra2 = &xfs_params.fstrm_timer.max, 203 .extra2 = &xfs_params.fstrm_timer.max,
204 }, 204 },
205 {
206 .procname = "speculative_prealloc_lifetime",
207 .data = &xfs_params.eofb_timer.val,
208 .maxlen = sizeof(int),
209 .mode = 0644,
210 .proc_handler = proc_dointvec_minmax,
211 .extra1 = &xfs_params.eofb_timer.min,
212 .extra2 = &xfs_params.eofb_timer.max,
213 },
205 /* please keep this the last entry */ 214 /* please keep this the last entry */
206#ifdef CONFIG_PROC_FS 215#ifdef CONFIG_PROC_FS
207 { 216 {
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index b9937d450f8e..bd8e157c20ef 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -47,6 +47,7 @@ typedef struct xfs_param {
47 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ 47 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
48 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ 48 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
49 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ 49 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
50 xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */
50} xfs_param_t; 51} xfs_param_t;
51 52
52/* 53/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7d36ccf57f93..2e137d4a85ae 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); 96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
97DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); 97DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
98DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); 98DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
99DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
100DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
99 101
100DECLARE_EVENT_CLASS(xfs_perag_class, 102DECLARE_EVENT_CLASS(xfs_perag_class,
101 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, 103 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
@@ -130,6 +132,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
130DEFINE_PERAG_REF_EVENT(xfs_perag_put); 132DEFINE_PERAG_REF_EVENT(xfs_perag_put);
131DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 133DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
132DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 134DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
135DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
136DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
133 137
134TRACE_EVENT(xfs_attr_list_node_descend, 138TRACE_EVENT(xfs_attr_list_node_descend,
135 TP_PROTO(struct xfs_attr_list_context *ctx, 139 TP_PROTO(struct xfs_attr_list_context *ctx,
@@ -585,6 +589,10 @@ DEFINE_INODE_EVENT(xfs_update_time);
585DEFINE_INODE_EVENT(xfs_dquot_dqalloc); 589DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
586DEFINE_INODE_EVENT(xfs_dquot_dqdetach); 590DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
587 591
592DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
593DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
594DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
595
588DECLARE_EVENT_CLASS(xfs_iref_class, 596DECLARE_EVENT_CLASS(xfs_iref_class,
589 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), 597 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
590 TP_ARGS(ip, caller_ip), 598 TP_ARGS(ip, caller_ip),
@@ -1496,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
1496DEFINE_DIR2_EVENT(xfs_dir2_node_removename); 1504DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
1497DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); 1505DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
1498 1506
1507DECLARE_EVENT_CLASS(xfs_attr_class,
1508 TP_PROTO(struct xfs_da_args *args),
1509 TP_ARGS(args),
1510 TP_STRUCT__entry(
1511 __field(dev_t, dev)
1512 __field(xfs_ino_t, ino)
1513 __dynamic_array(char, name, args->namelen)
1514 __field(int, namelen)
1515 __field(int, valuelen)
1516 __field(xfs_dahash_t, hashval)
1517 __field(int, op_flags)
1518 ),
1519 TP_fast_assign(
1520 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1521 __entry->ino = args->dp->i_ino;
1522 if (args->namelen)
1523 memcpy(__get_str(name), args->name, args->namelen);
1524 __entry->namelen = args->namelen;
1525 __entry->valuelen = args->valuelen;
1526 __entry->hashval = args->hashval;
1527 __entry->op_flags = args->op_flags;
1528 ),
1529 TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
1530 "hashval 0x%x op_flags %s",
1531 MAJOR(__entry->dev), MINOR(__entry->dev),
1532 __entry->ino,
1533 __entry->namelen,
1534 __entry->namelen ? __get_str(name) : NULL,
1535 __entry->namelen,
1536 __entry->valuelen,
1537 __entry->hashval,
1538 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
1539)
1540
1499#define DEFINE_ATTR_EVENT(name) \ 1541#define DEFINE_ATTR_EVENT(name) \
1500DEFINE_EVENT(xfs_da_class, name, \ 1542DEFINE_EVENT(xfs_attr_class, name, \
1501 TP_PROTO(struct xfs_da_args *args), \ 1543 TP_PROTO(struct xfs_da_args *args), \
1502 TP_ARGS(args)) 1544 TP_ARGS(args))
1503DEFINE_ATTR_EVENT(xfs_attr_sf_add); 1545DEFINE_ATTR_EVENT(xfs_attr_sf_add);
@@ -1511,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
1511DEFINE_ATTR_EVENT(xfs_attr_leaf_add); 1553DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
1512DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); 1554DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
1513DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); 1555DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
1556DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
1514DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); 1557DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
1515DEFINE_ATTR_EVENT(xfs_attr_leaf_create); 1558DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
1559DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
1560DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
1516DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); 1561DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
1517DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); 1562DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
1563DEFINE_ATTR_EVENT(xfs_attr_leaf_remove);
1518DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); 1564DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
1519DEFINE_ATTR_EVENT(xfs_attr_leaf_split); 1565DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
1520DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); 1566DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
@@ -1526,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
1526DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); 1572DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
1527DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); 1573DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
1528DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); 1574DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
1575DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
1529 1576
1530DEFINE_ATTR_EVENT(xfs_attr_node_addname); 1577DEFINE_ATTR_EVENT(xfs_attr_node_addname);
1578DEFINE_ATTR_EVENT(xfs_attr_node_get);
1531DEFINE_ATTR_EVENT(xfs_attr_node_lookup); 1579DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
1532DEFINE_ATTR_EVENT(xfs_attr_node_replace); 1580DEFINE_ATTR_EVENT(xfs_attr_node_replace);
1533DEFINE_ATTR_EVENT(xfs_attr_node_removename); 1581DEFINE_ATTR_EVENT(xfs_attr_node_removename);
1534 1582
1583DEFINE_ATTR_EVENT(xfs_attr_fillstate);
1584DEFINE_ATTR_EVENT(xfs_attr_refillstate);
1585
1586DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
1587DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
1588DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
1589
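Each DEFINE_ATTR_EVENT(name) instance above generates a trace_<name>() static call point taking the xfs_da_args under operation; the new events are presumably fired at the top of the matching attr routines, roughly like this sketch (the function body shown is illustrative, not from the patch):

/* Sketch: firing one of the new attr events from its routine. */
STATIC int
xfs_attr_leaf_get(
	struct xfs_da_args	*args)
{
	trace_xfs_attr_leaf_get(args);	/* records dev, ino, name, lengths */
	/* ... leaf lookup elided ... */
	return 0;
}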
1535#define DEFINE_DA_EVENT(name) \ 1590#define DEFINE_DA_EVENT(name) \
1536DEFINE_EVENT(xfs_da_class, name, \ 1591DEFINE_EVENT(xfs_da_class, name, \
1537 TP_PROTO(struct xfs_da_args *args), \ 1592 TP_PROTO(struct xfs_da_args *args), \
@@ -1550,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split);
1550DEFINE_DA_EVENT(xfs_da_node_remove); 1605DEFINE_DA_EVENT(xfs_da_node_remove);
1551DEFINE_DA_EVENT(xfs_da_node_rebalance); 1606DEFINE_DA_EVENT(xfs_da_node_rebalance);
1552DEFINE_DA_EVENT(xfs_da_node_unbalance); 1607DEFINE_DA_EVENT(xfs_da_node_unbalance);
1608DEFINE_DA_EVENT(xfs_da_node_toosmall);
1553DEFINE_DA_EVENT(xfs_da_swap_lastblock); 1609DEFINE_DA_EVENT(xfs_da_swap_lastblock);
1554DEFINE_DA_EVENT(xfs_da_grow_inode); 1610DEFINE_DA_EVENT(xfs_da_grow_inode);
1555DEFINE_DA_EVENT(xfs_da_shrink_inode); 1611DEFINE_DA_EVENT(xfs_da_shrink_inode);
1612DEFINE_DA_EVENT(xfs_da_fixhashpath);
1613DEFINE_DA_EVENT(xfs_da_path_shift);
1556 1614
1557DECLARE_EVENT_CLASS(xfs_dir2_space_class, 1615DECLARE_EVENT_CLASS(xfs_dir2_space_class,
1558 TP_PROTO(struct xfs_da_args *args, int idx), 1616 TP_PROTO(struct xfs_da_args *args, int idx),
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index db056544cbb5..c6c0601abd7a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -464,10 +464,7 @@ xfs_trans_get_buf(
464 int numblks, 464 int numblks,
465 uint flags) 465 uint flags)
466{ 466{
467 struct xfs_buf_map map = { 467 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
468 .bm_bn = blkno,
469 .bm_len = numblks,
470 };
471 return xfs_trans_get_buf_map(tp, target, &map, 1, flags); 468 return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
472} 469}
473 470
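DEFINE_SINGLE_BUF_MAP() replaces the open-coded designated initializer deleted above; judging from that initializer, it presumably expands to something like:

/* Presumed expansion, mirroring the initializer it replaces. */
#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblks)	\
	struct xfs_buf_map	(map) = { .bm_bn = (blkno), .bm_len = (numblks) }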
@@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp,
476 struct xfs_buftarg *target, 473 struct xfs_buftarg *target,
477 struct xfs_buf_map *map, int nmaps, 474 struct xfs_buf_map *map, int nmaps,
478 xfs_buf_flags_t flags, 475 xfs_buf_flags_t flags,
479 struct xfs_buf **bpp); 476 struct xfs_buf **bpp,
477 const struct xfs_buf_ops *ops);
480 478
481static inline int 479static inline int
482xfs_trans_read_buf( 480xfs_trans_read_buf(
@@ -486,13 +484,12 @@ xfs_trans_read_buf(
486 xfs_daddr_t blkno, 484 xfs_daddr_t blkno,
487 int numblks, 485 int numblks,
488 xfs_buf_flags_t flags, 486 xfs_buf_flags_t flags,
489 struct xfs_buf **bpp) 487 struct xfs_buf **bpp,
488 const struct xfs_buf_ops *ops)
490{ 489{
491 struct xfs_buf_map map = { 490 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
492 .bm_bn = blkno, 491 return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
493 .bm_len = numblks, 492 flags, bpp, ops);
494 };
495 return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp);
496} 493}
497 494
498struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); 495struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6311b99c267f..4fc17d479d42 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -257,7 +257,8 @@ xfs_trans_read_buf_map(
257 struct xfs_buf_map *map, 257 struct xfs_buf_map *map,
258 int nmaps, 258 int nmaps,
259 xfs_buf_flags_t flags, 259 xfs_buf_flags_t flags,
260 struct xfs_buf **bpp) 260 struct xfs_buf **bpp,
261 const struct xfs_buf_ops *ops)
261{ 262{
262 xfs_buf_t *bp; 263 xfs_buf_t *bp;
263 xfs_buf_log_item_t *bip; 264 xfs_buf_log_item_t *bip;
@@ -265,7 +266,7 @@ xfs_trans_read_buf_map(
265 266
266 *bpp = NULL; 267 *bpp = NULL;
267 if (!tp) { 268 if (!tp) {
268 bp = xfs_buf_read_map(target, map, nmaps, flags); 269 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
269 if (!bp) 270 if (!bp)
270 return (flags & XBF_TRYLOCK) ? 271 return (flags & XBF_TRYLOCK) ?
271 EAGAIN : XFS_ERROR(ENOMEM); 272 EAGAIN : XFS_ERROR(ENOMEM);
@@ -312,7 +313,9 @@ xfs_trans_read_buf_map(
312 if (!(XFS_BUF_ISDONE(bp))) { 313 if (!(XFS_BUF_ISDONE(bp))) {
313 trace_xfs_trans_read_buf_io(bp, _RET_IP_); 314 trace_xfs_trans_read_buf_io(bp, _RET_IP_);
314 ASSERT(!XFS_BUF_ISASYNC(bp)); 315 ASSERT(!XFS_BUF_ISASYNC(bp));
316 ASSERT(bp->b_iodone == NULL);
315 XFS_BUF_READ(bp); 317 XFS_BUF_READ(bp);
318 bp->b_ops = ops;
316 xfsbdstrat(tp->t_mountp, bp); 319 xfsbdstrat(tp->t_mountp, bp);
317 error = xfs_buf_iowait(bp); 320 error = xfs_buf_iowait(bp);
318 if (error) { 321 if (error) {
@@ -349,7 +352,7 @@ xfs_trans_read_buf_map(
349 return 0; 352 return 0;
350 } 353 }
351 354
352 bp = xfs_buf_read_map(target, map, nmaps, flags); 355 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
353 if (bp == NULL) { 356 if (bp == NULL) {
354 *bpp = NULL; 357 *bpp = NULL;
355 return (flags & XBF_TRYLOCK) ? 358 return (flags & XBF_TRYLOCK) ?
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2a5c637344b4..d95f565a390e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -47,6 +47,7 @@
47#include "xfs_filestream.h" 47#include "xfs_filestream.h"
48#include "xfs_vnodeops.h" 48#include "xfs_vnodeops.h"
49#include "xfs_trace.h" 49#include "xfs_trace.h"
50#include "xfs_icache.h"
50 51
51/* 52/*
52 * The maximum pathlen is 1024 bytes. Since the minimum file system 53 * The maximum pathlen is 1024 bytes. Since the minimum file system
@@ -79,7 +80,7 @@ xfs_readlink_bmap(
79 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); 80 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
80 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 81 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
81 82
82 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); 83 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
83 if (!bp) 84 if (!bp)
84 return XFS_ERROR(ENOMEM); 85 return XFS_ERROR(ENOMEM);
85 error = bp->b_error; 86 error = bp->b_error;
@@ -150,7 +151,7 @@ xfs_readlink(
150 * when the link count isn't zero and by xfs_dm_punch_hole() when 151 * when the link count isn't zero and by xfs_dm_punch_hole() when
151 * punching a hole to EOF. 152 * punching a hole to EOF.
152 */ 153 */
153STATIC int 154int
154xfs_free_eofblocks( 155xfs_free_eofblocks(
155 xfs_mount_t *mp, 156 xfs_mount_t *mp,
156 xfs_inode_t *ip, 157 xfs_inode_t *ip,
@@ -199,7 +200,7 @@ xfs_free_eofblocks(
199 if (need_iolock) { 200 if (need_iolock) {
200 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 201 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
201 xfs_trans_cancel(tp, 0); 202 xfs_trans_cancel(tp, 0);
202 return 0; 203 return EAGAIN;
203 } 204 }
204 } 205 }
205 206
@@ -237,6 +238,8 @@ xfs_free_eofblocks(
237 } else { 238 } else {
238 error = xfs_trans_commit(tp, 239 error = xfs_trans_commit(tp,
239 XFS_TRANS_RELEASE_LOG_RES); 240 XFS_TRANS_RELEASE_LOG_RES);
241 if (!error)
242 xfs_inode_clear_eofblocks_tag(ip);
240 } 243 }
241 244
242 xfs_iunlock(ip, XFS_ILOCK_EXCL); 245 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -425,19 +428,18 @@ xfs_release(
425 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 428 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
426 if (truncated) { 429 if (truncated) {
427 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); 430 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
-		if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
-			xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
+		if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
+			error = -filemap_flush(VFS_I(ip)->i_mapping);
+			if (error)
+				return error;
+		}
430 } 436 }
431 } 437 }
432 438
433 if (ip->i_d.di_nlink == 0) 439 if (ip->i_d.di_nlink == 0)
434 return 0; 440 return 0;
435 441
-	if ((S_ISREG(ip->i_d.di_mode) &&
-	     (VFS_I(ip)->i_size > 0 ||
-	      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
-	     (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
-	    (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+	if (xfs_can_free_eofblocks(ip, false)) {
441 443
442 /* 444 /*
443 * If we can't get the iolock just skip truncating the blocks 445 * If we can't get the iolock just skip truncating the blocks
@@ -464,7 +466,7 @@ xfs_release(
464 return 0; 466 return 0;
465 467
466 error = xfs_free_eofblocks(mp, ip, true); 468 error = xfs_free_eofblocks(mp, ip, true);
467 if (error) 469 if (error && error != EAGAIN)
468 return error; 470 return error;
469 471
470 /* delalloc blocks after truncation means it really is dirty */ 472 /* delalloc blocks after truncation means it really is dirty */
@@ -513,13 +515,12 @@ xfs_inactive(
513 goto out; 515 goto out;
514 516
515 if (ip->i_d.di_nlink != 0) { 517 if (ip->i_d.di_nlink != 0) {
-		if ((S_ISREG(ip->i_d.di_mode) &&
-		     (VFS_I(ip)->i_size > 0 ||
-		      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
-		     (ip->i_df.if_flags & XFS_IFEXTENTS) &&
-		     (!(ip->i_d.di_flags &
-			(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
-		      ip->i_delayed_blks != 0))) {
+		/*
+		 * force is true because we are evicting an inode from the
+		 * cache. Post-eof blocks must be freed, lest we end up with
+		 * broken free space accounting.
+		 */
+		if (xfs_can_free_eofblocks(ip, true)) {
523 error = xfs_free_eofblocks(mp, ip, false); 524 error = xfs_free_eofblocks(mp, ip, false);
524 if (error) 525 if (error)
525 return VN_INACTIVE_CACHE; 526 return VN_INACTIVE_CACHE;
@@ -777,7 +778,7 @@ xfs_create(
777 XFS_TRANS_PERM_LOG_RES, log_count); 778 XFS_TRANS_PERM_LOG_RES, log_count);
778 if (error == ENOSPC) { 779 if (error == ENOSPC) {
779 /* flush outstanding delalloc blocks and retry */ 780 /* flush outstanding delalloc blocks and retry */
780 xfs_flush_inodes(dp); 781 xfs_flush_inodes(mp);
781 error = xfs_trans_reserve(tp, resblks, log_res, 0, 782 error = xfs_trans_reserve(tp, resblks, log_res, 0,
782 XFS_TRANS_PERM_LOG_RES, log_count); 783 XFS_TRANS_PERM_LOG_RES, log_count);
783 } 784 }
@@ -1957,12 +1958,11 @@ xfs_free_file_space(
 
 	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
 	ioffset = offset & ~(rounding - 1);
-
-	if (VN_CACHED(VFS_I(ip)) != 0) {
-		error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
-		if (error)
-			goto out_unlock_iolock;
-	}
+	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+					      ioffset, -1);
+	if (error)
+		goto out_unlock_iolock;
+	truncate_pagecache_range(VFS_I(ip), ioffset, -1);
 
 	/*
 	 * Need to zero the stuff we're not freeing, on disk.
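
The hunk above swaps the XFS-private flush-and-invalidate helper for the generic pair: filemap_write_and_wait_range() pushes dirty pages in the range to disk, then truncate_pagecache_range() drops them, so no stale cached copy of the punched range survives. The context line kept above it is worth a second look too: because rounding is a power of two, offset & ~(rounding - 1) rounds down to the flush granularity. A tiny runnable check of that mask arithmetic, with invented values:

#include <stdio.h>

int main(void)
{
	unsigned long rounding = 4096;	/* e.g. max(block size, page size) */
	unsigned long offsets[] = { 0, 1, 4095, 4096, 10000 };

	for (int i = 0; i < 5; i++) {
		/* works only because rounding is a power of two */
		unsigned long ioffset = offsets[i] & ~(rounding - 1);

		printf("offset %5lu -> rounded down to %5lu\n",
		       offsets[i], ioffset);
	}
	return 0;
}
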
@@ -2095,6 +2095,73 @@ xfs_free_file_space(
 	return error;
 }
 
+
+STATIC int
+xfs_zero_file_space(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len,
+	int			attr_flags)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			granularity;
+	xfs_off_t		start_boundary;
+	xfs_off_t		end_boundary;
+	int			error;
+
+	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+
+	/*
+	 * Round the range of extents we are going to convert inwards.  If the
+	 * offset is aligned, then it doesn't get changed so we zero from the
+	 * start of the block offset points to.
+	 */
+	start_boundary = round_up(offset, granularity);
+	end_boundary = round_down(offset + len, granularity);
+
+	ASSERT(start_boundary >= offset);
+	ASSERT(end_boundary <= offset + len);
+
+	if (!(attr_flags & XFS_ATTR_NOLOCK))
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	if (start_boundary < end_boundary - 1) {
+		/* punch out the page cache over the conversion range */
+		truncate_pagecache_range(VFS_I(ip), start_boundary,
+					 end_boundary - 1);
+		/* convert the blocks */
+		error = xfs_alloc_file_space(ip, start_boundary,
+					end_boundary - start_boundary - 1,
+					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
+					attr_flags);
+		if (error)
+			goto out_unlock;
+
+		/* We've handled the interior of the range, now for the edges */
+		if (start_boundary != offset)
+			error = xfs_iozero(ip, offset, start_boundary - offset);
+		if (error)
+			goto out_unlock;
+
+		if (end_boundary != offset + len)
+			error = xfs_iozero(ip, end_boundary,
+					   offset + len - end_boundary);
+
+	} else {
+		/*
+		 * It's either a sub-granularity range or the range spanned lies
+		 * partially across two adjacent blocks.
+		 */
+		error = xfs_iozero(ip, offset, len);
+	}
+
+out_unlock:
+	if (!(attr_flags & XFS_ATTR_NOLOCK))
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+
+}
+
 /*
  * xfs_change_file_space()
  * This routine allocates or frees disk space for the given file.
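
The new xfs_zero_file_space() splits a zeroing request into an aligned interior, which is dropped from the page cache and converted to unwritten extents in one go, and up to two unaligned edges, which are zeroed through the page cache with xfs_iozero(). The boundary arithmetic in isolation, with round_up/round_down reimplemented for power-of-two granularity and arbitrary sample numbers:

#include <stdio.h>

/* Power-of-two rounding helpers, as used by the new function above. */
static long long round_down_p2(long long x, long long g)
{
	return x & ~(g - 1);
}

static long long round_up_p2(long long x, long long g)
{
	return round_down_p2(x + g - 1, g);
}

int main(void)
{
	long long granularity = 4096;
	long long offset = 1000, len = 20000;

	long long start_boundary = round_up_p2(offset, granularity);
	long long end_boundary = round_down_p2(offset + len, granularity);

	if (start_boundary < end_boundary - 1) {
		/* interior: converted to unwritten extents, cache dropped */
		printf("convert [%lld, %lld)\n", start_boundary, end_boundary);
		/* edges: zeroed through the page cache */
		if (start_boundary != offset)
			printf("zero head [%lld, %lld)\n",
			       offset, start_boundary);
		if (end_boundary != offset + len)
			printf("zero tail [%lld, %lld)\n",
			       end_boundary, offset + len);
	} else {
		/* sub-granularity range: just zero it all */
		printf("zero [%lld, %lld)\n", offset, offset + len);
	}
	return 0;
}

For offset 1000 and length 20000 this converts [4096, 20480) and zeroes the two ragged edges, matching the start_boundary/end_boundary logic above.
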
@@ -2120,10 +2187,8 @@ xfs_change_file_space(
 	xfs_fsize_t	fsize;
 	int		setprealloc;
 	xfs_off_t	startoffset;
-	xfs_off_t	llen;
 	xfs_trans_t	*tp;
 	struct iattr	iattr;
-	int		prealloc_type;
 
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
@@ -2141,12 +2206,30 @@ xfs_change_file_space(
 		return XFS_ERROR(EINVAL);
 	}
 
-	llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
+	/*
+	 * length of <= 0 for resv/unresv/zero is invalid. length for
+	 * alloc/free is ignored completely and we have no idea what userspace
+	 * might have set it to, so set it to zero to allow range
+	 * checks to pass.
+	 */
+	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_UNRESVSP64:
+		if (bf->l_len <= 0)
+			return XFS_ERROR(EINVAL);
+		break;
+	default:
+		bf->l_len = 0;
+		break;
+	}
 
 	if (bf->l_start < 0 ||
 	    bf->l_start > mp->m_super->s_maxbytes ||
-	    bf->l_start + llen < 0 ||
-	    bf->l_start + llen > mp->m_super->s_maxbytes)
+	    bf->l_start + bf->l_len < 0 ||
+	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
 		return XFS_ERROR(EINVAL);
 
 	bf->l_whence = 0;
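
The validation rewrite drops the old llen variable (which quietly mapped a positive l_len to l_len - 1) and checks l_len directly: commands that honour a length reject l_len <= 0, while ALLOCSP/FREESP, whose length field is ignored, have it zeroed so a garbage value from userspace cannot trip the range checks. The resulting checks in runnable form; the s_maxbytes stand-in value is invented:

#include <stdbool.h>
#include <stdio.h>

#define MAXBYTES (1LL << 33)	/* stand-in for mp->m_super->s_maxbytes */

/* Mirrors the rewritten checks; len has already been validated/zeroed. */
static bool range_ok(long long start, long long len)
{
	if (start < 0 ||
	    start > MAXBYTES ||
	    start + len < 0 ||		/* rejects a range that goes negative */
	    start + len >= MAXBYTES)
		return false;
	return true;
}

int main(void)
{
	printf("start=0, len=4096      -> %s\n",
	       range_ok(0, 4096) ? "ok" : "EINVAL");
	printf("start=-1, len=4096     -> %s\n",
	       range_ok(-1, 4096) ? "ok" : "EINVAL");
	printf("start=max-1, len=1     -> %s\n",
	       range_ok(MAXBYTES - 1, 1) ? "ok" : "EINVAL");
	return 0;
}
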
@@ -2154,29 +2237,20 @@ xfs_change_file_space(
 	startoffset = bf->l_start;
 	fsize = XFS_ISIZE(ip);
 
-	/*
-	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
-	 * file space.
-	 * These calls do NOT zero the data space allocated to the file,
-	 * nor do they change the file size.
-	 *
-	 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
-	 * space.
-	 * These calls cause the new file data to be zeroed and the file
-	 * size to be changed.
-	 */
 	setprealloc = clrprealloc = 0;
-	prealloc_type = XFS_BMAPI_PREALLOC;
-
 	switch (cmd) {
 	case XFS_IOC_ZERO_RANGE:
-		prealloc_type |= XFS_BMAPI_CONVERT;
-		xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
-		/* FALLTHRU */
+		error = xfs_zero_file_space(ip, startoffset, bf->l_len,
+					    attr_flags);
+		if (error)
+			return error;
+		setprealloc = 1;
+		break;
+
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
 		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-					     prealloc_type, attr_flags);
+					     XFS_BMAPI_PREALLOC, attr_flags);
 		if (error)
 			return error;
 		setprealloc = 1;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 447e146b2ba6..5163022d9808 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -48,14 +48,9 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
-void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
-		xfs_off_t last, int fiopt);
-int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
-		xfs_off_t last, int fiopt);
-int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
-		xfs_off_t last, uint64_t flags, int fiopt);
-int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
 
+int xfs_iozero(struct xfs_inode *, loff_t, size_t);
 int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
 
 #endif	/* _XFS_VNODEOPS_H */