aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/Locking3
-rw-r--r--Documentation/filesystems/vfs.txt4
-rw-r--r--arch/alpha/include/asm/posix_types.h3
-rw-r--r--arch/arm/include/asm/posix_types.h3
-rw-r--r--arch/avr32/include/asm/posix_types.h3
-rw-r--r--arch/blackfin/include/asm/posix_types.h3
-rw-r--r--arch/cris/include/asm/posix_types.h3
-rw-r--r--arch/frv/include/asm/posix_types.h3
-rw-r--r--arch/h8300/include/asm/posix_types.h3
-rw-r--r--arch/ia64/include/asm/posix_types.h3
-rw-r--r--arch/ia64/kernel/perfmon.c10
-rw-r--r--arch/ia64/kernel/sys_ia64.c19
-rw-r--r--arch/m32r/include/asm/posix_types.h3
-rw-r--r--arch/m68k/include/asm/posix_types.h3
-rw-r--r--arch/mips/include/asm/posix_types.h5
-rw-r--r--arch/mips/include/asm/stat.h6
-rw-r--r--arch/mn10300/include/asm/posix_types.h3
-rw-r--r--arch/parisc/include/asm/posix_types.h3
-rw-r--r--arch/parisc/include/asm/stat.h4
-rw-r--r--arch/powerpc/include/asm/posix_types.h3
-rw-r--r--arch/powerpc/include/asm/stat.h4
-rw-r--r--arch/s390/include/asm/posix_types.h3
-rw-r--r--arch/sh/include/asm/posix_types_32.h2
-rw-r--r--arch/sh/include/asm/posix_types_64.h2
-rw-r--r--arch/sparc/include/asm/posix_types.h5
-rw-r--r--arch/sparc/kernel/sys_sparc_64.c11
-rw-r--r--arch/tile/include/asm/compat.h1
-rw-r--r--arch/x86/include/asm/posix_types_32.h3
-rw-r--r--drivers/base/soc.c2
-rw-r--r--drivers/gpu/drm/i810/i810_dma.c4
-rw-r--r--fs/9p/vfs_inode_dotl.c24
-rw-r--r--fs/affs/affs.h8
-rw-r--r--fs/aio.c6
-rw-r--r--fs/attr.c5
-rw-r--r--fs/binfmt_elf.c8
-rw-r--r--fs/binfmt_flat.c8
-rw-r--r--fs/btrfs/acl.c4
-rw-r--r--fs/btrfs/backref.c495
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/btrfs_inode.h50
-rw-r--r--fs/btrfs/check-integrity.c584
-rw-r--r--fs/btrfs/ctree.c861
-rw-r--r--fs/btrfs/ctree.h78
-rw-r--r--fs/btrfs/delayed-inode.c8
-rw-r--r--fs/btrfs/delayed-ref.c10
-rw-r--r--fs/btrfs/delayed-ref.h24
-rw-r--r--fs/btrfs/disk-io.c57
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/export.c15
-rw-r--r--fs/btrfs/extent-tree.c23
-rw-r--r--fs/btrfs/extent_io.c168
-rw-r--r--fs/btrfs/extent_io.h8
-rw-r--r--fs/btrfs/file.c78
-rw-r--r--fs/btrfs/free-space-cache.c52
-rw-r--r--fs/btrfs/inode.c317
-rw-r--r--fs/btrfs/ioctl.c50
-rw-r--r--fs/btrfs/ioctl.h33
-rw-r--r--fs/btrfs/ordered-data.c165
-rw-r--r--fs/btrfs/ordered-data.h13
-rw-r--r--fs/btrfs/print-tree.c3
-rw-r--r--fs/btrfs/reada.c5
-rw-r--r--fs/btrfs/scrub.c65
-rw-r--r--fs/btrfs/super.c117
-rw-r--r--fs/btrfs/transaction.c59
-rw-r--r--fs/btrfs/tree-log.c35
-rw-r--r--fs/btrfs/ulist.c38
-rw-r--r--fs/btrfs/ulist.h15
-rw-r--r--fs/btrfs/volumes.c306
-rw-r--r--fs/btrfs/volumes.h52
-rw-r--r--fs/btrfs/xattr.c1
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/ceph/export.c32
-rw-r--r--fs/compat.c33
-rw-r--r--fs/dcache.c20
-rw-r--r--fs/ecryptfs/inode.c48
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exportfs/expfs.c33
-rw-r--r--fs/ext4/Kconfig2
-rw-r--r--fs/ext4/balloc.c41
-rw-r--r--fs/ext4/bitmap.c83
-rw-r--r--fs/ext4/dir.c12
-rw-r--r--fs/ext4/ext4.h130
-rw-r--r--fs/ext4/ext4_extents.h24
-rw-r--r--fs/ext4/ext4_jbd2.c9
-rw-r--r--fs/ext4/ext4_jbd2.h7
-rw-r--r--fs/ext4/extents.c91
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/ialloc.c81
-rw-r--r--fs/ext4/inode.c119
-rw-r--r--fs/ext4/ioctl.c19
-rw-r--r--fs/ext4/mballoc.c30
-rw-r--r--fs/ext4/mmp.c44
-rw-r--r--fs/ext4/namei.c445
-rw-r--r--fs/ext4/resize.c71
-rw-r--r--fs/ext4/super.c253
-rw-r--r--fs/ext4/xattr.c92
-rw-r--r--fs/ext4/xattr.h4
-rw-r--r--fs/fat/inode.c9
-rw-r--r--fs/fcntl.c42
-rw-r--r--fs/file_table.c17
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/fuse/inode.c17
-rw-r--r--fs/gfs2/export.c17
-rw-r--r--fs/hpfs/alloc.c14
-rw-r--r--fs/hpfs/anode.c43
-rw-r--r--fs/hpfs/dir.c2
-rw-r--r--fs/hpfs/dnode.c10
-rw-r--r--fs/hpfs/ea.c60
-rw-r--r--fs/hpfs/hpfs.h289
-rw-r--r--fs/hpfs/hpfs_fn.h16
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/hpfs/map.c20
-rw-r--r--fs/hpfs/namei.c2
-rw-r--r--fs/hpfs/super.c4
-rw-r--r--fs/inode.c124
-rw-r--r--fs/internal.h3
-rw-r--r--fs/isofs/export.c13
-rw-r--r--fs/jbd2/Kconfig2
-rw-r--r--fs/jbd2/commit.c70
-rw-r--r--fs/jbd2/journal.c132
-rw-r--r--fs/jbd2/recovery.c126
-rw-r--r--fs/jbd2/revoke.c27
-rw-r--r--fs/jbd2/transaction.c4
-rw-r--r--fs/jffs2/jffs2_fs_sb.h4
-rw-r--r--fs/jffs2/os-linux.h7
-rw-r--r--fs/jffs2/super.c21
-rw-r--r--fs/jffs2/wbuf.c55
-rw-r--r--fs/lockd/svc.c145
-rw-r--r--fs/locks.c5
-rw-r--r--fs/namei.c177
-rw-r--r--fs/namespace.c142
-rw-r--r--fs/ncpfs/file.c6
-rw-r--r--fs/ncpfs/ncp_fs_sb.h10
-rw-r--r--fs/nfs/callback.c13
-rw-r--r--fs/nfs/dir.c56
-rw-r--r--fs/nfs/file.c77
-rw-r--r--fs/nfsd/auth.c2
-rw-r--r--fs/nfsd/export.c8
-rw-r--r--fs/nfsd/fault_inject.c1
-rw-r--r--fs/nfsd/nfs4callback.c5
-rw-r--r--fs/nfsd/nfs4idmap.c4
-rw-r--r--fs/nfsd/nfs4recover.c4
-rw-r--r--fs/nfsd/nfs4state.c525
-rw-r--r--fs/nfsd/nfs4xdr.c62
-rw-r--r--fs/nfsd/nfsctl.c12
-rw-r--r--fs/nfsd/nfssvc.c23
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/xdr4.h6
-rw-r--r--fs/nilfs2/namei.c22
-rw-r--r--fs/notify/fsnotify.c12
-rw-r--r--fs/ntfs/file.c4
-rw-r--r--fs/ocfs2/blockcheck.c42
-rw-r--r--fs/ocfs2/dlm/dlmast.c2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h6
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c2
-rw-r--r--fs/ocfs2/export.c19
-rw-r--r--fs/ocfs2/inode.c13
-rw-r--r--fs/ocfs2/ioctl.c31
-rw-r--r--fs/ocfs2/move_extents.c6
-rw-r--r--fs/ocfs2/namei.c5
-rw-r--r--fs/ocfs2/symlink.c115
-rw-r--r--fs/ocfs2/symlink.h2
-rw-r--r--fs/open.c76
-rw-r--r--fs/pipe.c7
-rw-r--r--fs/pnode.c4
-rw-r--r--fs/proc_namespace.c4
-rw-r--r--fs/readdir.c33
-rw-r--r--fs/reiserfs/inode.c30
-rw-r--r--fs/reiserfs/journal.c15
-rw-r--r--fs/reiserfs/reiserfs.h12
-rw-r--r--fs/reiserfs/resize.c1
-rw-r--r--fs/reiserfs/super.c74
-rw-r--r--fs/signalfd.c7
-rw-r--r--fs/splice.c6
-rw-r--r--fs/statfs.c5
-rw-r--r--fs/sync.c5
-rw-r--r--fs/ubifs/dir.c11
-rw-r--r--fs/udf/namei.c14
-rw-r--r--fs/utimes.c5
-rw-r--r--fs/xattr.c20
-rw-r--r--fs/xfs/kmem.c10
-rw-r--r--fs/xfs/kmem.h21
-rw-r--r--fs/xfs/xfs_export.c23
-rw-r--r--fs/xfs/xfs_file.c7
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--include/asm-generic/posix_types.h4
-rw-r--r--include/linux/errno.h1
-rw-r--r--include/linux/exportfs.h4
-rw-r--r--include/linux/fs.h10
-rw-r--r--include/linux/fsnotify_backend.h2
-rw-r--r--include/linux/jbd2.h59
-rw-r--r--include/linux/jbd_common.h2
-rw-r--r--include/linux/lglock.h179
-rw-r--r--include/linux/mm.h2
-rw-r--r--include/linux/security.h40
-rw-r--r--include/linux/sunrpc/svc.h2
-rw-r--r--include/linux/sunrpc/svcauth.h10
-rw-r--r--include/linux/sunrpc/svcauth_gss.h1
-rw-r--r--include/linux/types.h2
-rw-r--r--ipc/shm.c7
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/lglock.c89
-rw-r--r--mm/cleancache.c6
-rw-r--r--mm/filemap.c69
-rw-r--r--mm/filemap_xip.c4
-rw-r--r--mm/internal.h4
-rw-r--r--mm/mmap.c54
-rw-r--r--mm/mremap.c26
-rw-r--r--mm/nommu.c35
-rw-r--r--mm/shmem.c6
-rw-r--r--mm/util.c30
-rw-r--r--net/sched/sch_atm.c2
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c61
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c27
-rw-r--r--net/sunrpc/rpcb_clnt.c12
-rw-r--r--net/sunrpc/svc.c23
-rw-r--r--net/sunrpc/svc_xprt.c4
-rw-r--r--net/sunrpc/svcauth_unix.c6
-rw-r--r--security/apparmor/lsm.c15
-rw-r--r--security/capability.c3
-rw-r--r--security/commoncap.c17
-rw-r--r--security/security.c51
-rw-r--r--security/selinux/hooks.c15
-rw-r--r--security/selinux/selinuxfs.c36
-rw-r--r--security/smack/smack_lsm.c15
228 files changed, 6650 insertions, 3116 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index d449e632e6a0..8e2da1e06e3b 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -61,6 +61,7 @@ ata *);
61 ssize_t (*listxattr) (struct dentry *, char *, size_t); 61 ssize_t (*listxattr) (struct dentry *, char *, size_t);
62 int (*removexattr) (struct dentry *, const char *); 62 int (*removexattr) (struct dentry *, const char *);
63 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); 63 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
64 void (*update_time)(struct inode *, struct timespec *, int);
64 65
65locking rules: 66locking rules:
66 all may block 67 all may block
@@ -87,6 +88,8 @@ getxattr: no
87listxattr: no 88listxattr: no
88removexattr: yes 89removexattr: yes
89fiemap: no 90fiemap: no
91update_time: no
92
90 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on 93 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
91victim. 94victim.
92 cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. 95 cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index ef19f91a0f12..efd23f481704 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -363,6 +363,7 @@ struct inode_operations {
363 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); 363 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
364 ssize_t (*listxattr) (struct dentry *, char *, size_t); 364 ssize_t (*listxattr) (struct dentry *, char *, size_t);
365 int (*removexattr) (struct dentry *, const char *); 365 int (*removexattr) (struct dentry *, const char *);
366 void (*update_time)(struct inode *, struct timespec *, int);
366}; 367};
367 368
368Again, all methods are called without any locks being held, unless 369Again, all methods are called without any locks being held, unless
@@ -471,6 +472,9 @@ otherwise noted.
471 removexattr: called by the VFS to remove an extended attribute from 472 removexattr: called by the VFS to remove an extended attribute from
472 a file. This method is called by removexattr(2) system call. 473 a file. This method is called by removexattr(2) system call.
473 474
475 update_time: called by the VFS to update a specific time or the i_version of
476 an inode. If this is not defined the VFS will update the inode itself
477 and call mark_inode_dirty_sync.
474 478
475The Address Space Object 479The Address Space Object
476======================== 480========================
diff --git a/arch/alpha/include/asm/posix_types.h b/arch/alpha/include/asm/posix_types.h
index 24779fc95994..5a8a48320efe 100644
--- a/arch/alpha/include/asm/posix_types.h
+++ b/arch/alpha/include/asm/posix_types.h
@@ -10,9 +10,6 @@
10typedef unsigned int __kernel_ino_t; 10typedef unsigned int __kernel_ino_t;
11#define __kernel_ino_t __kernel_ino_t 11#define __kernel_ino_t __kernel_ino_t
12 12
13typedef unsigned int __kernel_nlink_t;
14#define __kernel_nlink_t __kernel_nlink_t
15
16typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ 13typedef unsigned long __kernel_sigset_t; /* at least 32 bits */
17 14
18#include <asm-generic/posix_types.h> 15#include <asm-generic/posix_types.h>
diff --git a/arch/arm/include/asm/posix_types.h b/arch/arm/include/asm/posix_types.h
index efdf99045d87..d2de9cbbcd9b 100644
--- a/arch/arm/include/asm/posix_types.h
+++ b/arch/arm/include/asm/posix_types.h
@@ -22,9 +22,6 @@
22typedef unsigned short __kernel_mode_t; 22typedef unsigned short __kernel_mode_t;
23#define __kernel_mode_t __kernel_mode_t 23#define __kernel_mode_t __kernel_mode_t
24 24
25typedef unsigned short __kernel_nlink_t;
26#define __kernel_nlink_t __kernel_nlink_t
27
28typedef unsigned short __kernel_ipc_pid_t; 25typedef unsigned short __kernel_ipc_pid_t;
29#define __kernel_ipc_pid_t __kernel_ipc_pid_t 26#define __kernel_ipc_pid_t __kernel_ipc_pid_t
30 27
diff --git a/arch/avr32/include/asm/posix_types.h b/arch/avr32/include/asm/posix_types.h
index 74667bfc88cc..9ba9e749b3f3 100644
--- a/arch/avr32/include/asm/posix_types.h
+++ b/arch/avr32/include/asm/posix_types.h
@@ -17,9 +17,6 @@
17typedef unsigned short __kernel_mode_t; 17typedef unsigned short __kernel_mode_t;
18#define __kernel_mode_t __kernel_mode_t 18#define __kernel_mode_t __kernel_mode_t
19 19
20typedef unsigned short __kernel_nlink_t;
21#define __kernel_nlink_t __kernel_nlink_t
22
23typedef unsigned short __kernel_ipc_pid_t; 20typedef unsigned short __kernel_ipc_pid_t;
24#define __kernel_ipc_pid_t __kernel_ipc_pid_t 21#define __kernel_ipc_pid_t __kernel_ipc_pid_t
25 22
diff --git a/arch/blackfin/include/asm/posix_types.h b/arch/blackfin/include/asm/posix_types.h
index 41bc1875c4d7..1bd3436db6a7 100644
--- a/arch/blackfin/include/asm/posix_types.h
+++ b/arch/blackfin/include/asm/posix_types.h
@@ -10,9 +10,6 @@
10typedef unsigned short __kernel_mode_t; 10typedef unsigned short __kernel_mode_t;
11#define __kernel_mode_t __kernel_mode_t 11#define __kernel_mode_t __kernel_mode_t
12 12
13typedef unsigned short __kernel_nlink_t;
14#define __kernel_nlink_t __kernel_nlink_t
15
16typedef unsigned int __kernel_ipc_pid_t; 13typedef unsigned int __kernel_ipc_pid_t;
17#define __kernel_ipc_pid_t __kernel_ipc_pid_t 14#define __kernel_ipc_pid_t __kernel_ipc_pid_t
18 15
diff --git a/arch/cris/include/asm/posix_types.h b/arch/cris/include/asm/posix_types.h
index 234891c74e2b..ce4e51793151 100644
--- a/arch/cris/include/asm/posix_types.h
+++ b/arch/cris/include/asm/posix_types.h
@@ -15,9 +15,6 @@
15typedef unsigned short __kernel_mode_t; 15typedef unsigned short __kernel_mode_t;
16#define __kernel_mode_t __kernel_mode_t 16#define __kernel_mode_t __kernel_mode_t
17 17
18typedef unsigned short __kernel_nlink_t;
19#define __kernel_nlink_t __kernel_nlink_t
20
21typedef unsigned short __kernel_ipc_pid_t; 18typedef unsigned short __kernel_ipc_pid_t;
22#define __kernel_ipc_pid_t __kernel_ipc_pid_t 19#define __kernel_ipc_pid_t __kernel_ipc_pid_t
23 20
diff --git a/arch/frv/include/asm/posix_types.h b/arch/frv/include/asm/posix_types.h
index 3f34cb45fbb3..fe512af74a5a 100644
--- a/arch/frv/include/asm/posix_types.h
+++ b/arch/frv/include/asm/posix_types.h
@@ -10,9 +10,6 @@
10typedef unsigned short __kernel_mode_t; 10typedef unsigned short __kernel_mode_t;
11#define __kernel_mode_t __kernel_mode_t 11#define __kernel_mode_t __kernel_mode_t
12 12
13typedef unsigned short __kernel_nlink_t;
14#define __kernel_nlink_t __kernel_nlink_t
15
16typedef unsigned short __kernel_ipc_pid_t; 13typedef unsigned short __kernel_ipc_pid_t;
17#define __kernel_ipc_pid_t __kernel_ipc_pid_t 14#define __kernel_ipc_pid_t __kernel_ipc_pid_t
18 15
diff --git a/arch/h8300/include/asm/posix_types.h b/arch/h8300/include/asm/posix_types.h
index bc4c34efb1ad..91e62ba4c7b0 100644
--- a/arch/h8300/include/asm/posix_types.h
+++ b/arch/h8300/include/asm/posix_types.h
@@ -10,9 +10,6 @@
10typedef unsigned short __kernel_mode_t; 10typedef unsigned short __kernel_mode_t;
11#define __kernel_mode_t __kernel_mode_t 11#define __kernel_mode_t __kernel_mode_t
12 12
13typedef unsigned short __kernel_nlink_t;
14#define __kernel_nlink_t __kernel_nlink_t
15
16typedef unsigned short __kernel_ipc_pid_t; 13typedef unsigned short __kernel_ipc_pid_t;
17#define __kernel_ipc_pid_t __kernel_ipc_pid_t 14#define __kernel_ipc_pid_t __kernel_ipc_pid_t
18 15
diff --git a/arch/ia64/include/asm/posix_types.h b/arch/ia64/include/asm/posix_types.h
index 7323ab9467eb..99ee1d6510cf 100644
--- a/arch/ia64/include/asm/posix_types.h
+++ b/arch/ia64/include/asm/posix_types.h
@@ -1,9 +1,6 @@
1#ifndef _ASM_IA64_POSIX_TYPES_H 1#ifndef _ASM_IA64_POSIX_TYPES_H
2#define _ASM_IA64_POSIX_TYPES_H 2#define _ASM_IA64_POSIX_TYPES_H
3 3
4typedef unsigned int __kernel_nlink_t;
5#define __kernel_nlink_t __kernel_nlink_t
6
7typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ 4typedef unsigned long __kernel_sigset_t; /* at least 32 bits */
8 5
9#include <asm-generic/posix_types.h> 6#include <asm-generic/posix_types.h>
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index f00ba025375d..d7f558c1e711 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -604,12 +604,6 @@ pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
604 spin_unlock(&(x)->ctx_lock); 604 spin_unlock(&(x)->ctx_lock);
605} 605}
606 606
607static inline unsigned long
608pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
609{
610 return get_unmapped_area(file, addr, len, pgoff, flags);
611}
612
613/* forward declaration */ 607/* forward declaration */
614static const struct dentry_operations pfmfs_dentry_operations; 608static const struct dentry_operations pfmfs_dentry_operations;
615 609
@@ -2333,8 +2327,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
2333 down_write(&task->mm->mmap_sem); 2327 down_write(&task->mm->mmap_sem);
2334 2328
2335 /* find some free area in address space, must have mmap sem held */ 2329 /* find some free area in address space, must have mmap sem held */
2336 vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0); 2330 vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS);
2337 if (vma->vm_start == 0UL) { 2331 if (IS_ERR_VALUE(vma->vm_start)) {
2338 DPRINT(("Cannot find unmapped area for size %ld\n", size)); 2332 DPRINT(("Cannot find unmapped area for size %ld\n", size));
2339 up_write(&task->mm->mmap_sem); 2333 up_write(&task->mm->mmap_sem);
2340 goto error; 2334 goto error;
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
index 609d50056a6c..d9439ef2f661 100644
--- a/arch/ia64/kernel/sys_ia64.c
+++ b/arch/ia64/kernel/sys_ia64.c
@@ -171,22 +171,9 @@ asmlinkage unsigned long
171ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, 171ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags,
172 unsigned long new_addr) 172 unsigned long new_addr)
173{ 173{
174 extern unsigned long do_mremap (unsigned long addr, 174 addr = sys_mremap(addr, old_len, new_len, flags, new_addr);
175 unsigned long old_len, 175 if (!IS_ERR((void *) addr))
176 unsigned long new_len, 176 force_successful_syscall_return();
177 unsigned long flags,
178 unsigned long new_addr);
179
180 down_write(&current->mm->mmap_sem);
181 {
182 addr = do_mremap(addr, old_len, new_len, flags, new_addr);
183 }
184 up_write(&current->mm->mmap_sem);
185
186 if (IS_ERR((void *) addr))
187 return addr;
188
189 force_successful_syscall_return();
190 return addr; 177 return addr;
191} 178}
192 179
diff --git a/arch/m32r/include/asm/posix_types.h b/arch/m32r/include/asm/posix_types.h
index 0195850e1f88..236de26a409b 100644
--- a/arch/m32r/include/asm/posix_types.h
+++ b/arch/m32r/include/asm/posix_types.h
@@ -10,9 +10,6 @@
10typedef unsigned short __kernel_mode_t; 10typedef unsigned short __kernel_mode_t;
11#define __kernel_mode_t __kernel_mode_t 11#define __kernel_mode_t __kernel_mode_t
12 12
13typedef unsigned short __kernel_nlink_t;
14#define __kernel_nlink_t __kernel_nlink_t
15
16typedef unsigned short __kernel_ipc_pid_t; 13typedef unsigned short __kernel_ipc_pid_t;
17#define __kernel_ipc_pid_t __kernel_ipc_pid_t 14#define __kernel_ipc_pid_t __kernel_ipc_pid_t
18 15
diff --git a/arch/m68k/include/asm/posix_types.h b/arch/m68k/include/asm/posix_types.h
index 6373093be72b..cf4dbf70fdc7 100644
--- a/arch/m68k/include/asm/posix_types.h
+++ b/arch/m68k/include/asm/posix_types.h
@@ -10,9 +10,6 @@
10typedef unsigned short __kernel_mode_t; 10typedef unsigned short __kernel_mode_t;
11#define __kernel_mode_t __kernel_mode_t 11#define __kernel_mode_t __kernel_mode_t
12 12
13typedef unsigned short __kernel_nlink_t;
14#define __kernel_nlink_t __kernel_nlink_t
15
16typedef unsigned short __kernel_ipc_pid_t; 13typedef unsigned short __kernel_ipc_pid_t;
17#define __kernel_ipc_pid_t __kernel_ipc_pid_t 14#define __kernel_ipc_pid_t __kernel_ipc_pid_t
18 15
diff --git a/arch/mips/include/asm/posix_types.h b/arch/mips/include/asm/posix_types.h
index e0308dcca135..fa03ec3fbf89 100644
--- a/arch/mips/include/asm/posix_types.h
+++ b/arch/mips/include/asm/posix_types.h
@@ -17,11 +17,6 @@
17 * assume GCC is being used. 17 * assume GCC is being used.
18 */ 18 */
19 19
20#if (_MIPS_SZLONG == 64)
21typedef unsigned int __kernel_nlink_t;
22#define __kernel_nlink_t __kernel_nlink_t
23#endif
24
25typedef long __kernel_daddr_t; 20typedef long __kernel_daddr_t;
26#define __kernel_daddr_t __kernel_daddr_t 21#define __kernel_daddr_t __kernel_daddr_t
27 22
diff --git a/arch/mips/include/asm/stat.h b/arch/mips/include/asm/stat.h
index 6e00f751ab6d..fe9a4c3ec5a1 100644
--- a/arch/mips/include/asm/stat.h
+++ b/arch/mips/include/asm/stat.h
@@ -20,7 +20,7 @@ struct stat {
20 long st_pad1[3]; /* Reserved for network id */ 20 long st_pad1[3]; /* Reserved for network id */
21 ino_t st_ino; 21 ino_t st_ino;
22 mode_t st_mode; 22 mode_t st_mode;
23 nlink_t st_nlink; 23 __u32 st_nlink;
24 uid_t st_uid; 24 uid_t st_uid;
25 gid_t st_gid; 25 gid_t st_gid;
26 unsigned st_rdev; 26 unsigned st_rdev;
@@ -55,7 +55,7 @@ struct stat64 {
55 unsigned long long st_ino; 55 unsigned long long st_ino;
56 56
57 mode_t st_mode; 57 mode_t st_mode;
58 nlink_t st_nlink; 58 __u32 st_nlink;
59 59
60 uid_t st_uid; 60 uid_t st_uid;
61 gid_t st_gid; 61 gid_t st_gid;
@@ -96,7 +96,7 @@ struct stat {
96 unsigned long st_ino; 96 unsigned long st_ino;
97 97
98 mode_t st_mode; 98 mode_t st_mode;
99 nlink_t st_nlink; 99 __u32 st_nlink;
100 100
101 uid_t st_uid; 101 uid_t st_uid;
102 gid_t st_gid; 102 gid_t st_gid;
diff --git a/arch/mn10300/include/asm/posix_types.h b/arch/mn10300/include/asm/posix_types.h
index ab506181ec31..d31eeea480cf 100644
--- a/arch/mn10300/include/asm/posix_types.h
+++ b/arch/mn10300/include/asm/posix_types.h
@@ -20,9 +20,6 @@
20typedef unsigned short __kernel_mode_t; 20typedef unsigned short __kernel_mode_t;
21#define __kernel_mode_t __kernel_mode_t 21#define __kernel_mode_t __kernel_mode_t
22 22
23typedef unsigned short __kernel_nlink_t;
24#define __kernel_nlink_t __kernel_nlink_t
25
26typedef unsigned short __kernel_ipc_pid_t; 23typedef unsigned short __kernel_ipc_pid_t;
27#define __kernel_ipc_pid_t __kernel_ipc_pid_t 24#define __kernel_ipc_pid_t __kernel_ipc_pid_t
28 25
diff --git a/arch/parisc/include/asm/posix_types.h b/arch/parisc/include/asm/posix_types.h
index 5212b0357daf..b9344256f76b 100644
--- a/arch/parisc/include/asm/posix_types.h
+++ b/arch/parisc/include/asm/posix_types.h
@@ -10,9 +10,6 @@
10typedef unsigned short __kernel_mode_t; 10typedef unsigned short __kernel_mode_t;
11#define __kernel_mode_t __kernel_mode_t 11#define __kernel_mode_t __kernel_mode_t
12 12
13typedef unsigned short __kernel_nlink_t;
14#define __kernel_nlink_t __kernel_nlink_t
15
16typedef unsigned short __kernel_ipc_pid_t; 13typedef unsigned short __kernel_ipc_pid_t;
17#define __kernel_ipc_pid_t __kernel_ipc_pid_t 14#define __kernel_ipc_pid_t __kernel_ipc_pid_t
18 15
diff --git a/arch/parisc/include/asm/stat.h b/arch/parisc/include/asm/stat.h
index 9d5fbbc5c31f..d76fbda5d62c 100644
--- a/arch/parisc/include/asm/stat.h
+++ b/arch/parisc/include/asm/stat.h
@@ -7,7 +7,7 @@ struct stat {
7 unsigned int st_dev; /* dev_t is 32 bits on parisc */ 7 unsigned int st_dev; /* dev_t is 32 bits on parisc */
8 ino_t st_ino; /* 32 bits */ 8 ino_t st_ino; /* 32 bits */
9 mode_t st_mode; /* 16 bits */ 9 mode_t st_mode; /* 16 bits */
10 nlink_t st_nlink; /* 16 bits */ 10 unsigned short st_nlink; /* 16 bits */
11 unsigned short st_reserved1; /* old st_uid */ 11 unsigned short st_reserved1; /* old st_uid */
12 unsigned short st_reserved2; /* old st_gid */ 12 unsigned short st_reserved2; /* old st_gid */
13 unsigned int st_rdev; 13 unsigned int st_rdev;
@@ -42,7 +42,7 @@ struct hpux_stat64 {
42 unsigned int st_dev; /* dev_t is 32 bits on parisc */ 42 unsigned int st_dev; /* dev_t is 32 bits on parisc */
43 ino_t st_ino; /* 32 bits */ 43 ino_t st_ino; /* 32 bits */
44 mode_t st_mode; /* 16 bits */ 44 mode_t st_mode; /* 16 bits */
45 nlink_t st_nlink; /* 16 bits */ 45 unsigned short st_nlink; /* 16 bits */
46 unsigned short st_reserved1; /* old st_uid */ 46 unsigned short st_reserved1; /* old st_uid */
47 unsigned short st_reserved2; /* old st_gid */ 47 unsigned short st_reserved2; /* old st_gid */
48 unsigned int st_rdev; 48 unsigned int st_rdev;
diff --git a/arch/powerpc/include/asm/posix_types.h b/arch/powerpc/include/asm/posix_types.h
index f1393252bbda..2958c5b97b2d 100644
--- a/arch/powerpc/include/asm/posix_types.h
+++ b/arch/powerpc/include/asm/posix_types.h
@@ -16,9 +16,6 @@ typedef int __kernel_ssize_t;
16typedef long __kernel_ptrdiff_t; 16typedef long __kernel_ptrdiff_t;
17#define __kernel_size_t __kernel_size_t 17#define __kernel_size_t __kernel_size_t
18 18
19typedef unsigned short __kernel_nlink_t;
20#define __kernel_nlink_t __kernel_nlink_t
21
22typedef short __kernel_ipc_pid_t; 19typedef short __kernel_ipc_pid_t;
23#define __kernel_ipc_pid_t __kernel_ipc_pid_t 20#define __kernel_ipc_pid_t __kernel_ipc_pid_t
24#endif 21#endif
diff --git a/arch/powerpc/include/asm/stat.h b/arch/powerpc/include/asm/stat.h
index e4edc510b530..10cfb558e0fd 100644
--- a/arch/powerpc/include/asm/stat.h
+++ b/arch/powerpc/include/asm/stat.h
@@ -30,11 +30,11 @@ struct stat {
30 unsigned long st_dev; 30 unsigned long st_dev;
31 ino_t st_ino; 31 ino_t st_ino;
32#ifdef __powerpc64__ 32#ifdef __powerpc64__
33 nlink_t st_nlink; 33 unsigned short st_nlink;
34 mode_t st_mode; 34 mode_t st_mode;
35#else 35#else
36 mode_t st_mode; 36 mode_t st_mode;
37 nlink_t st_nlink; 37 unsigned short st_nlink;
38#endif 38#endif
39 uid_t st_uid; 39 uid_t st_uid;
40 gid_t st_gid; 40 gid_t st_gid;
diff --git a/arch/s390/include/asm/posix_types.h b/arch/s390/include/asm/posix_types.h
index edf8527ff08d..7be104c0f192 100644
--- a/arch/s390/include/asm/posix_types.h
+++ b/arch/s390/include/asm/posix_types.h
@@ -24,7 +24,6 @@ typedef unsigned short __kernel_old_dev_t;
24 24
25typedef unsigned long __kernel_ino_t; 25typedef unsigned long __kernel_ino_t;
26typedef unsigned short __kernel_mode_t; 26typedef unsigned short __kernel_mode_t;
27typedef unsigned short __kernel_nlink_t;
28typedef unsigned short __kernel_ipc_pid_t; 27typedef unsigned short __kernel_ipc_pid_t;
29typedef unsigned short __kernel_uid_t; 28typedef unsigned short __kernel_uid_t;
30typedef unsigned short __kernel_gid_t; 29typedef unsigned short __kernel_gid_t;
@@ -35,7 +34,6 @@ typedef int __kernel_ptrdiff_t;
35 34
36typedef unsigned int __kernel_ino_t; 35typedef unsigned int __kernel_ino_t;
37typedef unsigned int __kernel_mode_t; 36typedef unsigned int __kernel_mode_t;
38typedef unsigned int __kernel_nlink_t;
39typedef int __kernel_ipc_pid_t; 37typedef int __kernel_ipc_pid_t;
40typedef unsigned int __kernel_uid_t; 38typedef unsigned int __kernel_uid_t;
41typedef unsigned int __kernel_gid_t; 39typedef unsigned int __kernel_gid_t;
@@ -47,7 +45,6 @@ typedef unsigned long __kernel_sigset_t; /* at least 32 bits */
47 45
48#define __kernel_ino_t __kernel_ino_t 46#define __kernel_ino_t __kernel_ino_t
49#define __kernel_mode_t __kernel_mode_t 47#define __kernel_mode_t __kernel_mode_t
50#define __kernel_nlink_t __kernel_nlink_t
51#define __kernel_ipc_pid_t __kernel_ipc_pid_t 48#define __kernel_ipc_pid_t __kernel_ipc_pid_t
52#define __kernel_uid_t __kernel_uid_t 49#define __kernel_uid_t __kernel_uid_t
53#define __kernel_gid_t __kernel_gid_t 50#define __kernel_gid_t __kernel_gid_t
diff --git a/arch/sh/include/asm/posix_types_32.h b/arch/sh/include/asm/posix_types_32.h
index abda58467ece..ba0bdc423b07 100644
--- a/arch/sh/include/asm/posix_types_32.h
+++ b/arch/sh/include/asm/posix_types_32.h
@@ -3,8 +3,6 @@
3 3
4typedef unsigned short __kernel_mode_t; 4typedef unsigned short __kernel_mode_t;
5#define __kernel_mode_t __kernel_mode_t 5#define __kernel_mode_t __kernel_mode_t
6typedef unsigned short __kernel_nlink_t;
7#define __kernel_nlink_t __kernel_nlink_t
8typedef unsigned short __kernel_ipc_pid_t; 6typedef unsigned short __kernel_ipc_pid_t;
9#define __kernel_ipc_pid_t __kernel_ipc_pid_t 7#define __kernel_ipc_pid_t __kernel_ipc_pid_t
10typedef unsigned short __kernel_uid_t; 8typedef unsigned short __kernel_uid_t;
diff --git a/arch/sh/include/asm/posix_types_64.h b/arch/sh/include/asm/posix_types_64.h
index fcda07b4a616..244f7e950e17 100644
--- a/arch/sh/include/asm/posix_types_64.h
+++ b/arch/sh/include/asm/posix_types_64.h
@@ -3,8 +3,6 @@
3 3
4typedef unsigned short __kernel_mode_t; 4typedef unsigned short __kernel_mode_t;
5#define __kernel_mode_t __kernel_mode_t 5#define __kernel_mode_t __kernel_mode_t
6typedef unsigned short __kernel_nlink_t;
7#define __kernel_nlink_t __kernel_nlink_t
8typedef unsigned short __kernel_ipc_pid_t; 6typedef unsigned short __kernel_ipc_pid_t;
9#define __kernel_ipc_pid_t __kernel_ipc_pid_t 7#define __kernel_ipc_pid_t __kernel_ipc_pid_t
10typedef unsigned short __kernel_uid_t; 8typedef unsigned short __kernel_uid_t;
diff --git a/arch/sparc/include/asm/posix_types.h b/arch/sparc/include/asm/posix_types.h
index 3070f25ae90a..156220ed99eb 100644
--- a/arch/sparc/include/asm/posix_types.h
+++ b/arch/sparc/include/asm/posix_types.h
@@ -9,8 +9,6 @@
9 9
10#if defined(__sparc__) && defined(__arch64__) 10#if defined(__sparc__) && defined(__arch64__)
11/* sparc 64 bit */ 11/* sparc 64 bit */
12typedef unsigned int __kernel_nlink_t;
13#define __kernel_nlink_t __kernel_nlink_t
14 12
15typedef unsigned short __kernel_old_uid_t; 13typedef unsigned short __kernel_old_uid_t;
16typedef unsigned short __kernel_old_gid_t; 14typedef unsigned short __kernel_old_gid_t;
@@ -38,9 +36,6 @@ typedef unsigned short __kernel_gid_t;
38typedef unsigned short __kernel_mode_t; 36typedef unsigned short __kernel_mode_t;
39#define __kernel_mode_t __kernel_mode_t 37#define __kernel_mode_t __kernel_mode_t
40 38
41typedef short __kernel_nlink_t;
42#define __kernel_nlink_t __kernel_nlink_t
43
44typedef long __kernel_daddr_t; 39typedef long __kernel_daddr_t;
45#define __kernel_daddr_t __kernel_daddr_t 40#define __kernel_daddr_t __kernel_daddr_t
46 41
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 3ee51f189a55..275f74fd6f6a 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -580,16 +580,9 @@ SYSCALL_DEFINE5(64_mremap, unsigned long, addr, unsigned long, old_len,
580 unsigned long, new_len, unsigned long, flags, 580 unsigned long, new_len, unsigned long, flags,
581 unsigned long, new_addr) 581 unsigned long, new_addr)
582{ 582{
583 unsigned long ret = -EINVAL;
584
585 if (test_thread_flag(TIF_32BIT)) 583 if (test_thread_flag(TIF_32BIT))
586 goto out; 584 return -EINVAL;
587 585 return sys_mremap(addr, old_len, new_len, flags, new_addr);
588 down_write(&current->mm->mmap_sem);
589 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
590 up_write(&current->mm->mmap_sem);
591out:
592 return ret;
593} 586}
594 587
595/* we come to here via sys_nis_syscall so it can setup the regs argument */ 588/* we come to here via sys_nis_syscall so it can setup the regs argument */
diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h
index 69adc08d36a5..6e74450ff0a1 100644
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@@ -44,7 +44,6 @@ typedef __kernel_uid32_t __compat_gid32_t;
44typedef __kernel_mode_t compat_mode_t; 44typedef __kernel_mode_t compat_mode_t;
45typedef __kernel_dev_t compat_dev_t; 45typedef __kernel_dev_t compat_dev_t;
46typedef __kernel_loff_t compat_loff_t; 46typedef __kernel_loff_t compat_loff_t;
47typedef __kernel_nlink_t compat_nlink_t;
48typedef __kernel_ipc_pid_t compat_ipc_pid_t; 47typedef __kernel_ipc_pid_t compat_ipc_pid_t;
49typedef __kernel_daddr_t compat_daddr_t; 48typedef __kernel_daddr_t compat_daddr_t;
50typedef __kernel_fsid_t compat_fsid_t; 49typedef __kernel_fsid_t compat_fsid_t;
diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h
index 99f262e04b91..8e525059e7d8 100644
--- a/arch/x86/include/asm/posix_types_32.h
+++ b/arch/x86/include/asm/posix_types_32.h
@@ -10,9 +10,6 @@
10typedef unsigned short __kernel_mode_t; 10typedef unsigned short __kernel_mode_t;
11#define __kernel_mode_t __kernel_mode_t 11#define __kernel_mode_t __kernel_mode_t
12 12
13typedef unsigned short __kernel_nlink_t;
14#define __kernel_nlink_t __kernel_nlink_t
15
16typedef unsigned short __kernel_ipc_pid_t; 13typedef unsigned short __kernel_ipc_pid_t;
17#define __kernel_ipc_pid_t __kernel_ipc_pid_t 14#define __kernel_ipc_pid_t __kernel_ipc_pid_t
18 15
diff --git a/drivers/base/soc.c b/drivers/base/soc.c
index ba29b2e73d48..72b5e7280d14 100644
--- a/drivers/base/soc.c
+++ b/drivers/base/soc.c
@@ -42,7 +42,7 @@ struct device *soc_device_to_device(struct soc_device *soc_dev)
42 return &soc_dev->dev; 42 return &soc_dev->dev;
43} 43}
44 44
45static mode_t soc_attribute_mode(struct kobject *kobj, 45static umode_t soc_attribute_mode(struct kobject *kobj,
46 struct attribute *attr, 46 struct attribute *attr,
47 int index) 47 int index)
48{ 48{
diff --git a/drivers/gpu/drm/i810/i810_dma.c b/drivers/gpu/drm/i810/i810_dma.c
index f920fb5e42b6..fa9439159ebd 100644
--- a/drivers/gpu/drm/i810/i810_dma.c
+++ b/drivers/gpu/drm/i810/i810_dma.c
@@ -130,11 +130,10 @@ static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv)
130 return -EINVAL; 130 return -EINVAL;
131 131
132 /* This is all entirely broken */ 132 /* This is all entirely broken */
133 down_write(&current->mm->mmap_sem);
134 old_fops = file_priv->filp->f_op; 133 old_fops = file_priv->filp->f_op;
135 file_priv->filp->f_op = &i810_buffer_fops; 134 file_priv->filp->f_op = &i810_buffer_fops;
136 dev_priv->mmap_buffer = buf; 135 dev_priv->mmap_buffer = buf;
137 buf_priv->virtual = (void *)do_mmap(file_priv->filp, 0, buf->total, 136 buf_priv->virtual = (void *)vm_mmap(file_priv->filp, 0, buf->total,
138 PROT_READ | PROT_WRITE, 137 PROT_READ | PROT_WRITE,
139 MAP_SHARED, buf->bus_address); 138 MAP_SHARED, buf->bus_address);
140 dev_priv->mmap_buffer = NULL; 139 dev_priv->mmap_buffer = NULL;
@@ -145,7 +144,6 @@ static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv)
145 retcode = PTR_ERR(buf_priv->virtual); 144 retcode = PTR_ERR(buf_priv->virtual);
146 buf_priv->virtual = NULL; 145 buf_priv->virtual = NULL;
147 } 146 }
148 up_write(&current->mm->mmap_sem);
149 147
150 return retcode; 148 return retcode;
151} 149}
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a1e6c990cd41..e3dd2a1e2bfc 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -68,24 +68,6 @@ static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
68 return current_fsgid(); 68 return current_fsgid();
69} 69}
70 70
71/**
72 * v9fs_dentry_from_dir_inode - helper function to get the dentry from
73 * dir inode.
74 *
75 */
76
77static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
78{
79 struct dentry *dentry;
80
81 spin_lock(&inode->i_lock);
82 /* Directory should have only one entry. */
83 BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
84 dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
85 spin_unlock(&inode->i_lock);
86 return dentry;
87}
88
89static int v9fs_test_inode_dotl(struct inode *inode, void *data) 71static int v9fs_test_inode_dotl(struct inode *inode, void *data)
90{ 72{
91 struct v9fs_inode *v9inode = V9FS_I(inode); 73 struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -415,7 +397,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
415 if (dir->i_mode & S_ISGID) 397 if (dir->i_mode & S_ISGID)
416 omode |= S_ISGID; 398 omode |= S_ISGID;
417 399
418 dir_dentry = v9fs_dentry_from_dir_inode(dir); 400 dir_dentry = dentry->d_parent;
419 dfid = v9fs_fid_lookup(dir_dentry); 401 dfid = v9fs_fid_lookup(dir_dentry);
420 if (IS_ERR(dfid)) { 402 if (IS_ERR(dfid)) {
421 err = PTR_ERR(dfid); 403 err = PTR_ERR(dfid);
@@ -793,7 +775,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
793 dir->i_ino, old_dentry->d_name.name, dentry->d_name.name); 775 dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
794 776
795 v9ses = v9fs_inode2v9ses(dir); 777 v9ses = v9fs_inode2v9ses(dir);
796 dir_dentry = v9fs_dentry_from_dir_inode(dir); 778 dir_dentry = dentry->d_parent;
797 dfid = v9fs_fid_lookup(dir_dentry); 779 dfid = v9fs_fid_lookup(dir_dentry);
798 if (IS_ERR(dfid)) 780 if (IS_ERR(dfid))
799 return PTR_ERR(dfid); 781 return PTR_ERR(dfid);
@@ -858,7 +840,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
858 return -EINVAL; 840 return -EINVAL;
859 841
860 v9ses = v9fs_inode2v9ses(dir); 842 v9ses = v9fs_inode2v9ses(dir);
861 dir_dentry = v9fs_dentry_from_dir_inode(dir); 843 dir_dentry = dentry->d_parent;
862 dfid = v9fs_fid_lookup(dir_dentry); 844 dfid = v9fs_fid_lookup(dir_dentry);
863 if (IS_ERR(dfid)) { 845 if (IS_ERR(dfid)) {
864 err = PTR_ERR(dfid); 846 err = PTR_ERR(dfid);
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 45a0ce45d7b4..1fceb320d2f2 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -18,14 +18,6 @@
18#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey]) 18#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
19#define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)]) 19#define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
20 20
21#ifdef __LITTLE_ENDIAN
22#define BO_EXBITS 0x18UL
23#elif defined(__BIG_ENDIAN)
24#define BO_EXBITS 0x00UL
25#else
26#error Endianness must be known for affs to work.
27#endif
28
29#define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data) 21#define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data)
30#define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail))) 22#define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail)))
31#define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data) 23#define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data)
diff --git a/fs/aio.c b/fs/aio.c
index 8c7c8b805372..55c4c7656053 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -134,9 +134,9 @@ static int aio_setup_ring(struct kioctx *ctx)
134 info->mmap_size = nr_pages * PAGE_SIZE; 134 info->mmap_size = nr_pages * PAGE_SIZE;
135 dprintk("attempting mmap of %lu bytes\n", info->mmap_size); 135 dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
136 down_write(&ctx->mm->mmap_sem); 136 down_write(&ctx->mm->mmap_sem);
137 info->mmap_base = do_mmap(NULL, 0, info->mmap_size, 137 info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
138 PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 138 PROT_READ|PROT_WRITE,
139 0); 139 MAP_ANONYMOUS|MAP_PRIVATE, 0);
140 if (IS_ERR((void *)info->mmap_base)) { 140 if (IS_ERR((void *)info->mmap_base)) {
141 up_write(&ctx->mm->mmap_sem); 141 up_write(&ctx->mm->mmap_sem);
142 info->mmap_size = 0; 142 info->mmap_size = 0;
diff --git a/fs/attr.c b/fs/attr.c
index 584620e5dee5..0da90951d277 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -176,6 +176,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
176 return -EPERM; 176 return -EPERM;
177 } 177 }
178 178
179 if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
180 if (attr->ia_size != inode->i_size)
181 inode_inc_iversion(inode);
182 }
183
179 if ((ia_valid & ATTR_MODE)) { 184 if ((ia_valid & ATTR_MODE)) {
180 umode_t amode = attr->ia_mode; 185 umode_t amode = attr->ia_mode;
181 /* Flag setting protected by i_mutex */ 186 /* Flag setting protected by i_mutex */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e658dd134b95..1b52956afe33 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -329,7 +329,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
329 if (!size) 329 if (!size)
330 return addr; 330 return addr;
331 331
332 down_write(&current->mm->mmap_sem);
333 /* 332 /*
334 * total_size is the size of the ELF (interpreter) image. 333 * total_size is the size of the ELF (interpreter) image.
335 * The _first_ mmap needs to know the full size, otherwise 334 * The _first_ mmap needs to know the full size, otherwise
@@ -340,13 +339,12 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
340 */ 339 */
341 if (total_size) { 340 if (total_size) {
342 total_size = ELF_PAGEALIGN(total_size); 341 total_size = ELF_PAGEALIGN(total_size);
343 map_addr = do_mmap(filep, addr, total_size, prot, type, off); 342 map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
344 if (!BAD_ADDR(map_addr)) 343 if (!BAD_ADDR(map_addr))
345 do_munmap(current->mm, map_addr+size, total_size-size); 344 vm_munmap(map_addr+size, total_size-size);
346 } else 345 } else
347 map_addr = do_mmap(filep, addr, size, prot, type, off); 346 map_addr = vm_mmap(filep, addr, size, prot, type, off);
348 347
349 up_write(&current->mm->mmap_sem);
350 return(map_addr); 348 return(map_addr);
351} 349}
352 350
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 6b2daf99fab8..178cb70acc26 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -562,7 +562,7 @@ static int load_flat_file(struct linux_binprm * bprm,
562 realdatastart = (unsigned long) -ENOMEM; 562 realdatastart = (unsigned long) -ENOMEM;
563 printk("Unable to allocate RAM for process data, errno %d\n", 563 printk("Unable to allocate RAM for process data, errno %d\n",
564 (int)-realdatastart); 564 (int)-realdatastart);
565 do_munmap(current->mm, textpos, text_len); 565 vm_munmap(textpos, text_len);
566 ret = realdatastart; 566 ret = realdatastart;
567 goto err; 567 goto err;
568 } 568 }
@@ -586,8 +586,8 @@ static int load_flat_file(struct linux_binprm * bprm,
586 } 586 }
587 if (IS_ERR_VALUE(result)) { 587 if (IS_ERR_VALUE(result)) {
588 printk("Unable to read data+bss, errno %d\n", (int)-result); 588 printk("Unable to read data+bss, errno %d\n", (int)-result);
589 do_munmap(current->mm, textpos, text_len); 589 vm_munmap(textpos, text_len);
590 do_munmap(current->mm, realdatastart, len); 590 vm_munmap(realdatastart, len);
591 ret = result; 591 ret = result;
592 goto err; 592 goto err;
593 } 593 }
@@ -654,7 +654,7 @@ static int load_flat_file(struct linux_binprm * bprm,
654 } 654 }
655 if (IS_ERR_VALUE(result)) { 655 if (IS_ERR_VALUE(result)) {
656 printk("Unable to read code+data+bss, errno %d\n",(int)-result); 656 printk("Unable to read code+data+bss, errno %d\n",(int)-result);
657 do_munmap(current->mm, textpos, text_len + data_len + extra + 657 vm_munmap(textpos, text_len + data_len + extra +
658 MAX_SHARED_LIBS * sizeof(unsigned long)); 658 MAX_SHARED_LIBS * sizeof(unsigned long));
659 ret = result; 659 ret = result;
660 goto err; 660 goto err;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 89b156d85d63..761e2cd8fed1 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
227 if (ret > 0) { 227 if (ret > 0) {
228 /* we need an acl */ 228 /* we need an acl */
229 ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); 229 ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
230 } else {
231 cache_no_acl(inode);
230 } 232 }
233 } else {
234 cache_no_acl(inode);
231 } 235 }
232failed: 236failed:
233 posix_acl_release(acl); 237 posix_acl_release(acl);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index bcec06750232..3f75895c919b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -24,22 +24,135 @@
24#include "delayed-ref.h" 24#include "delayed-ref.h"
25#include "locking.h" 25#include "locking.h"
26 26
27struct extent_inode_elem {
28 u64 inum;
29 u64 offset;
30 struct extent_inode_elem *next;
31};
32
33static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
34 struct btrfs_file_extent_item *fi,
35 u64 extent_item_pos,
36 struct extent_inode_elem **eie)
37{
38 u64 data_offset;
39 u64 data_len;
40 struct extent_inode_elem *e;
41
42 data_offset = btrfs_file_extent_offset(eb, fi);
43 data_len = btrfs_file_extent_num_bytes(eb, fi);
44
45 if (extent_item_pos < data_offset ||
46 extent_item_pos >= data_offset + data_len)
47 return 1;
48
49 e = kmalloc(sizeof(*e), GFP_NOFS);
50 if (!e)
51 return -ENOMEM;
52
53 e->next = *eie;
54 e->inum = key->objectid;
55 e->offset = key->offset + (extent_item_pos - data_offset);
56 *eie = e;
57
58 return 0;
59}
60
61static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
62 u64 extent_item_pos,
63 struct extent_inode_elem **eie)
64{
65 u64 disk_byte;
66 struct btrfs_key key;
67 struct btrfs_file_extent_item *fi;
68 int slot;
69 int nritems;
70 int extent_type;
71 int ret;
72
73 /*
74 * from the shared data ref, we only have the leaf but we need
75 * the key. thus, we must look into all items and see that we
76 * find one (some) with a reference to our extent item.
77 */
78 nritems = btrfs_header_nritems(eb);
79 for (slot = 0; slot < nritems; ++slot) {
80 btrfs_item_key_to_cpu(eb, &key, slot);
81 if (key.type != BTRFS_EXTENT_DATA_KEY)
82 continue;
83 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
84 extent_type = btrfs_file_extent_type(eb, fi);
85 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
86 continue;
87 /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
88 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
89 if (disk_byte != wanted_disk_byte)
90 continue;
91
92 ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
93 if (ret < 0)
94 return ret;
95 }
96
97 return 0;
98}
99
27/* 100/*
28 * this structure records all encountered refs on the way up to the root 101 * this structure records all encountered refs on the way up to the root
29 */ 102 */
30struct __prelim_ref { 103struct __prelim_ref {
31 struct list_head list; 104 struct list_head list;
32 u64 root_id; 105 u64 root_id;
33 struct btrfs_key key; 106 struct btrfs_key key_for_search;
34 int level; 107 int level;
35 int count; 108 int count;
109 struct extent_inode_elem *inode_list;
36 u64 parent; 110 u64 parent;
37 u64 wanted_disk_byte; 111 u64 wanted_disk_byte;
38}; 112};
39 113
114/*
115 * the rules for all callers of this function are:
116 * - obtaining the parent is the goal
117 * - if you add a key, you must know that it is a correct key
118 * - if you cannot add the parent or a correct key, then we will look into the
119 * block later to set a correct key
120 *
121 * delayed refs
122 * ============
123 * backref type | shared | indirect | shared | indirect
124 * information | tree | tree | data | data
125 * --------------------+--------+----------+--------+----------
126 * parent logical | y | - | - | -
127 * key to resolve | - | y | y | y
128 * tree block logical | - | - | - | -
129 * root for resolving | y | y | y | y
130 *
131 * - column 1: we've the parent -> done
132 * - column 2, 3, 4: we use the key to find the parent
133 *
134 * on disk refs (inline or keyed)
135 * ==============================
136 * backref type | shared | indirect | shared | indirect
137 * information | tree | tree | data | data
138 * --------------------+--------+----------+--------+----------
139 * parent logical | y | - | y | -
140 * key to resolve | - | - | - | y
141 * tree block logical | y | y | y | y
142 * root for resolving | - | y | y | y
143 *
144 * - column 1, 3: we've the parent -> done
145 * - column 2: we take the first key from the block to find the parent
146 * (see __add_missing_keys)
147 * - column 4: we use the key to find the parent
148 *
149 * additional information that's available but not required to find the parent
150 * block might help in merging entries to gain some speed.
151 */
152
40static int __add_prelim_ref(struct list_head *head, u64 root_id, 153static int __add_prelim_ref(struct list_head *head, u64 root_id,
41 struct btrfs_key *key, int level, u64 parent, 154 struct btrfs_key *key, int level,
42 u64 wanted_disk_byte, int count) 155 u64 parent, u64 wanted_disk_byte, int count)
43{ 156{
44 struct __prelim_ref *ref; 157 struct __prelim_ref *ref;
45 158
@@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
50 163
51 ref->root_id = root_id; 164 ref->root_id = root_id;
52 if (key) 165 if (key)
53 ref->key = *key; 166 ref->key_for_search = *key;
54 else 167 else
55 memset(&ref->key, 0, sizeof(ref->key)); 168 memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
56 169
170 ref->inode_list = NULL;
57 ref->level = level; 171 ref->level = level;
58 ref->count = count; 172 ref->count = count;
59 ref->parent = parent; 173 ref->parent = parent;
@@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
64} 178}
65 179
66static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, 180static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
67 struct ulist *parents, 181 struct ulist *parents, int level,
68 struct extent_buffer *eb, int level, 182 struct btrfs_key *key, u64 wanted_disk_byte,
69 u64 wanted_objectid, u64 wanted_disk_byte) 183 const u64 *extent_item_pos)
70{ 184{
71 int ret; 185 int ret;
72 int slot; 186 int slot = path->slots[level];
187 struct extent_buffer *eb = path->nodes[level];
73 struct btrfs_file_extent_item *fi; 188 struct btrfs_file_extent_item *fi;
74 struct btrfs_key key; 189 struct extent_inode_elem *eie = NULL;
75 u64 disk_byte; 190 u64 disk_byte;
191 u64 wanted_objectid = key->objectid;
76 192
77add_parent: 193add_parent:
78 ret = ulist_add(parents, eb->start, 0, GFP_NOFS); 194 if (level == 0 && extent_item_pos) {
195 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
196 ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie);
197 if (ret < 0)
198 return ret;
199 }
200 ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS);
79 if (ret < 0) 201 if (ret < 0)
80 return ret; 202 return ret;
81 203
@@ -89,6 +211,7 @@ add_parent:
89 * repeat this until we don't find any additional EXTENT_DATA items. 211 * repeat this until we don't find any additional EXTENT_DATA items.
90 */ 212 */
91 while (1) { 213 while (1) {
214 eie = NULL;
92 ret = btrfs_next_leaf(root, path); 215 ret = btrfs_next_leaf(root, path);
93 if (ret < 0) 216 if (ret < 0)
94 return ret; 217 return ret;
@@ -97,9 +220,9 @@ add_parent:
97 220
98 eb = path->nodes[0]; 221 eb = path->nodes[0];
99 for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { 222 for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
100 btrfs_item_key_to_cpu(eb, &key, slot); 223 btrfs_item_key_to_cpu(eb, key, slot);
101 if (key.objectid != wanted_objectid || 224 if (key->objectid != wanted_objectid ||
102 key.type != BTRFS_EXTENT_DATA_KEY) 225 key->type != BTRFS_EXTENT_DATA_KEY)
103 return 0; 226 return 0;
104 fi = btrfs_item_ptr(eb, slot, 227 fi = btrfs_item_ptr(eb, slot,
105 struct btrfs_file_extent_item); 228 struct btrfs_file_extent_item);
@@ -118,8 +241,10 @@ add_parent:
118 */ 241 */
119static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, 242static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
120 int search_commit_root, 243 int search_commit_root,
244 u64 time_seq,
121 struct __prelim_ref *ref, 245 struct __prelim_ref *ref,
122 struct ulist *parents) 246 struct ulist *parents,
247 const u64 *extent_item_pos)
123{ 248{
124 struct btrfs_path *path; 249 struct btrfs_path *path;
125 struct btrfs_root *root; 250 struct btrfs_root *root;
@@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
152 goto out; 277 goto out;
153 278
154 path->lowest_level = level; 279 path->lowest_level = level;
155 ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); 280 ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
156 pr_debug("search slot in root %llu (level %d, ref count %d) returned " 281 pr_debug("search slot in root %llu (level %d, ref count %d) returned "
157 "%d for key (%llu %u %llu)\n", 282 "%d for key (%llu %u %llu)\n",
158 (unsigned long long)ref->root_id, level, ref->count, ret, 283 (unsigned long long)ref->root_id, level, ref->count, ret,
159 (unsigned long long)ref->key.objectid, ref->key.type, 284 (unsigned long long)ref->key_for_search.objectid,
160 (unsigned long long)ref->key.offset); 285 ref->key_for_search.type,
286 (unsigned long long)ref->key_for_search.offset);
161 if (ret < 0) 287 if (ret < 0)
162 goto out; 288 goto out;
163 289
@@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
179 btrfs_item_key_to_cpu(eb, &key, path->slots[0]); 305 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
180 } 306 }
181 307
182 /* the last two parameters will only be used for level == 0 */ 308 ret = add_all_parents(root, path, parents, level, &key,
183 ret = add_all_parents(root, path, parents, eb, level, key.objectid, 309 ref->wanted_disk_byte, extent_item_pos);
184 ref->wanted_disk_byte);
185out: 310out:
186 btrfs_free_path(path); 311 btrfs_free_path(path);
187 return ret; 312 return ret;
@@ -191,8 +316,9 @@ out:
191 * resolve all indirect backrefs from the list 316 * resolve all indirect backrefs from the list
192 */ 317 */
193static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, 318static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
194 int search_commit_root, 319 int search_commit_root, u64 time_seq,
195 struct list_head *head) 320 struct list_head *head,
321 const u64 *extent_item_pos)
196{ 322{
197 int err; 323 int err;
198 int ret = 0; 324 int ret = 0;
@@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
201 struct __prelim_ref *new_ref; 327 struct __prelim_ref *new_ref;
202 struct ulist *parents; 328 struct ulist *parents;
203 struct ulist_node *node; 329 struct ulist_node *node;
330 struct ulist_iterator uiter;
204 331
205 parents = ulist_alloc(GFP_NOFS); 332 parents = ulist_alloc(GFP_NOFS);
206 if (!parents) 333 if (!parents)
@@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
217 if (ref->count == 0) 344 if (ref->count == 0)
218 continue; 345 continue;
219 err = __resolve_indirect_ref(fs_info, search_commit_root, 346 err = __resolve_indirect_ref(fs_info, search_commit_root,
220 ref, parents); 347 time_seq, ref, parents,
348 extent_item_pos);
221 if (err) { 349 if (err) {
222 if (ret == 0) 350 if (ret == 0)
223 ret = err; 351 ret = err;
@@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
225 } 353 }
226 354
227 /* we put the first parent into the ref at hand */ 355 /* we put the first parent into the ref at hand */
228 node = ulist_next(parents, NULL); 356 ULIST_ITER_INIT(&uiter);
357 node = ulist_next(parents, &uiter);
229 ref->parent = node ? node->val : 0; 358 ref->parent = node ? node->val : 0;
359 ref->inode_list =
360 node ? (struct extent_inode_elem *)node->aux : 0;
230 361
231 /* additional parents require new refs being added here */ 362 /* additional parents require new refs being added here */
232 while ((node = ulist_next(parents, node))) { 363 while ((node = ulist_next(parents, &uiter))) {
233 new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); 364 new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
234 if (!new_ref) { 365 if (!new_ref) {
235 ret = -ENOMEM; 366 ret = -ENOMEM;
@@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
237 } 368 }
238 memcpy(new_ref, ref, sizeof(*ref)); 369 memcpy(new_ref, ref, sizeof(*ref));
239 new_ref->parent = node->val; 370 new_ref->parent = node->val;
371 new_ref->inode_list =
372 (struct extent_inode_elem *)node->aux;
240 list_add(&new_ref->list, &ref->list); 373 list_add(&new_ref->list, &ref->list);
241 } 374 }
242 ulist_reinit(parents); 375 ulist_reinit(parents);
@@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
246 return ret; 379 return ret;
247} 380}
248 381
382static inline int ref_for_same_block(struct __prelim_ref *ref1,
383 struct __prelim_ref *ref2)
384{
385 if (ref1->level != ref2->level)
386 return 0;
387 if (ref1->root_id != ref2->root_id)
388 return 0;
389 if (ref1->key_for_search.type != ref2->key_for_search.type)
390 return 0;
391 if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
392 return 0;
393 if (ref1->key_for_search.offset != ref2->key_for_search.offset)
394 return 0;
395 if (ref1->parent != ref2->parent)
396 return 0;
397
398 return 1;
399}
400
401/*
402 * read tree blocks and add keys where required.
403 */
404static int __add_missing_keys(struct btrfs_fs_info *fs_info,
405 struct list_head *head)
406{
407 struct list_head *pos;
408 struct extent_buffer *eb;
409
410 list_for_each(pos, head) {
411 struct __prelim_ref *ref;
412 ref = list_entry(pos, struct __prelim_ref, list);
413
414 if (ref->parent)
415 continue;
416 if (ref->key_for_search.type)
417 continue;
418 BUG_ON(!ref->wanted_disk_byte);
419 eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
420 fs_info->tree_root->leafsize, 0);
421 BUG_ON(!eb);
422 btrfs_tree_read_lock(eb);
423 if (btrfs_header_level(eb) == 0)
424 btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
425 else
426 btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
427 btrfs_tree_read_unlock(eb);
428 free_extent_buffer(eb);
429 }
430 return 0;
431}
432
249/* 433/*
250 * merge two lists of backrefs and adjust counts accordingly 434 * merge two lists of backrefs and adjust counts accordingly
251 * 435 *
252 * mode = 1: merge identical keys, if key is set 436 * mode = 1: merge identical keys, if key is set
437 * FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
438 * additionally, we could even add a key range for the blocks we
439 * looked into to merge even more (-> replace unresolved refs by those
440 * having a parent).
253 * mode = 2: merge identical parents 441 * mode = 2: merge identical parents
254 */ 442 */
255static int __merge_refs(struct list_head *head, int mode) 443static int __merge_refs(struct list_head *head, int mode)
@@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode)
263 451
264 ref1 = list_entry(pos1, struct __prelim_ref, list); 452 ref1 = list_entry(pos1, struct __prelim_ref, list);
265 453
266 if (mode == 1 && ref1->key.type == 0)
267 continue;
268 for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; 454 for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
269 pos2 = n2, n2 = pos2->next) { 455 pos2 = n2, n2 = pos2->next) {
270 struct __prelim_ref *ref2; 456 struct __prelim_ref *ref2;
457 struct __prelim_ref *xchg;
271 458
272 ref2 = list_entry(pos2, struct __prelim_ref, list); 459 ref2 = list_entry(pos2, struct __prelim_ref, list);
273 460
274 if (mode == 1) { 461 if (mode == 1) {
275 if (memcmp(&ref1->key, &ref2->key, 462 if (!ref_for_same_block(ref1, ref2))
276 sizeof(ref1->key)) ||
277 ref1->level != ref2->level ||
278 ref1->root_id != ref2->root_id)
279 continue; 463 continue;
464 if (!ref1->parent && ref2->parent) {
465 xchg = ref1;
466 ref1 = ref2;
467 ref2 = xchg;
468 }
280 ref1->count += ref2->count; 469 ref1->count += ref2->count;
281 } else { 470 } else {
282 if (ref1->parent != ref2->parent) 471 if (ref1->parent != ref2->parent)
@@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode)
296 * smaller or equal that seq to the list 485 * smaller or equal that seq to the list
297 */ 486 */
298static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, 487static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
299 struct btrfs_key *info_key,
300 struct list_head *prefs) 488 struct list_head *prefs)
301{ 489{
302 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 490 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
303 struct rb_node *n = &head->node.rb_node; 491 struct rb_node *n = &head->node.rb_node;
492 struct btrfs_key key;
493 struct btrfs_key op_key = {0};
304 int sgn; 494 int sgn;
305 int ret = 0; 495 int ret = 0;
306 496
307 if (extent_op && extent_op->update_key) 497 if (extent_op && extent_op->update_key)
308 btrfs_disk_key_to_cpu(info_key, &extent_op->key); 498 btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
309 499
310 while ((n = rb_prev(n))) { 500 while ((n = rb_prev(n))) {
311 struct btrfs_delayed_ref_node *node; 501 struct btrfs_delayed_ref_node *node;
@@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
337 struct btrfs_delayed_tree_ref *ref; 527 struct btrfs_delayed_tree_ref *ref;
338 528
339 ref = btrfs_delayed_node_to_tree_ref(node); 529 ref = btrfs_delayed_node_to_tree_ref(node);
340 ret = __add_prelim_ref(prefs, ref->root, info_key, 530 ret = __add_prelim_ref(prefs, ref->root, &op_key,
341 ref->level + 1, 0, node->bytenr, 531 ref->level + 1, 0, node->bytenr,
342 node->ref_mod * sgn); 532 node->ref_mod * sgn);
343 break; 533 break;
@@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
346 struct btrfs_delayed_tree_ref *ref; 536 struct btrfs_delayed_tree_ref *ref;
347 537
348 ref = btrfs_delayed_node_to_tree_ref(node); 538 ref = btrfs_delayed_node_to_tree_ref(node);
349 ret = __add_prelim_ref(prefs, ref->root, info_key, 539 ret = __add_prelim_ref(prefs, ref->root, NULL,
350 ref->level + 1, ref->parent, 540 ref->level + 1, ref->parent,
351 node->bytenr, 541 node->bytenr,
352 node->ref_mod * sgn); 542 node->ref_mod * sgn);
@@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
354 } 544 }
355 case BTRFS_EXTENT_DATA_REF_KEY: { 545 case BTRFS_EXTENT_DATA_REF_KEY: {
356 struct btrfs_delayed_data_ref *ref; 546 struct btrfs_delayed_data_ref *ref;
357 struct btrfs_key key;
358
359 ref = btrfs_delayed_node_to_data_ref(node); 547 ref = btrfs_delayed_node_to_data_ref(node);
360 548
361 key.objectid = ref->objectid; 549 key.objectid = ref->objectid;
@@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
368 } 556 }
369 case BTRFS_SHARED_DATA_REF_KEY: { 557 case BTRFS_SHARED_DATA_REF_KEY: {
370 struct btrfs_delayed_data_ref *ref; 558 struct btrfs_delayed_data_ref *ref;
371 struct btrfs_key key;
372 559
373 ref = btrfs_delayed_node_to_data_ref(node); 560 ref = btrfs_delayed_node_to_data_ref(node);
374 561
@@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
394 */ 581 */
395static int __add_inline_refs(struct btrfs_fs_info *fs_info, 582static int __add_inline_refs(struct btrfs_fs_info *fs_info,
396 struct btrfs_path *path, u64 bytenr, 583 struct btrfs_path *path, u64 bytenr,
397 struct btrfs_key *info_key, int *info_level, 584 int *info_level, struct list_head *prefs)
398 struct list_head *prefs)
399{ 585{
400 int ret = 0; 586 int ret = 0;
401 int slot; 587 int slot;
@@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
411 * enumerate all inline refs 597 * enumerate all inline refs
412 */ 598 */
413 leaf = path->nodes[0]; 599 leaf = path->nodes[0];
414 slot = path->slots[0] - 1; 600 slot = path->slots[0];
415 601
416 item_size = btrfs_item_size_nr(leaf, slot); 602 item_size = btrfs_item_size_nr(leaf, slot);
417 BUG_ON(item_size < sizeof(*ei)); 603 BUG_ON(item_size < sizeof(*ei));
@@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
424 610
425 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 611 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
426 struct btrfs_tree_block_info *info; 612 struct btrfs_tree_block_info *info;
427 struct btrfs_disk_key disk_key;
428 613
429 info = (struct btrfs_tree_block_info *)ptr; 614 info = (struct btrfs_tree_block_info *)ptr;
430 *info_level = btrfs_tree_block_level(leaf, info); 615 *info_level = btrfs_tree_block_level(leaf, info);
431 btrfs_tree_block_key(leaf, info, &disk_key);
432 btrfs_disk_key_to_cpu(info_key, &disk_key);
433 ptr += sizeof(struct btrfs_tree_block_info); 616 ptr += sizeof(struct btrfs_tree_block_info);
434 BUG_ON(ptr > end); 617 BUG_ON(ptr > end);
435 } else { 618 } else {
@@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
447 630
448 switch (type) { 631 switch (type) {
449 case BTRFS_SHARED_BLOCK_REF_KEY: 632 case BTRFS_SHARED_BLOCK_REF_KEY:
450 ret = __add_prelim_ref(prefs, 0, info_key, 633 ret = __add_prelim_ref(prefs, 0, NULL,
451 *info_level + 1, offset, 634 *info_level + 1, offset,
452 bytenr, 1); 635 bytenr, 1);
453 break; 636 break;
@@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
462 break; 645 break;
463 } 646 }
464 case BTRFS_TREE_BLOCK_REF_KEY: 647 case BTRFS_TREE_BLOCK_REF_KEY:
465 ret = __add_prelim_ref(prefs, offset, info_key, 648 ret = __add_prelim_ref(prefs, offset, NULL,
466 *info_level + 1, 0, bytenr, 1); 649 *info_level + 1, 0,
650 bytenr, 1);
467 break; 651 break;
468 case BTRFS_EXTENT_DATA_REF_KEY: { 652 case BTRFS_EXTENT_DATA_REF_KEY: {
469 struct btrfs_extent_data_ref *dref; 653 struct btrfs_extent_data_ref *dref;
@@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
477 key.type = BTRFS_EXTENT_DATA_KEY; 661 key.type = BTRFS_EXTENT_DATA_KEY;
478 key.offset = btrfs_extent_data_ref_offset(leaf, dref); 662 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
479 root = btrfs_extent_data_ref_root(leaf, dref); 663 root = btrfs_extent_data_ref_root(leaf, dref);
480 ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, 664 ret = __add_prelim_ref(prefs, root, &key, 0, 0,
481 count); 665 bytenr, count);
482 break; 666 break;
483 } 667 }
484 default: 668 default:
@@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
496 */ 680 */
497static int __add_keyed_refs(struct btrfs_fs_info *fs_info, 681static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
498 struct btrfs_path *path, u64 bytenr, 682 struct btrfs_path *path, u64 bytenr,
499 struct btrfs_key *info_key, int info_level, 683 int info_level, struct list_head *prefs)
500 struct list_head *prefs)
501{ 684{
502 struct btrfs_root *extent_root = fs_info->extent_root; 685 struct btrfs_root *extent_root = fs_info->extent_root;
503 int ret; 686 int ret;
@@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
527 710
528 switch (key.type) { 711 switch (key.type) {
529 case BTRFS_SHARED_BLOCK_REF_KEY: 712 case BTRFS_SHARED_BLOCK_REF_KEY:
530 ret = __add_prelim_ref(prefs, 0, info_key, 713 ret = __add_prelim_ref(prefs, 0, NULL,
531 info_level + 1, key.offset, 714 info_level + 1, key.offset,
532 bytenr, 1); 715 bytenr, 1);
533 break; 716 break;
@@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
543 break; 726 break;
544 } 727 }
545 case BTRFS_TREE_BLOCK_REF_KEY: 728 case BTRFS_TREE_BLOCK_REF_KEY:
546 ret = __add_prelim_ref(prefs, key.offset, info_key, 729 ret = __add_prelim_ref(prefs, key.offset, NULL,
547 info_level + 1, 0, bytenr, 1); 730 info_level + 1, 0,
731 bytenr, 1);
548 break; 732 break;
549 case BTRFS_EXTENT_DATA_REF_KEY: { 733 case BTRFS_EXTENT_DATA_REF_KEY: {
550 struct btrfs_extent_data_ref *dref; 734 struct btrfs_extent_data_ref *dref;
@@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
560 key.offset = btrfs_extent_data_ref_offset(leaf, dref); 744 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
561 root = btrfs_extent_data_ref_root(leaf, dref); 745 root = btrfs_extent_data_ref_root(leaf, dref);
562 ret = __add_prelim_ref(prefs, root, &key, 0, 0, 746 ret = __add_prelim_ref(prefs, root, &key, 0, 0,
563 bytenr, count); 747 bytenr, count);
564 break; 748 break;
565 } 749 }
566 default: 750 default:
@@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
582 */ 766 */
583static int find_parent_nodes(struct btrfs_trans_handle *trans, 767static int find_parent_nodes(struct btrfs_trans_handle *trans,
584 struct btrfs_fs_info *fs_info, u64 bytenr, 768 struct btrfs_fs_info *fs_info, u64 bytenr,
585 u64 seq, struct ulist *refs, struct ulist *roots) 769 u64 delayed_ref_seq, u64 time_seq,
770 struct ulist *refs, struct ulist *roots,
771 const u64 *extent_item_pos)
586{ 772{
587 struct btrfs_key key; 773 struct btrfs_key key;
588 struct btrfs_path *path; 774 struct btrfs_path *path;
589 struct btrfs_key info_key = { 0 };
590 struct btrfs_delayed_ref_root *delayed_refs = NULL; 775 struct btrfs_delayed_ref_root *delayed_refs = NULL;
591 struct btrfs_delayed_ref_head *head; 776 struct btrfs_delayed_ref_head *head;
592 int info_level = 0; 777 int info_level = 0;
@@ -645,7 +830,7 @@ again:
645 btrfs_put_delayed_ref(&head->node); 830 btrfs_put_delayed_ref(&head->node);
646 goto again; 831 goto again;
647 } 832 }
648 ret = __add_delayed_refs(head, seq, &info_key, 833 ret = __add_delayed_refs(head, delayed_ref_seq,
649 &prefs_delayed); 834 &prefs_delayed);
650 if (ret) { 835 if (ret) {
651 spin_unlock(&delayed_refs->lock); 836 spin_unlock(&delayed_refs->lock);
@@ -659,16 +844,17 @@ again:
659 struct extent_buffer *leaf; 844 struct extent_buffer *leaf;
660 int slot; 845 int slot;
661 846
847 path->slots[0]--;
662 leaf = path->nodes[0]; 848 leaf = path->nodes[0];
663 slot = path->slots[0] - 1; 849 slot = path->slots[0];
664 btrfs_item_key_to_cpu(leaf, &key, slot); 850 btrfs_item_key_to_cpu(leaf, &key, slot);
665 if (key.objectid == bytenr && 851 if (key.objectid == bytenr &&
666 key.type == BTRFS_EXTENT_ITEM_KEY) { 852 key.type == BTRFS_EXTENT_ITEM_KEY) {
667 ret = __add_inline_refs(fs_info, path, bytenr, 853 ret = __add_inline_refs(fs_info, path, bytenr,
668 &info_key, &info_level, &prefs); 854 &info_level, &prefs);
669 if (ret) 855 if (ret)
670 goto out; 856 goto out;
671 ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, 857 ret = __add_keyed_refs(fs_info, path, bytenr,
672 info_level, &prefs); 858 info_level, &prefs);
673 if (ret) 859 if (ret)
674 goto out; 860 goto out;
@@ -676,21 +862,18 @@ again:
676 } 862 }
677 btrfs_release_path(path); 863 btrfs_release_path(path);
678 864
679 /*
680 * when adding the delayed refs above, the info_key might not have
681 * been known yet. Go over the list and replace the missing keys
682 */
683 list_for_each_entry(ref, &prefs_delayed, list) {
684 if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
685 memcpy(&ref->key, &info_key, sizeof(ref->key));
686 }
687 list_splice_init(&prefs_delayed, &prefs); 865 list_splice_init(&prefs_delayed, &prefs);
688 866
867 ret = __add_missing_keys(fs_info, &prefs);
868 if (ret)
869 goto out;
870
689 ret = __merge_refs(&prefs, 1); 871 ret = __merge_refs(&prefs, 1);
690 if (ret) 872 if (ret)
691 goto out; 873 goto out;
692 874
693 ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); 875 ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
876 &prefs, extent_item_pos);
694 if (ret) 877 if (ret)
695 goto out; 878 goto out;
696 879
@@ -709,7 +892,33 @@ again:
709 BUG_ON(ret < 0); 892 BUG_ON(ret < 0);
710 } 893 }
711 if (ref->count && ref->parent) { 894 if (ref->count && ref->parent) {
712 ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); 895 struct extent_inode_elem *eie = NULL;
896 if (extent_item_pos && !ref->inode_list) {
897 u32 bsz;
898 struct extent_buffer *eb;
899 bsz = btrfs_level_size(fs_info->extent_root,
900 info_level);
901 eb = read_tree_block(fs_info->extent_root,
902 ref->parent, bsz, 0);
903 BUG_ON(!eb);
904 ret = find_extent_in_eb(eb, bytenr,
905 *extent_item_pos, &eie);
906 ref->inode_list = eie;
907 free_extent_buffer(eb);
908 }
909 ret = ulist_add_merge(refs, ref->parent,
910 (unsigned long)ref->inode_list,
911 (unsigned long *)&eie, GFP_NOFS);
912 if (!ret && extent_item_pos) {
913 /*
914 * we've recorded that parent, so we must extend
915 * its inode list here
916 */
917 BUG_ON(!eie);
918 while (eie->next)
919 eie = eie->next;
920 eie->next = ref->inode_list;
921 }
713 BUG_ON(ret < 0); 922 BUG_ON(ret < 0);
714 } 923 }
715 kfree(ref); 924 kfree(ref);
@@ -734,6 +943,28 @@ out:
734 return ret; 943 return ret;
735} 944}
736 945
/*
 * NOTE(review): this is a diff listing; the function below is the new-side
 * (added) code of the commit, shown complete.
 *
 * free_leaf_list - release a ulist of leaf blocks together with the
 * extent_inode_elem chains hung off each node's aux field.
 *
 * Each ulist_node's aux, when non-zero, is a pointer to the head of a
 * singly linked list of struct extent_inode_elem (as stored by
 * find_parent_nodes via ulist_add_merge). Walk every node, kfree each
 * element of its chain, clear aux, then free the ulist itself.
 */
 946static void free_leaf_list(struct ulist *blocks)
 947{
 948 struct ulist_node *node = NULL;
 949 struct extent_inode_elem *eie;
 950 struct extent_inode_elem *eie_next;
 951 struct ulist_iterator uiter;
 952
 953 ULIST_ITER_INIT(&uiter);
 954 while ((node = ulist_next(blocks, &uiter))) {
 955 if (!node->aux)
 956 continue;
 /* aux carries the inode-list head cast to an integer; cast back */
 957 eie = (struct extent_inode_elem *)node->aux;
 /* eie_next saved before kfree so the walk survives the free */
 958 for (; eie; eie = eie_next) {
 959 eie_next = eie->next;
 960 kfree(eie);
 961 }
 962 node->aux = 0;
 963 }
 964
 965 ulist_free(blocks);
 966}
967
737/* 968/*
738 * Finds all leafs with a reference to the specified combination of bytenr and 969 * Finds all leafs with a reference to the specified combination of bytenr and
739 * offset. key_list_head will point to a list of corresponding keys (caller must 970 * offset. key_list_head will point to a list of corresponding keys (caller must
@@ -744,7 +975,9 @@ out:
744 */ 975 */
745static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, 976static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
746 struct btrfs_fs_info *fs_info, u64 bytenr, 977 struct btrfs_fs_info *fs_info, u64 bytenr,
747 u64 num_bytes, u64 seq, struct ulist **leafs) 978 u64 delayed_ref_seq, u64 time_seq,
979 struct ulist **leafs,
980 const u64 *extent_item_pos)
748{ 981{
749 struct ulist *tmp; 982 struct ulist *tmp;
750 int ret; 983 int ret;
@@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
758 return -ENOMEM; 991 return -ENOMEM;
759 } 992 }
760 993
761 ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); 994 ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
995 time_seq, *leafs, tmp, extent_item_pos);
762 ulist_free(tmp); 996 ulist_free(tmp);
763 997
764 if (ret < 0 && ret != -ENOENT) { 998 if (ret < 0 && ret != -ENOENT) {
765 ulist_free(*leafs); 999 free_leaf_list(*leafs);
766 return ret; 1000 return ret;
767 } 1001 }
768 1002
@@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
784 */ 1018 */
785int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 1019int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
786 struct btrfs_fs_info *fs_info, u64 bytenr, 1020 struct btrfs_fs_info *fs_info, u64 bytenr,
787 u64 num_bytes, u64 seq, struct ulist **roots) 1021 u64 delayed_ref_seq, u64 time_seq,
1022 struct ulist **roots)
788{ 1023{
789 struct ulist *tmp; 1024 struct ulist *tmp;
790 struct ulist_node *node = NULL; 1025 struct ulist_node *node = NULL;
1026 struct ulist_iterator uiter;
791 int ret; 1027 int ret;
792 1028
793 tmp = ulist_alloc(GFP_NOFS); 1029 tmp = ulist_alloc(GFP_NOFS);
@@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
799 return -ENOMEM; 1035 return -ENOMEM;
800 } 1036 }
801 1037
1038 ULIST_ITER_INIT(&uiter);
802 while (1) { 1039 while (1) {
803 ret = find_parent_nodes(trans, fs_info, bytenr, seq, 1040 ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
804 tmp, *roots); 1041 time_seq, tmp, *roots, NULL);
805 if (ret < 0 && ret != -ENOENT) { 1042 if (ret < 0 && ret != -ENOENT) {
806 ulist_free(tmp); 1043 ulist_free(tmp);
807 ulist_free(*roots); 1044 ulist_free(*roots);
808 return ret; 1045 return ret;
809 } 1046 }
810 node = ulist_next(tmp, node); 1047 node = ulist_next(tmp, &uiter);
811 if (!node) 1048 if (!node)
812 break; 1049 break;
813 bytenr = node->val; 1050 bytenr = node->val;
@@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
1093 return 0; 1330 return 0;
1094} 1331}
1095 1332
1096static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, 1333static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
1097 u64 orig_extent_item_objectid, 1334 u64 root, u64 extent_item_objectid,
1098 u64 extent_item_pos, u64 root,
1099 iterate_extent_inodes_t *iterate, void *ctx) 1335 iterate_extent_inodes_t *iterate, void *ctx)
1100{ 1336{
1101 u64 disk_byte; 1337 struct extent_inode_elem *eie;
1102 struct btrfs_key key;
1103 struct btrfs_file_extent_item *fi;
1104 struct extent_buffer *eb;
1105 int slot;
1106 int nritems;
1107 int ret = 0; 1338 int ret = 0;
1108 int extent_type;
1109 u64 data_offset;
1110 u64 data_len;
1111
1112 eb = read_tree_block(fs_info->tree_root, logical,
1113 fs_info->tree_root->leafsize, 0);
1114 if (!eb)
1115 return -EIO;
1116
1117 /*
1118 * from the shared data ref, we only have the leaf but we need
1119 * the key. thus, we must look into all items and see that we
1120 * find one (some) with a reference to our extent item.
1121 */
1122 nritems = btrfs_header_nritems(eb);
1123 for (slot = 0; slot < nritems; ++slot) {
1124 btrfs_item_key_to_cpu(eb, &key, slot);
1125 if (key.type != BTRFS_EXTENT_DATA_KEY)
1126 continue;
1127 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
1128 extent_type = btrfs_file_extent_type(eb, fi);
1129 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1130 continue;
1131 /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
1132 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
1133 if (disk_byte != orig_extent_item_objectid)
1134 continue;
1135
1136 data_offset = btrfs_file_extent_offset(eb, fi);
1137 data_len = btrfs_file_extent_num_bytes(eb, fi);
1138
1139 if (extent_item_pos < data_offset ||
1140 extent_item_pos >= data_offset + data_len)
1141 continue;
1142 1339
1340 for (eie = inode_list; eie; eie = eie->next) {
1143 pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " 1341 pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
1144 "root %llu\n", orig_extent_item_objectid, 1342 "root %llu\n", extent_item_objectid,
1145 key.objectid, key.offset, root); 1343 eie->inum, eie->offset, root);
1146 ret = iterate(key.objectid, 1344 ret = iterate(eie->inum, eie->offset, root, ctx);
1147 key.offset + (extent_item_pos - data_offset),
1148 root, ctx);
1149 if (ret) { 1345 if (ret) {
1150 pr_debug("stopping iteration because ret=%d\n", ret); 1346 pr_debug("stopping iteration for %llu due to ret=%d\n",
1347 extent_item_objectid, ret);
1151 break; 1348 break;
1152 } 1349 }
1153 } 1350 }
1154 1351
1155 free_extent_buffer(eb);
1156
1157 return ret; 1352 return ret;
1158} 1353}
1159 1354
@@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1175 struct ulist *roots = NULL; 1370 struct ulist *roots = NULL;
1176 struct ulist_node *ref_node = NULL; 1371 struct ulist_node *ref_node = NULL;
1177 struct ulist_node *root_node = NULL; 1372 struct ulist_node *root_node = NULL;
1178 struct seq_list seq_elem; 1373 struct seq_list seq_elem = {};
1374 struct seq_list tree_mod_seq_elem = {};
1375 struct ulist_iterator ref_uiter;
1376 struct ulist_iterator root_uiter;
1179 struct btrfs_delayed_ref_root *delayed_refs = NULL; 1377 struct btrfs_delayed_ref_root *delayed_refs = NULL;
1180 1378
1181 pr_debug("resolving all inodes for extent %llu\n", 1379 pr_debug("resolving all inodes for extent %llu\n",
@@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1192 spin_lock(&delayed_refs->lock); 1390 spin_lock(&delayed_refs->lock);
1193 btrfs_get_delayed_seq(delayed_refs, &seq_elem); 1391 btrfs_get_delayed_seq(delayed_refs, &seq_elem);
1194 spin_unlock(&delayed_refs->lock); 1392 spin_unlock(&delayed_refs->lock);
1393 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1195 } 1394 }
1196 1395
1197 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, 1396 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
1198 extent_item_pos, seq_elem.seq, 1397 seq_elem.seq, tree_mod_seq_elem.seq, &refs,
1199 &refs); 1398 &extent_item_pos);
1200
1201 if (ret) 1399 if (ret)
1202 goto out; 1400 goto out;
1203 1401
1204 while (!ret && (ref_node = ulist_next(refs, ref_node))) { 1402 ULIST_ITER_INIT(&ref_uiter);
1205 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, 1403 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
1206 seq_elem.seq, &roots); 1404 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
1405 seq_elem.seq,
1406 tree_mod_seq_elem.seq, &roots);
1207 if (ret) 1407 if (ret)
1208 break; 1408 break;
1209 while (!ret && (root_node = ulist_next(roots, root_node))) { 1409 ULIST_ITER_INIT(&root_uiter);
1210 pr_debug("root %llu references leaf %llu\n", 1410 while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
1211 root_node->val, ref_node->val); 1411 pr_debug("root %llu references leaf %llu, data list "
1212 ret = iterate_leaf_refs(fs_info, ref_node->val, 1412 "%#lx\n", root_node->val, ref_node->val,
1213 extent_item_objectid, 1413 ref_node->aux);
1214 extent_item_pos, root_node->val, 1414 ret = iterate_leaf_refs(
1215 iterate, ctx); 1415 (struct extent_inode_elem *)ref_node->aux,
1416 root_node->val, extent_item_objectid,
1417 iterate, ctx);
1216 } 1418 }
1419 ulist_free(roots);
1420 roots = NULL;
1217 } 1421 }
1218 1422
1219 ulist_free(refs); 1423 free_leaf_list(refs);
1220 ulist_free(roots); 1424 ulist_free(roots);
1221out: 1425out:
1222 if (!search_commit_root) { 1426 if (!search_commit_root) {
1427 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1223 btrfs_put_delayed_seq(delayed_refs, &seq_elem); 1428 btrfs_put_delayed_seq(delayed_refs, &seq_elem);
1224 btrfs_end_transaction(trans, fs_info->extent_root); 1429 btrfs_end_transaction(trans, fs_info->extent_root);
1225 } 1430 }
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 57ea2e959e4d..c18d8ac7b795 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
58 58
59int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 59int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
60 struct btrfs_fs_info *fs_info, u64 bytenr, 60 struct btrfs_fs_info *fs_info, u64 bytenr,
61 u64 num_bytes, u64 seq, struct ulist **roots); 61 u64 delayed_ref_seq, u64 time_seq,
62 struct ulist **roots);
62 63
63struct btrfs_data_container *init_data_container(u32 total_bytes); 64struct btrfs_data_container *init_data_container(u32 total_bytes);
64struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, 65struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9b15fd5204..e616f8872e69 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -24,6 +24,20 @@
24#include "ordered-data.h" 24#include "ordered-data.h"
25#include "delayed-inode.h" 25#include "delayed-inode.h"
26 26
27/*
28 * ordered_data_close is set by truncate when a file that used
29 * to have good data has been truncated to zero. When it is set
30 * the btrfs file release call will add this inode to the
31 * ordered operations list so that we make sure to flush out any
32 * new data the application may have written before commit.
33 */
34#define BTRFS_INODE_ORDERED_DATA_CLOSE 0
35#define BTRFS_INODE_ORPHAN_META_RESERVED 1
36#define BTRFS_INODE_DUMMY 2
37#define BTRFS_INODE_IN_DEFRAG 3
38#define BTRFS_INODE_DELALLOC_META_RESERVED 4
39#define BTRFS_INODE_HAS_ORPHAN_ITEM 5
40
27/* in memory btrfs inode */ 41/* in memory btrfs inode */
28struct btrfs_inode { 42struct btrfs_inode {
29 /* which subvolume this inode belongs to */ 43 /* which subvolume this inode belongs to */
@@ -57,9 +71,6 @@ struct btrfs_inode {
57 /* used to order data wrt metadata */ 71 /* used to order data wrt metadata */
58 struct btrfs_ordered_inode_tree ordered_tree; 72 struct btrfs_ordered_inode_tree ordered_tree;
59 73
60 /* for keeping track of orphaned inodes */
61 struct list_head i_orphan;
62
63 /* list of all the delalloc inodes in the FS. There are times we need 74 /* list of all the delalloc inodes in the FS. There are times we need
64 * to write all the delalloc pages to disk, and this list is used 75 * to write all the delalloc pages to disk, and this list is used
65 * to walk them all. 76 * to walk them all.
@@ -78,14 +89,13 @@ struct btrfs_inode {
78 /* the space_info for where this inode's data allocations are done */ 89 /* the space_info for where this inode's data allocations are done */
79 struct btrfs_space_info *space_info; 90 struct btrfs_space_info *space_info;
80 91
92 unsigned long runtime_flags;
93
81 /* full 64 bit generation number, struct vfs_inode doesn't have a big 94 /* full 64 bit generation number, struct vfs_inode doesn't have a big
82 * enough field for this. 95 * enough field for this.
83 */ 96 */
84 u64 generation; 97 u64 generation;
85 98
86 /* sequence number for NFS changes */
87 u64 sequence;
88
89 /* 99 /*
90 * transid of the trans_handle that last modified this inode 100 * transid of the trans_handle that last modified this inode
91 */ 101 */
@@ -145,22 +155,9 @@ struct btrfs_inode {
145 unsigned reserved_extents; 155 unsigned reserved_extents;
146 156
147 /* 157 /*
148 * ordered_data_close is set by truncate when a file that used
149 * to have good data has been truncated to zero. When it is set
150 * the btrfs file release call will add this inode to the
151 * ordered operations list so that we make sure to flush out any
152 * new data the application may have written before commit.
153 */
154 unsigned ordered_data_close:1;
155 unsigned orphan_meta_reserved:1;
156 unsigned dummy_inode:1;
157 unsigned in_defrag:1;
158 unsigned delalloc_meta_reserved:1;
159
160 /*
161 * always compress this one file 158 * always compress this one file
162 */ 159 */
163 unsigned force_compress:4; 160 unsigned force_compress;
164 161
165 struct btrfs_delayed_node *delayed_node; 162 struct btrfs_delayed_node *delayed_node;
166 163
@@ -202,4 +199,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
202 return false; 199 return false;
203} 200}
204 201
/*
 * NOTE(review): diff listing; added (new-side) code shown complete.
 *
 * btrfs_inode_in_log - report whether this inode's most recent changes are
 * already captured in the tree log for the given transaction generation.
 *
 * Returns 1 when the inode was last logged in @generation AND its last
 * sub-transaction is covered by the root's last log commit; 0 otherwise.
 * Takes root->log_mutex to read logged_trans / last_log_commit
 * consistently with the log-commit path.
 */
 202static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 203{
 204 struct btrfs_root *root = BTRFS_I(inode)->root;
 205 int ret = 0;
 206
 207 mutex_lock(&root->log_mutex);
 208 if (BTRFS_I(inode)->logged_trans == generation &&
 209 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
 210 ret = 1;
 211 mutex_unlock(&root->log_mutex);
 212 return ret;
 213}
214
205#endif 215#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index c053e90f2006..9cebb1fd6a3c 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -103,8 +103,6 @@
103#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 103#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
104#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, 104#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
105 * excluding " [...]" */ 105 * excluding " [...]" */
106#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
107
108#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) 106#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
109 107
110/* 108/*
@@ -210,8 +208,9 @@ struct btrfsic_block_data_ctx {
210 u64 dev_bytenr; /* physical bytenr on device */ 208 u64 dev_bytenr; /* physical bytenr on device */
211 u32 len; 209 u32 len;
212 struct btrfsic_dev_state *dev; 210 struct btrfsic_dev_state *dev;
213 char *data; 211 char **datav;
214 struct buffer_head *bh; /* do not use if set to NULL */ 212 struct page **pagev;
213 void *mem_to_free;
215}; 214};
216 215
217/* This structure is used to implement recursion without occupying 216/* This structure is used to implement recursion without occupying
@@ -243,6 +242,8 @@ struct btrfsic_state {
243 struct btrfs_root *root; 242 struct btrfs_root *root;
244 u64 max_superblock_generation; 243 u64 max_superblock_generation;
245 struct btrfsic_block *latest_superblock; 244 struct btrfsic_block *latest_superblock;
245 u32 metablock_size;
246 u32 datablock_size;
246}; 247};
247 248
248static void btrfsic_block_init(struct btrfsic_block *b); 249static void btrfsic_block_init(struct btrfsic_block *b);
@@ -290,8 +291,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
290static int btrfsic_process_metablock(struct btrfsic_state *state, 291static int btrfsic_process_metablock(struct btrfsic_state *state,
291 struct btrfsic_block *block, 292 struct btrfsic_block *block,
292 struct btrfsic_block_data_ctx *block_ctx, 293 struct btrfsic_block_data_ctx *block_ctx,
293 struct btrfs_header *hdr,
294 int limit_nesting, int force_iodone_flag); 294 int limit_nesting, int force_iodone_flag);
295static void btrfsic_read_from_block_data(
296 struct btrfsic_block_data_ctx *block_ctx,
297 void *dst, u32 offset, size_t len);
295static int btrfsic_create_link_to_next_block( 298static int btrfsic_create_link_to_next_block(
296 struct btrfsic_state *state, 299 struct btrfsic_state *state,
297 struct btrfsic_block *block, 300 struct btrfsic_block *block,
@@ -318,12 +321,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
318static int btrfsic_read_block(struct btrfsic_state *state, 321static int btrfsic_read_block(struct btrfsic_state *state,
319 struct btrfsic_block_data_ctx *block_ctx); 322 struct btrfsic_block_data_ctx *block_ctx);
320static void btrfsic_dump_database(struct btrfsic_state *state); 323static void btrfsic_dump_database(struct btrfsic_state *state);
324static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
321static int btrfsic_test_for_metadata(struct btrfsic_state *state, 325static int btrfsic_test_for_metadata(struct btrfsic_state *state,
322 const u8 *data, unsigned int size); 326 char **datav, unsigned int num_pages);
323static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, 327static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
324 u64 dev_bytenr, u8 *mapped_data, 328 u64 dev_bytenr, char **mapped_datav,
325 unsigned int len, struct bio *bio, 329 unsigned int num_pages,
326 int *bio_is_patched, 330 struct bio *bio, int *bio_is_patched,
327 struct buffer_head *bh, 331 struct buffer_head *bh,
328 int submit_bio_bh_rw); 332 int submit_bio_bh_rw);
329static int btrfsic_process_written_superblock( 333static int btrfsic_process_written_superblock(
@@ -375,7 +379,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
375static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, 379static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
376 u64 bytenr, 380 u64 bytenr,
377 struct btrfsic_dev_state *dev_state, 381 struct btrfsic_dev_state *dev_state,
378 u64 dev_bytenr, char *data); 382 u64 dev_bytenr);
379 383
380static struct mutex btrfsic_mutex; 384static struct mutex btrfsic_mutex;
381static int btrfsic_is_initialized; 385static int btrfsic_is_initialized;
@@ -651,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
651 int pass; 655 int pass;
652 656
653 BUG_ON(NULL == state); 657 BUG_ON(NULL == state);
654 selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); 658 selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
655 if (NULL == selected_super) { 659 if (NULL == selected_super) {
656 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); 660 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
657 return -1; 661 return -1;
@@ -718,7 +722,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
718 722
719 num_copies = 723 num_copies =
720 btrfs_num_copies(&state->root->fs_info->mapping_tree, 724 btrfs_num_copies(&state->root->fs_info->mapping_tree,
721 next_bytenr, PAGE_SIZE); 725 next_bytenr, state->metablock_size);
722 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 726 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
723 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 727 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
724 (unsigned long long)next_bytenr, num_copies); 728 (unsigned long long)next_bytenr, num_copies);
@@ -727,9 +731,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
727 struct btrfsic_block *next_block; 731 struct btrfsic_block *next_block;
728 struct btrfsic_block_data_ctx tmp_next_block_ctx; 732 struct btrfsic_block_data_ctx tmp_next_block_ctx;
729 struct btrfsic_block_link *l; 733 struct btrfsic_block_link *l;
730 struct btrfs_header *hdr;
731 734
732 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 735 ret = btrfsic_map_block(state, next_bytenr,
736 state->metablock_size,
733 &tmp_next_block_ctx, 737 &tmp_next_block_ctx,
734 mirror_num); 738 mirror_num);
735 if (ret) { 739 if (ret) {
@@ -758,7 +762,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
758 BUG_ON(NULL == l); 762 BUG_ON(NULL == l);
759 763
760 ret = btrfsic_read_block(state, &tmp_next_block_ctx); 764 ret = btrfsic_read_block(state, &tmp_next_block_ctx);
761 if (ret < (int)BTRFSIC_BLOCK_SIZE) { 765 if (ret < (int)PAGE_CACHE_SIZE) {
762 printk(KERN_INFO 766 printk(KERN_INFO
763 "btrfsic: read @logical %llu failed!\n", 767 "btrfsic: read @logical %llu failed!\n",
764 (unsigned long long) 768 (unsigned long long)
@@ -768,11 +772,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
768 return -1; 772 return -1;
769 } 773 }
770 774
771 hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
772 ret = btrfsic_process_metablock(state, 775 ret = btrfsic_process_metablock(state,
773 next_block, 776 next_block,
774 &tmp_next_block_ctx, 777 &tmp_next_block_ctx,
775 hdr,
776 BTRFS_MAX_LEVEL + 3, 1); 778 BTRFS_MAX_LEVEL + 3, 1);
777 btrfsic_release_block_ctx(&tmp_next_block_ctx); 779 btrfsic_release_block_ctx(&tmp_next_block_ctx);
778 } 780 }
@@ -799,7 +801,10 @@ static int btrfsic_process_superblock_dev_mirror(
799 801
800 /* super block bytenr is always the unmapped device bytenr */ 802 /* super block bytenr is always the unmapped device bytenr */
801 dev_bytenr = btrfs_sb_offset(superblock_mirror_num); 803 dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
802 bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); 804 if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
805 return -1;
806 bh = __bread(superblock_bdev, dev_bytenr / 4096,
807 BTRFS_SUPER_INFO_SIZE);
803 if (NULL == bh) 808 if (NULL == bh)
804 return -1; 809 return -1;
805 super_tmp = (struct btrfs_super_block *) 810 super_tmp = (struct btrfs_super_block *)
@@ -808,7 +813,10 @@ static int btrfsic_process_superblock_dev_mirror(
808 if (btrfs_super_bytenr(super_tmp) != dev_bytenr || 813 if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
809 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, 814 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
810 sizeof(super_tmp->magic)) || 815 sizeof(super_tmp->magic)) ||
811 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { 816 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
817 btrfs_super_nodesize(super_tmp) != state->metablock_size ||
818 btrfs_super_leafsize(super_tmp) != state->metablock_size ||
819 btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
812 brelse(bh); 820 brelse(bh);
813 return 0; 821 return 0;
814 } 822 }
@@ -893,7 +901,7 @@ static int btrfsic_process_superblock_dev_mirror(
893 901
894 num_copies = 902 num_copies =
895 btrfs_num_copies(&state->root->fs_info->mapping_tree, 903 btrfs_num_copies(&state->root->fs_info->mapping_tree,
896 next_bytenr, PAGE_SIZE); 904 next_bytenr, state->metablock_size);
897 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 905 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
898 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 906 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
899 (unsigned long long)next_bytenr, num_copies); 907 (unsigned long long)next_bytenr, num_copies);
@@ -902,7 +910,8 @@ static int btrfsic_process_superblock_dev_mirror(
902 struct btrfsic_block_data_ctx tmp_next_block_ctx; 910 struct btrfsic_block_data_ctx tmp_next_block_ctx;
903 struct btrfsic_block_link *l; 911 struct btrfsic_block_link *l;
904 912
905 if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 913 if (btrfsic_map_block(state, next_bytenr,
914 state->metablock_size,
906 &tmp_next_block_ctx, 915 &tmp_next_block_ctx,
907 mirror_num)) { 916 mirror_num)) {
908 printk(KERN_INFO "btrfsic: btrfsic_map_block(" 917 printk(KERN_INFO "btrfsic: btrfsic_map_block("
@@ -966,13 +975,15 @@ static int btrfsic_process_metablock(
966 struct btrfsic_state *state, 975 struct btrfsic_state *state,
967 struct btrfsic_block *const first_block, 976 struct btrfsic_block *const first_block,
968 struct btrfsic_block_data_ctx *const first_block_ctx, 977 struct btrfsic_block_data_ctx *const first_block_ctx,
969 struct btrfs_header *const first_hdr,
970 int first_limit_nesting, int force_iodone_flag) 978 int first_limit_nesting, int force_iodone_flag)
971{ 979{
972 struct btrfsic_stack_frame initial_stack_frame = { 0 }; 980 struct btrfsic_stack_frame initial_stack_frame = { 0 };
973 struct btrfsic_stack_frame *sf; 981 struct btrfsic_stack_frame *sf;
974 struct btrfsic_stack_frame *next_stack; 982 struct btrfsic_stack_frame *next_stack;
983 struct btrfs_header *const first_hdr =
984 (struct btrfs_header *)first_block_ctx->datav[0];
975 985
986 BUG_ON(!first_hdr);
976 sf = &initial_stack_frame; 987 sf = &initial_stack_frame;
977 sf->error = 0; 988 sf->error = 0;
978 sf->i = -1; 989 sf->i = -1;
@@ -1012,21 +1023,47 @@ continue_with_current_leaf_stack_frame:
1012 } 1023 }
1013 1024
1014 if (sf->i < sf->nr) { 1025 if (sf->i < sf->nr) {
1015 struct btrfs_item *disk_item = leafhdr->items + sf->i; 1026 struct btrfs_item disk_item;
1016 struct btrfs_disk_key *disk_key = &disk_item->key; 1027 u32 disk_item_offset =
1028 (uintptr_t)(leafhdr->items + sf->i) -
1029 (uintptr_t)leafhdr;
1030 struct btrfs_disk_key *disk_key;
1017 u8 type; 1031 u8 type;
1018 const u32 item_offset = le32_to_cpu(disk_item->offset); 1032 u32 item_offset;
1019 1033
1034 if (disk_item_offset + sizeof(struct btrfs_item) >
1035 sf->block_ctx->len) {
1036leaf_item_out_of_bounce_error:
1037 printk(KERN_INFO
1038 "btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
1039 sf->block_ctx->start,
1040 sf->block_ctx->dev->name);
1041 goto one_stack_frame_backwards;
1042 }
1043 btrfsic_read_from_block_data(sf->block_ctx,
1044 &disk_item,
1045 disk_item_offset,
1046 sizeof(struct btrfs_item));
1047 item_offset = le32_to_cpu(disk_item.offset);
1048 disk_key = &disk_item.key;
1020 type = disk_key->type; 1049 type = disk_key->type;
1021 1050
1022 if (BTRFS_ROOT_ITEM_KEY == type) { 1051 if (BTRFS_ROOT_ITEM_KEY == type) {
1023 const struct btrfs_root_item *const root_item = 1052 struct btrfs_root_item root_item;
1024 (struct btrfs_root_item *) 1053 u32 root_item_offset;
1025 (sf->block_ctx->data + 1054 u64 next_bytenr;
1026 offsetof(struct btrfs_leaf, items) + 1055
1027 item_offset); 1056 root_item_offset = item_offset +
1028 const u64 next_bytenr = 1057 offsetof(struct btrfs_leaf, items);
1029 le64_to_cpu(root_item->bytenr); 1058 if (root_item_offset +
1059 sizeof(struct btrfs_root_item) >
1060 sf->block_ctx->len)
1061 goto leaf_item_out_of_bounce_error;
1062 btrfsic_read_from_block_data(
1063 sf->block_ctx, &root_item,
1064 root_item_offset,
1065 sizeof(struct btrfs_root_item));
1066 next_bytenr = le64_to_cpu(root_item.bytenr);
1030 1067
1031 sf->error = 1068 sf->error =
1032 btrfsic_create_link_to_next_block( 1069 btrfsic_create_link_to_next_block(
@@ -1041,7 +1078,7 @@ continue_with_current_leaf_stack_frame:
1041 &sf->num_copies, 1078 &sf->num_copies,
1042 &sf->mirror_num, 1079 &sf->mirror_num,
1043 disk_key, 1080 disk_key,
1044 le64_to_cpu(root_item-> 1081 le64_to_cpu(root_item.
1045 generation)); 1082 generation));
1046 if (sf->error) 1083 if (sf->error)
1047 goto one_stack_frame_backwards; 1084 goto one_stack_frame_backwards;
@@ -1049,7 +1086,7 @@ continue_with_current_leaf_stack_frame:
1049 if (NULL != sf->next_block) { 1086 if (NULL != sf->next_block) {
1050 struct btrfs_header *const next_hdr = 1087 struct btrfs_header *const next_hdr =
1051 (struct btrfs_header *) 1088 (struct btrfs_header *)
1052 sf->next_block_ctx.data; 1089 sf->next_block_ctx.datav[0];
1053 1090
1054 next_stack = 1091 next_stack =
1055 btrfsic_stack_frame_alloc(); 1092 btrfsic_stack_frame_alloc();
@@ -1111,10 +1148,24 @@ continue_with_current_node_stack_frame:
1111 } 1148 }
1112 1149
1113 if (sf->i < sf->nr) { 1150 if (sf->i < sf->nr) {
1114 struct btrfs_key_ptr *disk_key_ptr = 1151 struct btrfs_key_ptr key_ptr;
1115 nodehdr->ptrs + sf->i; 1152 u32 key_ptr_offset;
1116 const u64 next_bytenr = 1153 u64 next_bytenr;
1117 le64_to_cpu(disk_key_ptr->blockptr); 1154
1155 key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
1156 (uintptr_t)nodehdr;
1157 if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
1158 sf->block_ctx->len) {
1159 printk(KERN_INFO
1160 "btrfsic: node item out of bounce at logical %llu, dev %s\n",
1161 sf->block_ctx->start,
1162 sf->block_ctx->dev->name);
1163 goto one_stack_frame_backwards;
1164 }
1165 btrfsic_read_from_block_data(
1166 sf->block_ctx, &key_ptr, key_ptr_offset,
1167 sizeof(struct btrfs_key_ptr));
1168 next_bytenr = le64_to_cpu(key_ptr.blockptr);
1118 1169
1119 sf->error = btrfsic_create_link_to_next_block( 1170 sf->error = btrfsic_create_link_to_next_block(
1120 state, 1171 state,
@@ -1127,15 +1178,15 @@ continue_with_current_node_stack_frame:
1127 force_iodone_flag, 1178 force_iodone_flag,
1128 &sf->num_copies, 1179 &sf->num_copies,
1129 &sf->mirror_num, 1180 &sf->mirror_num,
1130 &disk_key_ptr->key, 1181 &key_ptr.key,
1131 le64_to_cpu(disk_key_ptr->generation)); 1182 le64_to_cpu(key_ptr.generation));
1132 if (sf->error) 1183 if (sf->error)
1133 goto one_stack_frame_backwards; 1184 goto one_stack_frame_backwards;
1134 1185
1135 if (NULL != sf->next_block) { 1186 if (NULL != sf->next_block) {
1136 struct btrfs_header *const next_hdr = 1187 struct btrfs_header *const next_hdr =
1137 (struct btrfs_header *) 1188 (struct btrfs_header *)
1138 sf->next_block_ctx.data; 1189 sf->next_block_ctx.datav[0];
1139 1190
1140 next_stack = btrfsic_stack_frame_alloc(); 1191 next_stack = btrfsic_stack_frame_alloc();
1141 if (NULL == next_stack) 1192 if (NULL == next_stack)
@@ -1181,6 +1232,35 @@ one_stack_frame_backwards:
1181 return sf->error; 1232 return sf->error;
1182} 1233}
1183 1234
1235static void btrfsic_read_from_block_data(
1236 struct btrfsic_block_data_ctx *block_ctx,
1237 void *dstv, u32 offset, size_t len)
1238{
1239 size_t cur;
1240 size_t offset_in_page;
1241 char *kaddr;
1242 char *dst = (char *)dstv;
1243 size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
1244 unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
1245
1246 WARN_ON(offset + len > block_ctx->len);
1247 offset_in_page = (start_offset + offset) &
1248 ((unsigned long)PAGE_CACHE_SIZE - 1);
1249
1250 while (len > 0) {
1251 cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
1252 BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
1253 PAGE_CACHE_SHIFT);
1254 kaddr = block_ctx->datav[i];
1255 memcpy(dst, kaddr + offset_in_page, cur);
1256
1257 dst += cur;
1258 len -= cur;
1259 offset_in_page = 0;
1260 i++;
1261 }
1262}
1263
1184static int btrfsic_create_link_to_next_block( 1264static int btrfsic_create_link_to_next_block(
1185 struct btrfsic_state *state, 1265 struct btrfsic_state *state,
1186 struct btrfsic_block *block, 1266 struct btrfsic_block *block,
@@ -1204,7 +1284,7 @@ static int btrfsic_create_link_to_next_block(
1204 if (0 == *num_copiesp) { 1284 if (0 == *num_copiesp) {
1205 *num_copiesp = 1285 *num_copiesp =
1206 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1286 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1207 next_bytenr, PAGE_SIZE); 1287 next_bytenr, state->metablock_size);
1208 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1288 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1209 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1289 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1210 (unsigned long long)next_bytenr, *num_copiesp); 1290 (unsigned long long)next_bytenr, *num_copiesp);
@@ -1219,7 +1299,7 @@ static int btrfsic_create_link_to_next_block(
1219 "btrfsic_create_link_to_next_block(mirror_num=%d)\n", 1299 "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
1220 *mirror_nump); 1300 *mirror_nump);
1221 ret = btrfsic_map_block(state, next_bytenr, 1301 ret = btrfsic_map_block(state, next_bytenr,
1222 BTRFSIC_BLOCK_SIZE, 1302 state->metablock_size,
1223 next_block_ctx, *mirror_nump); 1303 next_block_ctx, *mirror_nump);
1224 if (ret) { 1304 if (ret) {
1225 printk(KERN_INFO 1305 printk(KERN_INFO
@@ -1314,7 +1394,7 @@ static int btrfsic_create_link_to_next_block(
1314 1394
1315 if (limit_nesting > 0 && did_alloc_block_link) { 1395 if (limit_nesting > 0 && did_alloc_block_link) {
1316 ret = btrfsic_read_block(state, next_block_ctx); 1396 ret = btrfsic_read_block(state, next_block_ctx);
1317 if (ret < (int)BTRFSIC_BLOCK_SIZE) { 1397 if (ret < (int)next_block_ctx->len) {
1318 printk(KERN_INFO 1398 printk(KERN_INFO
1319 "btrfsic: read block @logical %llu failed!\n", 1399 "btrfsic: read block @logical %llu failed!\n",
1320 (unsigned long long)next_bytenr); 1400 (unsigned long long)next_bytenr);
@@ -1339,43 +1419,74 @@ static int btrfsic_handle_extent_data(
1339 u32 item_offset, int force_iodone_flag) 1419 u32 item_offset, int force_iodone_flag)
1340{ 1420{
1341 int ret; 1421 int ret;
1342 struct btrfs_file_extent_item *file_extent_item = 1422 struct btrfs_file_extent_item file_extent_item;
1343 (struct btrfs_file_extent_item *)(block_ctx->data + 1423 u64 file_extent_item_offset;
1344 offsetof(struct btrfs_leaf, 1424 u64 next_bytenr;
1345 items) + item_offset); 1425 u64 num_bytes;
1346 u64 next_bytenr = 1426 u64 generation;
1347 le64_to_cpu(file_extent_item->disk_bytenr) +
1348 le64_to_cpu(file_extent_item->offset);
1349 u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
1350 u64 generation = le64_to_cpu(file_extent_item->generation);
1351 struct btrfsic_block_link *l; 1427 struct btrfsic_block_link *l;
1352 1428
1429 file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
1430 item_offset;
1431 if (file_extent_item_offset +
1432 offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
1433 block_ctx->len) {
1434 printk(KERN_INFO
1435 "btrfsic: file item out of bounce at logical %llu, dev %s\n",
1436 block_ctx->start, block_ctx->dev->name);
1437 return -1;
1438 }
1439
1440 btrfsic_read_from_block_data(block_ctx, &file_extent_item,
1441 file_extent_item_offset,
1442 offsetof(struct btrfs_file_extent_item, disk_num_bytes));
1443 if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
1444 ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) {
1445 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1446 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
1447 file_extent_item.type,
1448 (unsigned long long)
1449 le64_to_cpu(file_extent_item.disk_bytenr));
1450 return 0;
1451 }
1452
1453 if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
1454 block_ctx->len) {
1455 printk(KERN_INFO
1456 "btrfsic: file item out of bounce at logical %llu, dev %s\n",
1457 block_ctx->start, block_ctx->dev->name);
1458 return -1;
1459 }
1460 btrfsic_read_from_block_data(block_ctx, &file_extent_item,
1461 file_extent_item_offset,
1462 sizeof(struct btrfs_file_extent_item));
1463 next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) +
1464 le64_to_cpu(file_extent_item.offset);
1465 generation = le64_to_cpu(file_extent_item.generation);
1466 num_bytes = le64_to_cpu(file_extent_item.num_bytes);
1467 generation = le64_to_cpu(file_extent_item.generation);
1468
1353 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) 1469 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1354 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," 1470 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
1355 " offset = %llu, num_bytes = %llu\n", 1471 " offset = %llu, num_bytes = %llu\n",
1356 file_extent_item->type, 1472 file_extent_item.type,
1357 (unsigned long long) 1473 (unsigned long long)
1358 le64_to_cpu(file_extent_item->disk_bytenr), 1474 le64_to_cpu(file_extent_item.disk_bytenr),
1359 (unsigned long long) 1475 (unsigned long long)le64_to_cpu(file_extent_item.offset),
1360 le64_to_cpu(file_extent_item->offset), 1476 (unsigned long long)num_bytes);
1361 (unsigned long long)
1362 le64_to_cpu(file_extent_item->num_bytes));
1363 if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
1364 ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
1365 return 0;
1366 while (num_bytes > 0) { 1477 while (num_bytes > 0) {
1367 u32 chunk_len; 1478 u32 chunk_len;
1368 int num_copies; 1479 int num_copies;
1369 int mirror_num; 1480 int mirror_num;
1370 1481
1371 if (num_bytes > BTRFSIC_BLOCK_SIZE) 1482 if (num_bytes > state->datablock_size)
1372 chunk_len = BTRFSIC_BLOCK_SIZE; 1483 chunk_len = state->datablock_size;
1373 else 1484 else
1374 chunk_len = num_bytes; 1485 chunk_len = num_bytes;
1375 1486
1376 num_copies = 1487 num_copies =
1377 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1488 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1378 next_bytenr, PAGE_SIZE); 1489 next_bytenr, state->datablock_size);
1379 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1490 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1380 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1491 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1381 (unsigned long long)next_bytenr, num_copies); 1492 (unsigned long long)next_bytenr, num_copies);
@@ -1475,8 +1586,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1475 block_ctx_out->dev_bytenr = multi->stripes[0].physical; 1586 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
1476 block_ctx_out->start = bytenr; 1587 block_ctx_out->start = bytenr;
1477 block_ctx_out->len = len; 1588 block_ctx_out->len = len;
1478 block_ctx_out->data = NULL; 1589 block_ctx_out->datav = NULL;
1479 block_ctx_out->bh = NULL; 1590 block_ctx_out->pagev = NULL;
1591 block_ctx_out->mem_to_free = NULL;
1480 1592
1481 if (0 == ret) 1593 if (0 == ret)
1482 kfree(multi); 1594 kfree(multi);
@@ -1496,8 +1608,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1496 block_ctx_out->dev_bytenr = bytenr; 1608 block_ctx_out->dev_bytenr = bytenr;
1497 block_ctx_out->start = bytenr; 1609 block_ctx_out->start = bytenr;
1498 block_ctx_out->len = len; 1610 block_ctx_out->len = len;
1499 block_ctx_out->data = NULL; 1611 block_ctx_out->datav = NULL;
1500 block_ctx_out->bh = NULL; 1612 block_ctx_out->pagev = NULL;
1613 block_ctx_out->mem_to_free = NULL;
1501 if (NULL != block_ctx_out->dev) { 1614 if (NULL != block_ctx_out->dev) {
1502 return 0; 1615 return 0;
1503 } else { 1616 } else {
@@ -1508,38 +1621,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1508 1621
1509static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) 1622static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1510{ 1623{
1511 if (NULL != block_ctx->bh) { 1624 if (block_ctx->mem_to_free) {
1512 brelse(block_ctx->bh); 1625 unsigned int num_pages;
1513 block_ctx->bh = NULL; 1626
1627 BUG_ON(!block_ctx->datav);
1628 BUG_ON(!block_ctx->pagev);
1629 num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
1630 PAGE_CACHE_SHIFT;
1631 while (num_pages > 0) {
1632 num_pages--;
1633 if (block_ctx->datav[num_pages]) {
1634 kunmap(block_ctx->pagev[num_pages]);
1635 block_ctx->datav[num_pages] = NULL;
1636 }
1637 if (block_ctx->pagev[num_pages]) {
1638 __free_page(block_ctx->pagev[num_pages]);
1639 block_ctx->pagev[num_pages] = NULL;
1640 }
1641 }
1642
1643 kfree(block_ctx->mem_to_free);
1644 block_ctx->mem_to_free = NULL;
1645 block_ctx->pagev = NULL;
1646 block_ctx->datav = NULL;
1514 } 1647 }
1515} 1648}
1516 1649
1517static int btrfsic_read_block(struct btrfsic_state *state, 1650static int btrfsic_read_block(struct btrfsic_state *state,
1518 struct btrfsic_block_data_ctx *block_ctx) 1651 struct btrfsic_block_data_ctx *block_ctx)
1519{ 1652{
1520 block_ctx->bh = NULL; 1653 unsigned int num_pages;
1521 if (block_ctx->dev_bytenr & 4095) { 1654 unsigned int i;
1655 u64 dev_bytenr;
1656 int ret;
1657
1658 BUG_ON(block_ctx->datav);
1659 BUG_ON(block_ctx->pagev);
1660 BUG_ON(block_ctx->mem_to_free);
1661 if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
1522 printk(KERN_INFO 1662 printk(KERN_INFO
1523 "btrfsic: read_block() with unaligned bytenr %llu\n", 1663 "btrfsic: read_block() with unaligned bytenr %llu\n",
1524 (unsigned long long)block_ctx->dev_bytenr); 1664 (unsigned long long)block_ctx->dev_bytenr);
1525 return -1; 1665 return -1;
1526 } 1666 }
1527 if (block_ctx->len > 4096) { 1667
1528 printk(KERN_INFO 1668 num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
1529 "btrfsic: read_block() with too huge size %d\n", 1669 PAGE_CACHE_SHIFT;
1530 block_ctx->len); 1670 block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
1671 sizeof(*block_ctx->pagev)) *
1672 num_pages, GFP_NOFS);
1673 if (!block_ctx->mem_to_free)
1531 return -1; 1674 return -1;
1675 block_ctx->datav = block_ctx->mem_to_free;
1676 block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
1677 for (i = 0; i < num_pages; i++) {
1678 block_ctx->pagev[i] = alloc_page(GFP_NOFS);
1679 if (!block_ctx->pagev[i])
1680 return -1;
1532 } 1681 }
1533 1682
1534 block_ctx->bh = __bread(block_ctx->dev->bdev, 1683 dev_bytenr = block_ctx->dev_bytenr;
1535 block_ctx->dev_bytenr >> 12, 4096); 1684 for (i = 0; i < num_pages;) {
1536 if (NULL == block_ctx->bh) 1685 struct bio *bio;
1537 return -1; 1686 unsigned int j;
1538 block_ctx->data = block_ctx->bh->b_data; 1687 DECLARE_COMPLETION_ONSTACK(complete);
1688
1689 bio = bio_alloc(GFP_NOFS, num_pages - i);
1690 if (!bio) {
1691 printk(KERN_INFO
1692 "btrfsic: bio_alloc() for %u pages failed!\n",
1693 num_pages - i);
1694 return -1;
1695 }
1696 bio->bi_bdev = block_ctx->dev->bdev;
1697 bio->bi_sector = dev_bytenr >> 9;
1698 bio->bi_end_io = btrfsic_complete_bio_end_io;
1699 bio->bi_private = &complete;
1700
1701 for (j = i; j < num_pages; j++) {
1702 ret = bio_add_page(bio, block_ctx->pagev[j],
1703 PAGE_CACHE_SIZE, 0);
1704 if (PAGE_CACHE_SIZE != ret)
1705 break;
1706 }
1707 if (j == i) {
1708 printk(KERN_INFO
1709 "btrfsic: error, failed to add a single page!\n");
1710 return -1;
1711 }
1712 submit_bio(READ, bio);
1713
1714 /* this will also unplug the queue */
1715 wait_for_completion(&complete);
1716
1717 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1718 printk(KERN_INFO
1719 "btrfsic: read error at logical %llu dev %s!\n",
1720 block_ctx->start, block_ctx->dev->name);
1721 bio_put(bio);
1722 return -1;
1723 }
1724 bio_put(bio);
1725 dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
1726 i = j;
1727 }
1728 for (i = 0; i < num_pages; i++) {
1729 block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
1730 if (!block_ctx->datav[i]) {
1731 printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
1732 block_ctx->dev->name);
1733 return -1;
1734 }
1735 }
1539 1736
1540 return block_ctx->len; 1737 return block_ctx->len;
1541} 1738}
1542 1739
1740static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
1741{
1742 complete((struct completion *)bio->bi_private);
1743}
1744
1543static void btrfsic_dump_database(struct btrfsic_state *state) 1745static void btrfsic_dump_database(struct btrfsic_state *state)
1544{ 1746{
1545 struct list_head *elem_all; 1747 struct list_head *elem_all;
@@ -1617,32 +1819,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
1617 * (note that this test fails for the super block) 1819 * (note that this test fails for the super block)
1618 */ 1820 */
1619static int btrfsic_test_for_metadata(struct btrfsic_state *state, 1821static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1620 const u8 *data, unsigned int size) 1822 char **datav, unsigned int num_pages)
1621{ 1823{
1622 struct btrfs_header *h; 1824 struct btrfs_header *h;
1623 u8 csum[BTRFS_CSUM_SIZE]; 1825 u8 csum[BTRFS_CSUM_SIZE];
1624 u32 crc = ~(u32)0; 1826 u32 crc = ~(u32)0;
1625 int fail = 0; 1827 unsigned int i;
1626 int crc_fail = 0;
1627 1828
1628 h = (struct btrfs_header *)data; 1829 if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
1830 return 1; /* not metadata */
1831 num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
1832 h = (struct btrfs_header *)datav[0];
1629 1833
1630 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) 1834 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
1631 fail++; 1835 return 1;
1836
1837 for (i = 0; i < num_pages; i++) {
1838 u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
1839 size_t sublen = i ? PAGE_CACHE_SIZE :
1840 (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
1632 1841
1633 crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); 1842 crc = crc32c(crc, data, sublen);
1843 }
1634 btrfs_csum_final(crc, csum); 1844 btrfs_csum_final(crc, csum);
1635 if (memcmp(csum, h->csum, state->csum_size)) 1845 if (memcmp(csum, h->csum, state->csum_size))
1636 crc_fail++; 1846 return 1;
1637 1847
1638 return fail || crc_fail; 1848 return 0; /* is metadata */
1639} 1849}
1640 1850
1641static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, 1851static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1642 u64 dev_bytenr, 1852 u64 dev_bytenr, char **mapped_datav,
1643 u8 *mapped_data, unsigned int len, 1853 unsigned int num_pages,
1644 struct bio *bio, 1854 struct bio *bio, int *bio_is_patched,
1645 int *bio_is_patched,
1646 struct buffer_head *bh, 1855 struct buffer_head *bh,
1647 int submit_bio_bh_rw) 1856 int submit_bio_bh_rw)
1648{ 1857{
@@ -1652,12 +1861,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1652 int ret; 1861 int ret;
1653 struct btrfsic_state *state = dev_state->state; 1862 struct btrfsic_state *state = dev_state->state;
1654 struct block_device *bdev = dev_state->bdev; 1863 struct block_device *bdev = dev_state->bdev;
1864 unsigned int processed_len;
1655 1865
1656 WARN_ON(len > PAGE_SIZE);
1657 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
1658 if (NULL != bio_is_patched) 1866 if (NULL != bio_is_patched)
1659 *bio_is_patched = 0; 1867 *bio_is_patched = 0;
1660 1868
1869again:
1870 if (num_pages == 0)
1871 return;
1872
1873 processed_len = 0;
1874 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
1875 num_pages));
1876
1661 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, 1877 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
1662 &state->block_hashtable); 1878 &state->block_hashtable);
1663 if (NULL != block) { 1879 if (NULL != block) {
@@ -1667,8 +1883,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1667 1883
1668 if (block->is_superblock) { 1884 if (block->is_superblock) {
1669 bytenr = le64_to_cpu(((struct btrfs_super_block *) 1885 bytenr = le64_to_cpu(((struct btrfs_super_block *)
1670 mapped_data)->bytenr); 1886 mapped_datav[0])->bytenr);
1887 if (num_pages * PAGE_CACHE_SIZE <
1888 BTRFS_SUPER_INFO_SIZE) {
1889 printk(KERN_INFO
1890 "btrfsic: cannot work with too short bios!\n");
1891 return;
1892 }
1671 is_metadata = 1; 1893 is_metadata = 1;
1894 BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
1895 processed_len = BTRFS_SUPER_INFO_SIZE;
1672 if (state->print_mask & 1896 if (state->print_mask &
1673 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { 1897 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
1674 printk(KERN_INFO 1898 printk(KERN_INFO
@@ -1678,12 +1902,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1678 } 1902 }
1679 if (is_metadata) { 1903 if (is_metadata) {
1680 if (!block->is_superblock) { 1904 if (!block->is_superblock) {
1905 if (num_pages * PAGE_CACHE_SIZE <
1906 state->metablock_size) {
1907 printk(KERN_INFO
1908 "btrfsic: cannot work with too short bios!\n");
1909 return;
1910 }
1911 processed_len = state->metablock_size;
1681 bytenr = le64_to_cpu(((struct btrfs_header *) 1912 bytenr = le64_to_cpu(((struct btrfs_header *)
1682 mapped_data)->bytenr); 1913 mapped_datav[0])->bytenr);
1683 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, 1914 btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
1684 dev_state, 1915 dev_state,
1685 dev_bytenr, 1916 dev_bytenr);
1686 mapped_data);
1687 } 1917 }
1688 if (block->logical_bytenr != bytenr) { 1918 if (block->logical_bytenr != bytenr) {
1689 printk(KERN_INFO 1919 printk(KERN_INFO
@@ -1710,6 +1940,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1710 block->mirror_num, 1940 block->mirror_num,
1711 btrfsic_get_block_type(state, block)); 1941 btrfsic_get_block_type(state, block));
1712 } else { 1942 } else {
1943 if (num_pages * PAGE_CACHE_SIZE <
1944 state->datablock_size) {
1945 printk(KERN_INFO
1946 "btrfsic: cannot work with too short bios!\n");
1947 return;
1948 }
1949 processed_len = state->datablock_size;
1713 bytenr = block->logical_bytenr; 1950 bytenr = block->logical_bytenr;
1714 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 1951 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1715 printk(KERN_INFO 1952 printk(KERN_INFO
@@ -1747,7 +1984,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1747 le64_to_cpu(block->disk_key.offset), 1984 le64_to_cpu(block->disk_key.offset),
1748 (unsigned long long) 1985 (unsigned long long)
1749 le64_to_cpu(((struct btrfs_header *) 1986 le64_to_cpu(((struct btrfs_header *)
1750 mapped_data)->generation), 1987 mapped_datav[0])->generation),
1751 (unsigned long long) 1988 (unsigned long long)
1752 state->max_superblock_generation); 1989 state->max_superblock_generation);
1753 btrfsic_dump_tree(state); 1990 btrfsic_dump_tree(state);
@@ -1765,10 +2002,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1765 (unsigned long long)block->generation, 2002 (unsigned long long)block->generation,
1766 (unsigned long long) 2003 (unsigned long long)
1767 le64_to_cpu(((struct btrfs_header *) 2004 le64_to_cpu(((struct btrfs_header *)
1768 mapped_data)->generation)); 2005 mapped_datav[0])->generation));
1769 /* it would not be safe to go on */ 2006 /* it would not be safe to go on */
1770 btrfsic_dump_tree(state); 2007 btrfsic_dump_tree(state);
1771 return; 2008 goto continue_loop;
1772 } 2009 }
1773 2010
1774 /* 2011 /*
@@ -1796,18 +2033,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1796 } 2033 }
1797 2034
1798 if (block->is_superblock) 2035 if (block->is_superblock)
1799 ret = btrfsic_map_superblock(state, bytenr, len, 2036 ret = btrfsic_map_superblock(state, bytenr,
2037 processed_len,
1800 bdev, &block_ctx); 2038 bdev, &block_ctx);
1801 else 2039 else
1802 ret = btrfsic_map_block(state, bytenr, len, 2040 ret = btrfsic_map_block(state, bytenr, processed_len,
1803 &block_ctx, 0); 2041 &block_ctx, 0);
1804 if (ret) { 2042 if (ret) {
1805 printk(KERN_INFO 2043 printk(KERN_INFO
1806 "btrfsic: btrfsic_map_block(root @%llu)" 2044 "btrfsic: btrfsic_map_block(root @%llu)"
1807 " failed!\n", (unsigned long long)bytenr); 2045 " failed!\n", (unsigned long long)bytenr);
1808 return; 2046 goto continue_loop;
1809 } 2047 }
1810 block_ctx.data = mapped_data; 2048 block_ctx.datav = mapped_datav;
1811 /* the following is required in case of writes to mirrors, 2049 /* the following is required in case of writes to mirrors,
1812 * use the same that was used for the lookup */ 2050 * use the same that was used for the lookup */
1813 block_ctx.dev = dev_state; 2051 block_ctx.dev = dev_state;
@@ -1863,11 +2101,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1863 block->logical_bytenr = bytenr; 2101 block->logical_bytenr = bytenr;
1864 block->is_metadata = 1; 2102 block->is_metadata = 1;
1865 if (block->is_superblock) { 2103 if (block->is_superblock) {
2104 BUG_ON(PAGE_CACHE_SIZE !=
2105 BTRFS_SUPER_INFO_SIZE);
1866 ret = btrfsic_process_written_superblock( 2106 ret = btrfsic_process_written_superblock(
1867 state, 2107 state,
1868 block, 2108 block,
1869 (struct btrfs_super_block *) 2109 (struct btrfs_super_block *)
1870 mapped_data); 2110 mapped_datav[0]);
1871 if (state->print_mask & 2111 if (state->print_mask &
1872 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { 2112 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
1873 printk(KERN_INFO 2113 printk(KERN_INFO
@@ -1880,8 +2120,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1880 state, 2120 state,
1881 block, 2121 block,
1882 &block_ctx, 2122 &block_ctx,
1883 (struct btrfs_header *)
1884 block_ctx.data,
1885 0, 0); 2123 0, 0);
1886 } 2124 }
1887 if (ret) 2125 if (ret)
@@ -1912,26 +2150,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1912 u64 bytenr; 2150 u64 bytenr;
1913 2151
1914 if (!is_metadata) { 2152 if (!is_metadata) {
2153 processed_len = state->datablock_size;
1915 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 2154 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1916 printk(KERN_INFO "Written block (%s/%llu/?)" 2155 printk(KERN_INFO "Written block (%s/%llu/?)"
1917 " !found in hash table, D.\n", 2156 " !found in hash table, D.\n",
1918 dev_state->name, 2157 dev_state->name,
1919 (unsigned long long)dev_bytenr); 2158 (unsigned long long)dev_bytenr);
1920 if (!state->include_extent_data) 2159 if (!state->include_extent_data) {
1921 return; /* ignore that written D block */ 2160 /* ignore that written D block */
2161 goto continue_loop;
2162 }
1922 2163
1923 /* this is getting ugly for the 2164 /* this is getting ugly for the
1924 * include_extent_data case... */ 2165 * include_extent_data case... */
1925 bytenr = 0; /* unknown */ 2166 bytenr = 0; /* unknown */
1926 block_ctx.start = bytenr; 2167 block_ctx.start = bytenr;
1927 block_ctx.len = len; 2168 block_ctx.len = processed_len;
1928 block_ctx.bh = NULL; 2169 block_ctx.mem_to_free = NULL;
2170 block_ctx.pagev = NULL;
1929 } else { 2171 } else {
2172 processed_len = state->metablock_size;
1930 bytenr = le64_to_cpu(((struct btrfs_header *) 2173 bytenr = le64_to_cpu(((struct btrfs_header *)
1931 mapped_data)->bytenr); 2174 mapped_datav[0])->bytenr);
1932 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, 2175 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
1933 dev_bytenr, 2176 dev_bytenr);
1934 mapped_data);
1935 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 2177 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1936 printk(KERN_INFO 2178 printk(KERN_INFO
1937 "Written block @%llu (%s/%llu/?)" 2179 "Written block @%llu (%s/%llu/?)"
@@ -1940,17 +2182,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1940 dev_state->name, 2182 dev_state->name,
1941 (unsigned long long)dev_bytenr); 2183 (unsigned long long)dev_bytenr);
1942 2184
1943 ret = btrfsic_map_block(state, bytenr, len, &block_ctx, 2185 ret = btrfsic_map_block(state, bytenr, processed_len,
1944 0); 2186 &block_ctx, 0);
1945 if (ret) { 2187 if (ret) {
1946 printk(KERN_INFO 2188 printk(KERN_INFO
1947 "btrfsic: btrfsic_map_block(root @%llu)" 2189 "btrfsic: btrfsic_map_block(root @%llu)"
1948 " failed!\n", 2190 " failed!\n",
1949 (unsigned long long)dev_bytenr); 2191 (unsigned long long)dev_bytenr);
1950 return; 2192 goto continue_loop;
1951 } 2193 }
1952 } 2194 }
1953 block_ctx.data = mapped_data; 2195 block_ctx.datav = mapped_datav;
1954 /* the following is required in case of writes to mirrors, 2196 /* the following is required in case of writes to mirrors,
1955 * use the same that was used for the lookup */ 2197 * use the same that was used for the lookup */
1956 block_ctx.dev = dev_state; 2198 block_ctx.dev = dev_state;
@@ -1960,7 +2202,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1960 if (NULL == block) { 2202 if (NULL == block) {
1961 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); 2203 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1962 btrfsic_release_block_ctx(&block_ctx); 2204 btrfsic_release_block_ctx(&block_ctx);
1963 return; 2205 goto continue_loop;
1964 } 2206 }
1965 block->dev_state = dev_state; 2207 block->dev_state = dev_state;
1966 block->dev_bytenr = dev_bytenr; 2208 block->dev_bytenr = dev_bytenr;
@@ -2020,9 +2262,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
2020 2262
2021 if (is_metadata) { 2263 if (is_metadata) {
2022 ret = btrfsic_process_metablock(state, block, 2264 ret = btrfsic_process_metablock(state, block,
2023 &block_ctx, 2265 &block_ctx, 0, 0);
2024 (struct btrfs_header *)
2025 block_ctx.data, 0, 0);
2026 if (ret) 2266 if (ret)
2027 printk(KERN_INFO 2267 printk(KERN_INFO
2028 "btrfsic: process_metablock(root @%llu)" 2268 "btrfsic: process_metablock(root @%llu)"
@@ -2031,6 +2271,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
2031 } 2271 }
2032 btrfsic_release_block_ctx(&block_ctx); 2272 btrfsic_release_block_ctx(&block_ctx);
2033 } 2273 }
2274
2275continue_loop:
2276 BUG_ON(!processed_len);
2277 dev_bytenr += processed_len;
2278 mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
2279 num_pages -= processed_len >> PAGE_CACHE_SHIFT;
2280 goto again;
2034} 2281}
2035 2282
2036static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) 2283static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
@@ -2213,7 +2460,7 @@ static int btrfsic_process_written_superblock(
2213 2460
2214 num_copies = 2461 num_copies =
2215 btrfs_num_copies(&state->root->fs_info->mapping_tree, 2462 btrfs_num_copies(&state->root->fs_info->mapping_tree,
2216 next_bytenr, PAGE_SIZE); 2463 next_bytenr, BTRFS_SUPER_INFO_SIZE);
2217 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 2464 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2218 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 2465 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
2219 (unsigned long long)next_bytenr, num_copies); 2466 (unsigned long long)next_bytenr, num_copies);
@@ -2224,7 +2471,8 @@ static int btrfsic_process_written_superblock(
2224 printk(KERN_INFO 2471 printk(KERN_INFO
2225 "btrfsic_process_written_superblock(" 2472 "btrfsic_process_written_superblock("
2226 "mirror_num=%d)\n", mirror_num); 2473 "mirror_num=%d)\n", mirror_num);
2227 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 2474 ret = btrfsic_map_block(state, next_bytenr,
2475 BTRFS_SUPER_INFO_SIZE,
2228 &tmp_next_block_ctx, 2476 &tmp_next_block_ctx,
2229 mirror_num); 2477 mirror_num);
2230 if (ret) { 2478 if (ret) {
@@ -2689,7 +2937,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
2689static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, 2937static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2690 u64 bytenr, 2938 u64 bytenr,
2691 struct btrfsic_dev_state *dev_state, 2939 struct btrfsic_dev_state *dev_state,
2692 u64 dev_bytenr, char *data) 2940 u64 dev_bytenr)
2693{ 2941{
2694 int num_copies; 2942 int num_copies;
2695 int mirror_num; 2943 int mirror_num;
@@ -2698,10 +2946,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2698 int match = 0; 2946 int match = 0;
2699 2947
2700 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, 2948 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
2701 bytenr, PAGE_SIZE); 2949 bytenr, state->metablock_size);
2702 2950
2703 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2951 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2704 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, 2952 ret = btrfsic_map_block(state, bytenr, state->metablock_size,
2705 &block_ctx, mirror_num); 2953 &block_ctx, mirror_num);
2706 if (ret) { 2954 if (ret) {
2707 printk(KERN_INFO "btrfsic:" 2955 printk(KERN_INFO "btrfsic:"
@@ -2727,7 +2975,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2727 (unsigned long long)bytenr, dev_state->name, 2975 (unsigned long long)bytenr, dev_state->name,
2728 (unsigned long long)dev_bytenr); 2976 (unsigned long long)dev_bytenr);
2729 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2977 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2730 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, 2978 ret = btrfsic_map_block(state, bytenr,
2979 state->metablock_size,
2731 &block_ctx, mirror_num); 2980 &block_ctx, mirror_num);
2732 if (ret) 2981 if (ret)
2733 continue; 2982 continue;
@@ -2781,13 +3030,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
2781 (unsigned long)bh->b_size, bh->b_data, 3030 (unsigned long)bh->b_size, bh->b_data,
2782 bh->b_bdev); 3031 bh->b_bdev);
2783 btrfsic_process_written_block(dev_state, dev_bytenr, 3032 btrfsic_process_written_block(dev_state, dev_bytenr,
2784 bh->b_data, bh->b_size, NULL, 3033 &bh->b_data, 1, NULL,
2785 NULL, bh, rw); 3034 NULL, bh, rw);
2786 } else if (NULL != dev_state && (rw & REQ_FLUSH)) { 3035 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2787 if (dev_state->state->print_mask & 3036 if (dev_state->state->print_mask &
2788 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 3037 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2789 printk(KERN_INFO 3038 printk(KERN_INFO
2790 "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", 3039 "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
2791 rw, bh->b_bdev); 3040 rw, bh->b_bdev);
2792 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { 3041 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2793 if ((dev_state->state->print_mask & 3042 if ((dev_state->state->print_mask &
@@ -2836,6 +3085,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
2836 unsigned int i; 3085 unsigned int i;
2837 u64 dev_bytenr; 3086 u64 dev_bytenr;
2838 int bio_is_patched; 3087 int bio_is_patched;
3088 char **mapped_datav;
2839 3089
2840 dev_bytenr = 512 * bio->bi_sector; 3090 dev_bytenr = 512 * bio->bi_sector;
2841 bio_is_patched = 0; 3091 bio_is_patched = 0;
@@ -2848,35 +3098,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
2848 (unsigned long long)dev_bytenr, 3098 (unsigned long long)dev_bytenr,
2849 bio->bi_bdev); 3099 bio->bi_bdev);
2850 3100
3101 mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
3102 GFP_NOFS);
3103 if (!mapped_datav)
3104 goto leave;
2851 for (i = 0; i < bio->bi_vcnt; i++) { 3105 for (i = 0; i < bio->bi_vcnt; i++) {
2852 u8 *mapped_data; 3106 BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
2853 3107 mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
2854 mapped_data = kmap(bio->bi_io_vec[i].bv_page); 3108 if (!mapped_datav[i]) {
3109 while (i > 0) {
3110 i--;
3111 kunmap(bio->bi_io_vec[i].bv_page);
3112 }
3113 kfree(mapped_datav);
3114 goto leave;
3115 }
2855 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 3116 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2856 BTRFSIC_PRINT_MASK_VERBOSE) == 3117 BTRFSIC_PRINT_MASK_VERBOSE) ==
2857 (dev_state->state->print_mask & 3118 (dev_state->state->print_mask &
2858 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 3119 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2859 BTRFSIC_PRINT_MASK_VERBOSE))) 3120 BTRFSIC_PRINT_MASK_VERBOSE)))
2860 printk(KERN_INFO 3121 printk(KERN_INFO
2861 "#%u: page=%p, mapped=%p, len=%u," 3122 "#%u: page=%p, len=%u, offset=%u\n",
2862 " offset=%u\n",
2863 i, bio->bi_io_vec[i].bv_page, 3123 i, bio->bi_io_vec[i].bv_page,
2864 mapped_data,
2865 bio->bi_io_vec[i].bv_len, 3124 bio->bi_io_vec[i].bv_len,
2866 bio->bi_io_vec[i].bv_offset); 3125 bio->bi_io_vec[i].bv_offset);
2867 btrfsic_process_written_block(dev_state, dev_bytenr, 3126 }
2868 mapped_data, 3127 btrfsic_process_written_block(dev_state, dev_bytenr,
2869 bio->bi_io_vec[i].bv_len, 3128 mapped_datav, bio->bi_vcnt,
2870 bio, &bio_is_patched, 3129 bio, &bio_is_patched,
2871 NULL, rw); 3130 NULL, rw);
3131 while (i > 0) {
3132 i--;
2872 kunmap(bio->bi_io_vec[i].bv_page); 3133 kunmap(bio->bi_io_vec[i].bv_page);
2873 dev_bytenr += bio->bi_io_vec[i].bv_len;
2874 } 3134 }
3135 kfree(mapped_datav);
2875 } else if (NULL != dev_state && (rw & REQ_FLUSH)) { 3136 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2876 if (dev_state->state->print_mask & 3137 if (dev_state->state->print_mask &
2877 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 3138 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2878 printk(KERN_INFO 3139 printk(KERN_INFO
2879 "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", 3140 "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
2880 rw, bio->bi_bdev); 3141 rw, bio->bi_bdev);
2881 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { 3142 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2882 if ((dev_state->state->print_mask & 3143 if ((dev_state->state->print_mask &
@@ -2903,6 +3164,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
2903 bio->bi_end_io = btrfsic_bio_end_io; 3164 bio->bi_end_io = btrfsic_bio_end_io;
2904 } 3165 }
2905 } 3166 }
3167leave:
2906 mutex_unlock(&btrfsic_mutex); 3168 mutex_unlock(&btrfsic_mutex);
2907 3169
2908 submit_bio(rw, bio); 3170 submit_bio(rw, bio);
@@ -2917,6 +3179,30 @@ int btrfsic_mount(struct btrfs_root *root,
2917 struct list_head *dev_head = &fs_devices->devices; 3179 struct list_head *dev_head = &fs_devices->devices;
2918 struct btrfs_device *device; 3180 struct btrfs_device *device;
2919 3181
3182 if (root->nodesize != root->leafsize) {
3183 printk(KERN_INFO
3184 "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
3185 root->nodesize, root->leafsize);
3186 return -1;
3187 }
3188 if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
3189 printk(KERN_INFO
3190 "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
3191 root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
3192 return -1;
3193 }
3194 if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
3195 printk(KERN_INFO
3196 "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
3197 root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
3198 return -1;
3199 }
3200 if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
3201 printk(KERN_INFO
3202 "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
3203 root->sectorsize, (unsigned long)PAGE_CACHE_SIZE);
3204 return -1;
3205 }
2920 state = kzalloc(sizeof(*state), GFP_NOFS); 3206 state = kzalloc(sizeof(*state), GFP_NOFS);
2921 if (NULL == state) { 3207 if (NULL == state) {
2922 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); 3208 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
@@ -2933,6 +3219,8 @@ int btrfsic_mount(struct btrfs_root *root,
2933 state->print_mask = print_mask; 3219 state->print_mask = print_mask;
2934 state->include_extent_data = including_extent_data; 3220 state->include_extent_data = including_extent_data;
2935 state->csum_size = 0; 3221 state->csum_size = 0;
3222 state->metablock_size = root->nodesize;
3223 state->datablock_size = root->sectorsize;
2936 INIT_LIST_HEAD(&state->all_blocks_list); 3224 INIT_LIST_HEAD(&state->all_blocks_list);
2937 btrfsic_block_hashtable_init(&state->block_hashtable); 3225 btrfsic_block_hashtable_init(&state->block_hashtable);
2938 btrfsic_block_link_hashtable_init(&state->block_link_hashtable); 3226 btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
@@ -3049,7 +3337,7 @@ void btrfsic_unmount(struct btrfs_root *root,
3049 btrfsic_block_link_free(l); 3337 btrfsic_block_link_free(l);
3050 } 3338 }
3051 3339
3052 if (b_all->is_iodone) 3340 if (b_all->is_iodone || b_all->never_written)
3053 btrfsic_block_free(b_all); 3341 btrfsic_block_free(b_all);
3054 else 3342 else
3055 printk(KERN_INFO "btrfs: attempt to free %c-block" 3343 printk(KERN_INFO "btrfs: attempt to free %c-block"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4106264fbc65..d7a96cfdc50a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/rbtree.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "transaction.h" 24#include "transaction.h"
@@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
37 struct extent_buffer *dst_buf, 38 struct extent_buffer *dst_buf,
38 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
39static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
40 struct btrfs_path *path, int level, int slot); 41 struct btrfs_path *path, int level, int slot,
42 int tree_mod_log);
43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 struct extent_buffer *eb);
45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
46 u32 blocksize, u64 parent_transid,
47 u64 time_seq);
48struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
49 u64 bytenr, u32 blocksize,
50 u64 time_seq);
41 51
42struct btrfs_path *btrfs_alloc_path(void) 52struct btrfs_path *btrfs_alloc_path(void)
43{ 53{
@@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
255 265
256 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 266 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
257 new_root_objectid, &disk_key, level, 267 new_root_objectid, &disk_key, level,
258 buf->start, 0, 1); 268 buf->start, 0);
259 if (IS_ERR(cow)) 269 if (IS_ERR(cow))
260 return PTR_ERR(cow); 270 return PTR_ERR(cow);
261 271
@@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
288 return 0; 298 return 0;
289} 299}
290 300
301enum mod_log_op {
302 MOD_LOG_KEY_REPLACE,
303 MOD_LOG_KEY_ADD,
304 MOD_LOG_KEY_REMOVE,
305 MOD_LOG_KEY_REMOVE_WHILE_FREEING,
306 MOD_LOG_KEY_REMOVE_WHILE_MOVING,
307 MOD_LOG_MOVE_KEYS,
308 MOD_LOG_ROOT_REPLACE,
309};
310
311struct tree_mod_move {
312 int dst_slot;
313 int nr_items;
314};
315
316struct tree_mod_root {
317 u64 logical;
318 u8 level;
319};
320
321struct tree_mod_elem {
322 struct rb_node node;
323 u64 index; /* shifted logical */
324 struct seq_list elem;
325 enum mod_log_op op;
326
327 /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
328 int slot;
329
330 /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
331 u64 generation;
332
333 /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
334 struct btrfs_disk_key key;
335 u64 blockptr;
336
337 /* this is used for op == MOD_LOG_MOVE_KEYS */
338 struct tree_mod_move move;
339
340 /* this is used for op == MOD_LOG_ROOT_REPLACE */
341 struct tree_mod_root old_root;
342};
343
344static inline void
345__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
346{
347 elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
348 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
349}
350
351void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
352 struct seq_list *elem)
353{
354 elem->flags = 1;
355 spin_lock(&fs_info->tree_mod_seq_lock);
356 __get_tree_mod_seq(fs_info, elem);
357 spin_unlock(&fs_info->tree_mod_seq_lock);
358}
359
360void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
361 struct seq_list *elem)
362{
363 struct rb_root *tm_root;
364 struct rb_node *node;
365 struct rb_node *next;
366 struct seq_list *cur_elem;
367 struct tree_mod_elem *tm;
368 u64 min_seq = (u64)-1;
369 u64 seq_putting = elem->seq;
370
371 if (!seq_putting)
372 return;
373
374 BUG_ON(!(elem->flags & 1));
375 spin_lock(&fs_info->tree_mod_seq_lock);
376 list_del(&elem->list);
377
378 list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
379 if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
380 if (seq_putting > cur_elem->seq) {
381 /*
382 * blocker with lower sequence number exists, we
383 * cannot remove anything from the log
384 */
385 goto out;
386 }
387 min_seq = cur_elem->seq;
388 }
389 }
390
391 /*
392 * anything that's lower than the lowest existing (read: blocked)
393 * sequence number can be removed from the tree.
394 */
395 write_lock(&fs_info->tree_mod_log_lock);
396 tm_root = &fs_info->tree_mod_log;
397 for (node = rb_first(tm_root); node; node = next) {
398 next = rb_next(node);
399 tm = container_of(node, struct tree_mod_elem, node);
400 if (tm->elem.seq > min_seq)
401 continue;
402 rb_erase(node, tm_root);
403 list_del(&tm->elem.list);
404 kfree(tm);
405 }
406 write_unlock(&fs_info->tree_mod_log_lock);
407out:
408 spin_unlock(&fs_info->tree_mod_seq_lock);
409}
410
411/*
412 * key order of the log:
413 * index -> sequence
414 *
415 * the index is the shifted logical of the *new* root node for root replace
416 * operations, or the shifted logical of the affected block for all other
417 * operations.
418 */
419static noinline int
420__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
421{
422 struct rb_root *tm_root;
423 struct rb_node **new;
424 struct rb_node *parent = NULL;
425 struct tree_mod_elem *cur;
426 int ret = 0;
427
428 BUG_ON(!tm || !tm->elem.seq);
429
430 write_lock(&fs_info->tree_mod_log_lock);
431 tm_root = &fs_info->tree_mod_log;
432 new = &tm_root->rb_node;
433 while (*new) {
434 cur = container_of(*new, struct tree_mod_elem, node);
435 parent = *new;
436 if (cur->index < tm->index)
437 new = &((*new)->rb_left);
438 else if (cur->index > tm->index)
439 new = &((*new)->rb_right);
440 else if (cur->elem.seq < tm->elem.seq)
441 new = &((*new)->rb_left);
442 else if (cur->elem.seq > tm->elem.seq)
443 new = &((*new)->rb_right);
444 else {
445 kfree(tm);
446 ret = -EEXIST;
447 goto unlock;
448 }
449 }
450
451 rb_link_node(&tm->node, parent, new);
452 rb_insert_color(&tm->node, tm_root);
453unlock:
454 write_unlock(&fs_info->tree_mod_log_lock);
455 return ret;
456}
457
458static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
459 struct extent_buffer *eb) {
460 smp_mb();
461 if (list_empty(&(fs_info)->tree_mod_seq_list))
462 return 1;
463 if (!eb)
464 return 0;
465 if (btrfs_header_level(eb) == 0)
466 return 1;
467 return 0;
468}
469
470static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
471 struct tree_mod_elem **tm_ret)
472{
473 struct tree_mod_elem *tm;
474 int seq;
475
476 if (tree_mod_dont_log(fs_info, NULL))
477 return 0;
478
479 tm = *tm_ret = kzalloc(sizeof(*tm), flags);
480 if (!tm)
481 return -ENOMEM;
482
483 tm->elem.flags = 0;
484 spin_lock(&fs_info->tree_mod_seq_lock);
485 if (list_empty(&fs_info->tree_mod_seq_list)) {
486 /*
487 * someone emptied the list while we were waiting for the lock.
488 * we must not add to the list, because no blocker exists. items
489 * are removed from the list only when the existing blocker is
490 * removed from the list.
491 */
492 kfree(tm);
493 seq = 0;
494 } else {
495 __get_tree_mod_seq(fs_info, &tm->elem);
496 seq = tm->elem.seq;
497 }
498 spin_unlock(&fs_info->tree_mod_seq_lock);
499
500 return seq;
501}
502
503static noinline int
504tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
505 struct extent_buffer *eb, int slot,
506 enum mod_log_op op, gfp_t flags)
507{
508 struct tree_mod_elem *tm;
509 int ret;
510
511 ret = tree_mod_alloc(fs_info, flags, &tm);
512 if (ret <= 0)
513 return ret;
514
515 tm->index = eb->start >> PAGE_CACHE_SHIFT;
516 if (op != MOD_LOG_KEY_ADD) {
517 btrfs_node_key(eb, &tm->key, slot);
518 tm->blockptr = btrfs_node_blockptr(eb, slot);
519 }
520 tm->op = op;
521 tm->slot = slot;
522 tm->generation = btrfs_node_ptr_generation(eb, slot);
523
524 return __tree_mod_log_insert(fs_info, tm);
525}
526
527static noinline int
528tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
529 int slot, enum mod_log_op op)
530{
531 return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
532}
533
534static noinline int
535tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
536 struct extent_buffer *eb, int dst_slot, int src_slot,
537 int nr_items, gfp_t flags)
538{
539 struct tree_mod_elem *tm;
540 int ret;
541 int i;
542
543 if (tree_mod_dont_log(fs_info, eb))
544 return 0;
545
546 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
547 ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
548 MOD_LOG_KEY_REMOVE_WHILE_MOVING);
549 BUG_ON(ret < 0);
550 }
551
552 ret = tree_mod_alloc(fs_info, flags, &tm);
553 if (ret <= 0)
554 return ret;
555
556 tm->index = eb->start >> PAGE_CACHE_SHIFT;
557 tm->slot = src_slot;
558 tm->move.dst_slot = dst_slot;
559 tm->move.nr_items = nr_items;
560 tm->op = MOD_LOG_MOVE_KEYS;
561
562 return __tree_mod_log_insert(fs_info, tm);
563}
564
565static noinline int
566tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
567 struct extent_buffer *old_root,
568 struct extent_buffer *new_root, gfp_t flags)
569{
570 struct tree_mod_elem *tm;
571 int ret;
572
573 ret = tree_mod_alloc(fs_info, flags, &tm);
574 if (ret <= 0)
575 return ret;
576
577 tm->index = new_root->start >> PAGE_CACHE_SHIFT;
578 tm->old_root.logical = old_root->start;
579 tm->old_root.level = btrfs_header_level(old_root);
580 tm->generation = btrfs_header_generation(old_root);
581 tm->op = MOD_LOG_ROOT_REPLACE;
582
583 return __tree_mod_log_insert(fs_info, tm);
584}
585
586static struct tree_mod_elem *
587__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
588 int smallest)
589{
590 struct rb_root *tm_root;
591 struct rb_node *node;
592 struct tree_mod_elem *cur = NULL;
593 struct tree_mod_elem *found = NULL;
594 u64 index = start >> PAGE_CACHE_SHIFT;
595
596 read_lock(&fs_info->tree_mod_log_lock);
597 tm_root = &fs_info->tree_mod_log;
598 node = tm_root->rb_node;
599 while (node) {
600 cur = container_of(node, struct tree_mod_elem, node);
601 if (cur->index < index) {
602 node = node->rb_left;
603 } else if (cur->index > index) {
604 node = node->rb_right;
605 } else if (cur->elem.seq < min_seq) {
606 node = node->rb_left;
607 } else if (!smallest) {
608 /* we want the node with the highest seq */
609 if (found)
610 BUG_ON(found->elem.seq > cur->elem.seq);
611 found = cur;
612 node = node->rb_left;
613 } else if (cur->elem.seq > min_seq) {
614 /* we want the node with the smallest seq */
615 if (found)
616 BUG_ON(found->elem.seq < cur->elem.seq);
617 found = cur;
618 node = node->rb_right;
619 } else {
620 found = cur;
621 break;
622 }
623 }
624 read_unlock(&fs_info->tree_mod_log_lock);
625
626 return found;
627}
628
629/*
630 * this returns the element from the log with the smallest time sequence
631 * value that's in the log (the oldest log item). any element with a time
632 * sequence lower than min_seq will be ignored.
633 */
634static struct tree_mod_elem *
635tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
636 u64 min_seq)
637{
638 return __tree_mod_log_search(fs_info, start, min_seq, 1);
639}
640
641/*
642 * this returns the element from the log with the largest time sequence
643 * value that's in the log (the most recent log item). any element with
644 * a time sequence lower than min_seq will be ignored.
645 */
646static struct tree_mod_elem *
647tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
648{
649 return __tree_mod_log_search(fs_info, start, min_seq, 0);
650}
651
652static inline void
653tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
654 struct extent_buffer *src, unsigned long dst_offset,
655 unsigned long src_offset, int nr_items)
656{
657 int ret;
658 int i;
659
660 if (tree_mod_dont_log(fs_info, NULL))
661 return;
662
663 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
664 return;
665
666 /* speed this up by single seq for all operations? */
667 for (i = 0; i < nr_items; i++) {
668 ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
669 MOD_LOG_KEY_REMOVE);
670 BUG_ON(ret < 0);
671 ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
672 MOD_LOG_KEY_ADD);
673 BUG_ON(ret < 0);
674 }
675}
676
677static inline void
678tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
679 int dst_offset, int src_offset, int nr_items)
680{
681 int ret;
682 ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
683 nr_items, GFP_NOFS);
684 BUG_ON(ret < 0);
685}
686
687static inline void
688tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
689 struct extent_buffer *eb,
690 struct btrfs_disk_key *disk_key, int slot, int atomic)
691{
692 int ret;
693
694 ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
695 MOD_LOG_KEY_REPLACE,
696 atomic ? GFP_ATOMIC : GFP_NOFS);
697 BUG_ON(ret < 0);
698}
699
700static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
701 struct extent_buffer *eb)
702{
703 int i;
704 int ret;
705 u32 nritems;
706
707 if (tree_mod_dont_log(fs_info, eb))
708 return;
709
710 nritems = btrfs_header_nritems(eb);
711 for (i = nritems - 1; i >= 0; i--) {
712 ret = tree_mod_log_insert_key(fs_info, eb, i,
713 MOD_LOG_KEY_REMOVE_WHILE_FREEING);
714 BUG_ON(ret < 0);
715 }
716}
717
718static inline void
719tree_mod_log_set_root_pointer(struct btrfs_root *root,
720 struct extent_buffer *new_root_node)
721{
722 int ret;
723 tree_mod_log_free_eb(root->fs_info, root->node);
724 ret = tree_mod_log_insert_root(root->fs_info, root->node,
725 new_root_node, GFP_NOFS);
726 BUG_ON(ret < 0);
727}
728
291/* 729/*
292 * check if the tree block can be shared by multiple trees 730 * check if the tree block can be shared by multiple trees
293 */ 731 */
@@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
409 ret = btrfs_dec_ref(trans, root, buf, 1, 1); 847 ret = btrfs_dec_ref(trans, root, buf, 1, 1);
410 BUG_ON(ret); /* -ENOMEM */ 848 BUG_ON(ret); /* -ENOMEM */
411 } 849 }
850 /*
851 * don't log freeing in case we're freeing the root node, this
852 * is done by tree_mod_log_set_root_pointer later
853 */
854 if (buf != root->node && btrfs_header_level(buf) != 0)
855 tree_mod_log_free_eb(root->fs_info, buf);
412 clean_tree_block(trans, root, buf); 856 clean_tree_block(trans, root, buf);
413 *last_ref = 1; 857 *last_ref = 1;
414 } 858 }
@@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
467 911
468 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 912 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
469 root->root_key.objectid, &disk_key, 913 root->root_key.objectid, &disk_key,
470 level, search_start, empty_size, 1); 914 level, search_start, empty_size);
471 if (IS_ERR(cow)) 915 if (IS_ERR(cow))
472 return PTR_ERR(cow); 916 return PTR_ERR(cow);
473 917
@@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
506 parent_start = 0; 950 parent_start = 0;
507 951
508 extent_buffer_get(cow); 952 extent_buffer_get(cow);
953 tree_mod_log_set_root_pointer(root, cow);
509 rcu_assign_pointer(root->node, cow); 954 rcu_assign_pointer(root->node, cow);
510 955
511 btrfs_free_tree_block(trans, root, buf, parent_start, 956 btrfs_free_tree_block(trans, root, buf, parent_start,
512 last_ref, 1); 957 last_ref);
513 free_extent_buffer(buf); 958 free_extent_buffer(buf);
514 add_root_to_dirty_list(root); 959 add_root_to_dirty_list(root);
515 } else { 960 } else {
@@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
519 parent_start = 0; 964 parent_start = 0;
520 965
521 WARN_ON(trans->transid != btrfs_header_generation(parent)); 966 WARN_ON(trans->transid != btrfs_header_generation(parent));
967 tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
968 MOD_LOG_KEY_REPLACE);
522 btrfs_set_node_blockptr(parent, parent_slot, 969 btrfs_set_node_blockptr(parent, parent_slot,
523 cow->start); 970 cow->start);
524 btrfs_set_node_ptr_generation(parent, parent_slot, 971 btrfs_set_node_ptr_generation(parent, parent_slot,
525 trans->transid); 972 trans->transid);
526 btrfs_mark_buffer_dirty(parent); 973 btrfs_mark_buffer_dirty(parent);
527 btrfs_free_tree_block(trans, root, buf, parent_start, 974 btrfs_free_tree_block(trans, root, buf, parent_start,
528 last_ref, 1); 975 last_ref);
529 } 976 }
530 if (unlock_orig) 977 if (unlock_orig)
531 btrfs_tree_unlock(buf); 978 btrfs_tree_unlock(buf);
@@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
535 return 0; 982 return 0;
536} 983}
537 984
985/*
986 * returns the logical address of the oldest predecessor of the given root.
987 * entries older than time_seq are ignored.
988 */
989static struct tree_mod_elem *
990__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
991 struct btrfs_root *root, u64 time_seq)
992{
993 struct tree_mod_elem *tm;
994 struct tree_mod_elem *found = NULL;
995 u64 root_logical = root->node->start;
996 int looped = 0;
997
998 if (!time_seq)
999 return 0;
1000
1001 /*
1002 * the very last operation that's logged for a root is the replacement
1003 * operation (if it is replaced at all). this has the index of the *new*
1004 * root, making it the very first operation that's logged for this root.
1005 */
1006 while (1) {
1007 tm = tree_mod_log_search_oldest(fs_info, root_logical,
1008 time_seq);
1009 if (!looped && !tm)
1010 return 0;
1011 /*
1012 * we must have key remove operations in the log before the
1013 * replace operation.
1014 */
1015 BUG_ON(!tm);
1016
1017 if (tm->op != MOD_LOG_ROOT_REPLACE)
1018 break;
1019
1020 found = tm;
1021 root_logical = tm->old_root.logical;
1022 BUG_ON(root_logical == root->node->start);
1023 looped = 1;
1024 }
1025
1026 return found;
1027}
1028
1029/*
1030 * tm is a pointer to the first operation to rewind within eb. then, all
1031 * previous operations will be rewinded (until we reach something older than
1032 * time_seq).
1033 */
1034static void
1035__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1036 struct tree_mod_elem *first_tm)
1037{
1038 u32 n;
1039 struct rb_node *next;
1040 struct tree_mod_elem *tm = first_tm;
1041 unsigned long o_dst;
1042 unsigned long o_src;
1043 unsigned long p_size = sizeof(struct btrfs_key_ptr);
1044
1045 n = btrfs_header_nritems(eb);
1046 while (tm && tm->elem.seq >= time_seq) {
1047 /*
1048 * all the operations are recorded with the operator used for
1049 * the modification. as we're going backwards, we do the
1050 * opposite of each operation here.
1051 */
1052 switch (tm->op) {
1053 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1054 BUG_ON(tm->slot < n);
1055 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1056 case MOD_LOG_KEY_REMOVE:
1057 btrfs_set_node_key(eb, &tm->key, tm->slot);
1058 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1059 btrfs_set_node_ptr_generation(eb, tm->slot,
1060 tm->generation);
1061 n++;
1062 break;
1063 case MOD_LOG_KEY_REPLACE:
1064 BUG_ON(tm->slot >= n);
1065 btrfs_set_node_key(eb, &tm->key, tm->slot);
1066 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1067 btrfs_set_node_ptr_generation(eb, tm->slot,
1068 tm->generation);
1069 break;
1070 case MOD_LOG_KEY_ADD:
1071 if (tm->slot != n - 1) {
1072 o_dst = btrfs_node_key_ptr_offset(tm->slot);
1073 o_src = btrfs_node_key_ptr_offset(tm->slot + 1);
1074 memmove_extent_buffer(eb, o_dst, o_src, p_size);
1075 }
1076 n--;
1077 break;
1078 case MOD_LOG_MOVE_KEYS:
1079 o_dst = btrfs_node_key_ptr_offset(tm->slot);
1080 o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1081 memmove_extent_buffer(eb, o_dst, o_src,
1082 tm->move.nr_items * p_size);
1083 break;
1084 case MOD_LOG_ROOT_REPLACE:
1085 /*
1086 * this operation is special. for roots, this must be
1087 * handled explicitly before rewinding.
1088 * for non-roots, this operation may exist if the node
1089 * was a root: root A -> child B; then A gets empty and
1090 * B is promoted to the new root. in the mod log, we'll
1091 * have a root-replace operation for B, a tree block
1092 * that is no root. we simply ignore that operation.
1093 */
1094 break;
1095 }
1096 next = rb_next(&tm->node);
1097 if (!next)
1098 break;
1099 tm = container_of(next, struct tree_mod_elem, node);
1100 if (tm->index != first_tm->index)
1101 break;
1102 }
1103 btrfs_set_header_nritems(eb, n);
1104}
1105
1106static struct extent_buffer *
1107tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1108 u64 time_seq)
1109{
1110 struct extent_buffer *eb_rewin;
1111 struct tree_mod_elem *tm;
1112
1113 if (!time_seq)
1114 return eb;
1115
1116 if (btrfs_header_level(eb) == 0)
1117 return eb;
1118
1119 tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1120 if (!tm)
1121 return eb;
1122
1123 if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1124 BUG_ON(tm->slot != 0);
1125 eb_rewin = alloc_dummy_extent_buffer(eb->start,
1126 fs_info->tree_root->nodesize);
1127 BUG_ON(!eb_rewin);
1128 btrfs_set_header_bytenr(eb_rewin, eb->start);
1129 btrfs_set_header_backref_rev(eb_rewin,
1130 btrfs_header_backref_rev(eb));
1131 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1132 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1133 } else {
1134 eb_rewin = btrfs_clone_extent_buffer(eb);
1135 BUG_ON(!eb_rewin);
1136 }
1137
1138 extent_buffer_get(eb_rewin);
1139 free_extent_buffer(eb);
1140
1141 __tree_mod_log_rewind(eb_rewin, time_seq, tm);
1142
1143 return eb_rewin;
1144}
1145
1146static inline struct extent_buffer *
1147get_old_root(struct btrfs_root *root, u64 time_seq)
1148{
1149 struct tree_mod_elem *tm;
1150 struct extent_buffer *eb;
1151 struct tree_mod_root *old_root;
1152 u64 old_generation;
1153
1154 tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
1155 if (!tm)
1156 return root->node;
1157
1158 old_root = &tm->old_root;
1159 old_generation = tm->generation;
1160
1161 tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq);
1162 /*
1163 * there was an item in the log when __tree_mod_log_oldest_root
1164 * returned. this one must not go away, because the time_seq passed to
1165 * us must be blocking its removal.
1166 */
1167 BUG_ON(!tm);
1168
1169 if (old_root->logical == root->node->start) {
1170 /* there are logged operations for the current root */
1171 eb = btrfs_clone_extent_buffer(root->node);
1172 } else {
1173 /* there's a root replace operation for the current root */
1174 eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT,
1175 root->nodesize);
1176 btrfs_set_header_bytenr(eb, eb->start);
1177 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1178 btrfs_set_header_owner(eb, root->root_key.objectid);
1179 }
1180 if (!eb)
1181 return NULL;
1182 btrfs_set_header_level(eb, old_root->level);
1183 btrfs_set_header_generation(eb, old_generation);
1184 __tree_mod_log_rewind(eb, time_seq, tm);
1185
1186 return eb;
1187}
1188
538static inline int should_cow_block(struct btrfs_trans_handle *trans, 1189static inline int should_cow_block(struct btrfs_trans_handle *trans,
539 struct btrfs_root *root, 1190 struct btrfs_root *root,
540 struct extent_buffer *buf) 1191 struct extent_buffer *buf)
@@ -739,7 +1390,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
739 if (!cur) 1390 if (!cur)
740 return -EIO; 1391 return -EIO;
741 } else if (!uptodate) { 1392 } else if (!uptodate) {
742 btrfs_read_buffer(cur, gen); 1393 err = btrfs_read_buffer(cur, gen);
1394 if (err) {
1395 free_extent_buffer(cur);
1396 return err;
1397 }
743 } 1398 }
744 } 1399 }
745 if (search_start == 0) 1400 if (search_start == 0)
@@ -854,20 +1509,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
854static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, 1509static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
855 int level, int *slot) 1510 int level, int *slot)
856{ 1511{
857 if (level == 0) { 1512 if (level == 0)
858 return generic_bin_search(eb, 1513 return generic_bin_search(eb,
859 offsetof(struct btrfs_leaf, items), 1514 offsetof(struct btrfs_leaf, items),
860 sizeof(struct btrfs_item), 1515 sizeof(struct btrfs_item),
861 key, btrfs_header_nritems(eb), 1516 key, btrfs_header_nritems(eb),
862 slot); 1517 slot);
863 } else { 1518 else
864 return generic_bin_search(eb, 1519 return generic_bin_search(eb,
865 offsetof(struct btrfs_node, ptrs), 1520 offsetof(struct btrfs_node, ptrs),
866 sizeof(struct btrfs_key_ptr), 1521 sizeof(struct btrfs_key_ptr),
867 key, btrfs_header_nritems(eb), 1522 key, btrfs_header_nritems(eb),
868 slot); 1523 slot);
869 }
870 return -1;
871} 1524}
872 1525
873int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 1526int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -974,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
974 goto enospc; 1627 goto enospc;
975 } 1628 }
976 1629
1630 tree_mod_log_set_root_pointer(root, child);
977 rcu_assign_pointer(root->node, child); 1631 rcu_assign_pointer(root->node, child);
978 1632
979 add_root_to_dirty_list(root); 1633 add_root_to_dirty_list(root);
@@ -987,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
987 free_extent_buffer(mid); 1641 free_extent_buffer(mid);
988 1642
989 root_sub_used(root, mid->len); 1643 root_sub_used(root, mid->len);
990 btrfs_free_tree_block(trans, root, mid, 0, 1, 0); 1644 btrfs_free_tree_block(trans, root, mid, 0, 1);
991 /* once for the root ptr */ 1645 /* once for the root ptr */
992 free_extent_buffer_stale(mid); 1646 free_extent_buffer_stale(mid);
993 return 0; 1647 return 0;
@@ -1040,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1040 if (btrfs_header_nritems(right) == 0) { 1694 if (btrfs_header_nritems(right) == 0) {
1041 clean_tree_block(trans, root, right); 1695 clean_tree_block(trans, root, right);
1042 btrfs_tree_unlock(right); 1696 btrfs_tree_unlock(right);
1043 del_ptr(trans, root, path, level + 1, pslot + 1); 1697 del_ptr(trans, root, path, level + 1, pslot + 1, 1);
1044 root_sub_used(root, right->len); 1698 root_sub_used(root, right->len);
1045 btrfs_free_tree_block(trans, root, right, 0, 1, 0); 1699 btrfs_free_tree_block(trans, root, right, 0, 1);
1046 free_extent_buffer_stale(right); 1700 free_extent_buffer_stale(right);
1047 right = NULL; 1701 right = NULL;
1048 } else { 1702 } else {
1049 struct btrfs_disk_key right_key; 1703 struct btrfs_disk_key right_key;
1050 btrfs_node_key(right, &right_key, 0); 1704 btrfs_node_key(right, &right_key, 0);
1705 tree_mod_log_set_node_key(root->fs_info, parent,
1706 &right_key, pslot + 1, 0);
1051 btrfs_set_node_key(parent, &right_key, pslot + 1); 1707 btrfs_set_node_key(parent, &right_key, pslot + 1);
1052 btrfs_mark_buffer_dirty(parent); 1708 btrfs_mark_buffer_dirty(parent);
1053 } 1709 }
@@ -1082,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1082 if (btrfs_header_nritems(mid) == 0) { 1738 if (btrfs_header_nritems(mid) == 0) {
1083 clean_tree_block(trans, root, mid); 1739 clean_tree_block(trans, root, mid);
1084 btrfs_tree_unlock(mid); 1740 btrfs_tree_unlock(mid);
1085 del_ptr(trans, root, path, level + 1, pslot); 1741 del_ptr(trans, root, path, level + 1, pslot, 1);
1086 root_sub_used(root, mid->len); 1742 root_sub_used(root, mid->len);
1087 btrfs_free_tree_block(trans, root, mid, 0, 1, 0); 1743 btrfs_free_tree_block(trans, root, mid, 0, 1);
1088 free_extent_buffer_stale(mid); 1744 free_extent_buffer_stale(mid);
1089 mid = NULL; 1745 mid = NULL;
1090 } else { 1746 } else {
1091 /* update the parent key to reflect our changes */ 1747 /* update the parent key to reflect our changes */
1092 struct btrfs_disk_key mid_key; 1748 struct btrfs_disk_key mid_key;
1093 btrfs_node_key(mid, &mid_key, 0); 1749 btrfs_node_key(mid, &mid_key, 0);
1750 tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
1751 pslot, 0);
1094 btrfs_set_node_key(parent, &mid_key, pslot); 1752 btrfs_set_node_key(parent, &mid_key, pslot);
1095 btrfs_mark_buffer_dirty(parent); 1753 btrfs_mark_buffer_dirty(parent);
1096 } 1754 }
@@ -1188,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1188 struct btrfs_disk_key disk_key; 1846 struct btrfs_disk_key disk_key;
1189 orig_slot += left_nr; 1847 orig_slot += left_nr;
1190 btrfs_node_key(mid, &disk_key, 0); 1848 btrfs_node_key(mid, &disk_key, 0);
1849 tree_mod_log_set_node_key(root->fs_info, parent,
1850 &disk_key, pslot, 0);
1191 btrfs_set_node_key(parent, &disk_key, pslot); 1851 btrfs_set_node_key(parent, &disk_key, pslot);
1192 btrfs_mark_buffer_dirty(parent); 1852 btrfs_mark_buffer_dirty(parent);
1193 if (btrfs_header_nritems(left) > orig_slot) { 1853 if (btrfs_header_nritems(left) > orig_slot) {
@@ -1239,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1239 struct btrfs_disk_key disk_key; 1899 struct btrfs_disk_key disk_key;
1240 1900
1241 btrfs_node_key(right, &disk_key, 0); 1901 btrfs_node_key(right, &disk_key, 0);
1902 tree_mod_log_set_node_key(root->fs_info, parent,
1903 &disk_key, pslot + 1, 0);
1242 btrfs_set_node_key(parent, &disk_key, pslot + 1); 1904 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1243 btrfs_mark_buffer_dirty(parent); 1905 btrfs_mark_buffer_dirty(parent);
1244 1906
@@ -1496,7 +2158,7 @@ static int
1496read_block_for_search(struct btrfs_trans_handle *trans, 2158read_block_for_search(struct btrfs_trans_handle *trans,
1497 struct btrfs_root *root, struct btrfs_path *p, 2159 struct btrfs_root *root, struct btrfs_path *p,
1498 struct extent_buffer **eb_ret, int level, int slot, 2160 struct extent_buffer **eb_ret, int level, int slot,
1499 struct btrfs_key *key) 2161 struct btrfs_key *key, u64 time_seq)
1500{ 2162{
1501 u64 blocknr; 2163 u64 blocknr;
1502 u64 gen; 2164 u64 gen;
@@ -1850,7 +2512,7 @@ cow_done:
1850 } 2512 }
1851 2513
1852 err = read_block_for_search(trans, root, p, 2514 err = read_block_for_search(trans, root, p,
1853 &b, level, slot, key); 2515 &b, level, slot, key, 0);
1854 if (err == -EAGAIN) 2516 if (err == -EAGAIN)
1855 goto again; 2517 goto again;
1856 if (err) { 2518 if (err) {
@@ -1922,6 +2584,115 @@ done:
1922} 2584}
1923 2585
1924/* 2586/*
2587 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2588 * current state of the tree together with the operations recorded in the tree
2589 * modification log to search for the key in a previous version of this tree, as
2590 * denoted by the time_seq parameter.
2591 *
2592 * Naturally, there is no support for insert, delete or cow operations.
2593 *
2594 * The resulting path and return value will be set up as if we called
2595 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2596 */
2597int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2598 struct btrfs_path *p, u64 time_seq)
2599{
2600 struct extent_buffer *b;
2601 int slot;
2602 int ret;
2603 int err;
2604 int level;
2605 int lowest_unlock = 1;
2606 u8 lowest_level = 0;
2607
2608 lowest_level = p->lowest_level;
2609 WARN_ON(p->nodes[0] != NULL);
2610
2611 if (p->search_commit_root) {
2612 BUG_ON(time_seq);
2613 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2614 }
2615
2616again:
2617 b = get_old_root(root, time_seq);
2618 extent_buffer_get(b);
2619 level = btrfs_header_level(b);
2620 btrfs_tree_read_lock(b);
2621 p->locks[level] = BTRFS_READ_LOCK;
2622
2623 while (b) {
2624 level = btrfs_header_level(b);
2625 p->nodes[level] = b;
2626 btrfs_clear_path_blocking(p, NULL, 0);
2627
2628 /*
2629 * we have a lock on b and as long as we aren't changing
2630 * the tree, there is no way to for the items in b to change.
2631 * It is safe to drop the lock on our parent before we
2632 * go through the expensive btree search on b.
2633 */
2634 btrfs_unlock_up_safe(p, level + 1);
2635
2636 ret = bin_search(b, key, level, &slot);
2637
2638 if (level != 0) {
2639 int dec = 0;
2640 if (ret && slot > 0) {
2641 dec = 1;
2642 slot -= 1;
2643 }
2644 p->slots[level] = slot;
2645 unlock_up(p, level, lowest_unlock, 0, NULL);
2646
2647 if (level == lowest_level) {
2648 if (dec)
2649 p->slots[level]++;
2650 goto done;
2651 }
2652
2653 err = read_block_for_search(NULL, root, p, &b, level,
2654 slot, key, time_seq);
2655 if (err == -EAGAIN)
2656 goto again;
2657 if (err) {
2658 ret = err;
2659 goto done;
2660 }
2661
2662 level = btrfs_header_level(b);
2663 err = btrfs_try_tree_read_lock(b);
2664 if (!err) {
2665 btrfs_set_path_blocking(p);
2666 btrfs_tree_read_lock(b);
2667 btrfs_clear_path_blocking(p, b,
2668 BTRFS_READ_LOCK);
2669 }
2670 p->locks[level] = BTRFS_READ_LOCK;
2671 p->nodes[level] = b;
2672 b = tree_mod_log_rewind(root->fs_info, b, time_seq);
2673 if (b != p->nodes[level]) {
2674 btrfs_tree_unlock_rw(p->nodes[level],
2675 p->locks[level]);
2676 p->locks[level] = 0;
2677 p->nodes[level] = b;
2678 }
2679 } else {
2680 p->slots[level] = slot;
2681 unlock_up(p, level, lowest_unlock, 0, NULL);
2682 goto done;
2683 }
2684 }
2685 ret = 1;
2686done:
2687 if (!p->leave_spinning)
2688 btrfs_set_path_blocking(p);
2689 if (ret < 0)
2690 btrfs_release_path(p);
2691
2692 return ret;
2693}
2694
2695/*
1925 * adjust the pointers going up the tree, starting at level 2696 * adjust the pointers going up the tree, starting at level
1926 * making sure the right key of each node is points to 'key'. 2697 * making sure the right key of each node is points to 'key'.
1927 * This is used after shifting pointers to the left, so it stops 2698 * This is used after shifting pointers to the left, so it stops
@@ -1941,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
1941 if (!path->nodes[i]) 2712 if (!path->nodes[i])
1942 break; 2713 break;
1943 t = path->nodes[i]; 2714 t = path->nodes[i];
2715 tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
1944 btrfs_set_node_key(t, key, tslot); 2716 btrfs_set_node_key(t, key, tslot);
1945 btrfs_mark_buffer_dirty(path->nodes[i]); 2717 btrfs_mark_buffer_dirty(path->nodes[i]);
1946 if (tslot != 0) 2718 if (tslot != 0)
@@ -2023,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,
2023 } else 2795 } else
2024 push_items = min(src_nritems - 8, push_items); 2796 push_items = min(src_nritems - 8, push_items);
2025 2797
2798 tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
2799 push_items);
2026 copy_extent_buffer(dst, src, 2800 copy_extent_buffer(dst, src,
2027 btrfs_node_key_ptr_offset(dst_nritems), 2801 btrfs_node_key_ptr_offset(dst_nritems),
2028 btrfs_node_key_ptr_offset(0), 2802 btrfs_node_key_ptr_offset(0),
2029 push_items * sizeof(struct btrfs_key_ptr)); 2803 push_items * sizeof(struct btrfs_key_ptr));
2030 2804
2031 if (push_items < src_nritems) { 2805 if (push_items < src_nritems) {
2806 tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
2807 src_nritems - push_items);
2032 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 2808 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2033 btrfs_node_key_ptr_offset(push_items), 2809 btrfs_node_key_ptr_offset(push_items),
2034 (src_nritems - push_items) * 2810 (src_nritems - push_items) *
@@ -2082,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
2082 if (max_push < push_items) 2858 if (max_push < push_items)
2083 push_items = max_push; 2859 push_items = max_push;
2084 2860
2861 tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
2085 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 2862 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2086 btrfs_node_key_ptr_offset(0), 2863 btrfs_node_key_ptr_offset(0),
2087 (dst_nritems) * 2864 (dst_nritems) *
2088 sizeof(struct btrfs_key_ptr)); 2865 sizeof(struct btrfs_key_ptr));
2089 2866
2867 tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
2868 src_nritems - push_items, push_items);
2090 copy_extent_buffer(dst, src, 2869 copy_extent_buffer(dst, src,
2091 btrfs_node_key_ptr_offset(0), 2870 btrfs_node_key_ptr_offset(0),
2092 btrfs_node_key_ptr_offset(src_nritems - push_items), 2871 btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2129,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2129 2908
2130 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2909 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2131 root->root_key.objectid, &lower_key, 2910 root->root_key.objectid, &lower_key,
2132 level, root->node->start, 0, 0); 2911 level, root->node->start, 0);
2133 if (IS_ERR(c)) 2912 if (IS_ERR(c))
2134 return PTR_ERR(c); 2913 return PTR_ERR(c);
2135 2914
@@ -2161,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2161 btrfs_mark_buffer_dirty(c); 2940 btrfs_mark_buffer_dirty(c);
2162 2941
2163 old = root->node; 2942 old = root->node;
2943 tree_mod_log_set_root_pointer(root, c);
2164 rcu_assign_pointer(root->node, c); 2944 rcu_assign_pointer(root->node, c);
2165 2945
2166 /* the super has an extra ref to root->node */ 2946 /* the super has an extra ref to root->node */
@@ -2184,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2184static void insert_ptr(struct btrfs_trans_handle *trans, 2964static void insert_ptr(struct btrfs_trans_handle *trans,
2185 struct btrfs_root *root, struct btrfs_path *path, 2965 struct btrfs_root *root, struct btrfs_path *path,
2186 struct btrfs_disk_key *key, u64 bytenr, 2966 struct btrfs_disk_key *key, u64 bytenr,
2187 int slot, int level) 2967 int slot, int level, int tree_mod_log)
2188{ 2968{
2189 struct extent_buffer *lower; 2969 struct extent_buffer *lower;
2190 int nritems; 2970 int nritems;
2971 int ret;
2191 2972
2192 BUG_ON(!path->nodes[level]); 2973 BUG_ON(!path->nodes[level]);
2193 btrfs_assert_tree_locked(path->nodes[level]); 2974 btrfs_assert_tree_locked(path->nodes[level]);
@@ -2196,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
2196 BUG_ON(slot > nritems); 2977 BUG_ON(slot > nritems);
2197 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); 2978 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
2198 if (slot != nritems) { 2979 if (slot != nritems) {
2980 if (tree_mod_log && level)
2981 tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
2982 slot, nritems - slot);
2199 memmove_extent_buffer(lower, 2983 memmove_extent_buffer(lower,
2200 btrfs_node_key_ptr_offset(slot + 1), 2984 btrfs_node_key_ptr_offset(slot + 1),
2201 btrfs_node_key_ptr_offset(slot), 2985 btrfs_node_key_ptr_offset(slot),
2202 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2986 (nritems - slot) * sizeof(struct btrfs_key_ptr));
2203 } 2987 }
2988 if (tree_mod_log && level) {
2989 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
2990 MOD_LOG_KEY_ADD);
2991 BUG_ON(ret < 0);
2992 }
2204 btrfs_set_node_key(lower, key, slot); 2993 btrfs_set_node_key(lower, key, slot);
2205 btrfs_set_node_blockptr(lower, slot, bytenr); 2994 btrfs_set_node_blockptr(lower, slot, bytenr);
2206 WARN_ON(trans->transid == 0); 2995 WARN_ON(trans->transid == 0);
@@ -2252,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2252 3041
2253 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3042 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2254 root->root_key.objectid, 3043 root->root_key.objectid,
2255 &disk_key, level, c->start, 0, 0); 3044 &disk_key, level, c->start, 0);
2256 if (IS_ERR(split)) 3045 if (IS_ERR(split))
2257 return PTR_ERR(split); 3046 return PTR_ERR(split);
2258 3047
@@ -2271,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2271 (unsigned long)btrfs_header_chunk_tree_uuid(split), 3060 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2272 BTRFS_UUID_SIZE); 3061 BTRFS_UUID_SIZE);
2273 3062
2274 3063 tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
2275 copy_extent_buffer(split, c, 3064 copy_extent_buffer(split, c,
2276 btrfs_node_key_ptr_offset(0), 3065 btrfs_node_key_ptr_offset(0),
2277 btrfs_node_key_ptr_offset(mid), 3066 btrfs_node_key_ptr_offset(mid),
@@ -2284,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2284 btrfs_mark_buffer_dirty(split); 3073 btrfs_mark_buffer_dirty(split);
2285 3074
2286 insert_ptr(trans, root, path, &disk_key, split->start, 3075 insert_ptr(trans, root, path, &disk_key, split->start,
2287 path->slots[level + 1] + 1, level + 1); 3076 path->slots[level + 1] + 1, level + 1, 1);
2288 3077
2289 if (path->slots[level] >= mid) { 3078 if (path->slots[level] >= mid) {
2290 path->slots[level] -= mid; 3079 path->slots[level] -= mid;
@@ -2821,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
2821 btrfs_set_header_nritems(l, mid); 3610 btrfs_set_header_nritems(l, mid);
2822 btrfs_item_key(right, &disk_key, 0); 3611 btrfs_item_key(right, &disk_key, 0);
2823 insert_ptr(trans, root, path, &disk_key, right->start, 3612 insert_ptr(trans, root, path, &disk_key, right->start,
2824 path->slots[1] + 1, 1); 3613 path->slots[1] + 1, 1, 0);
2825 3614
2826 btrfs_mark_buffer_dirty(right); 3615 btrfs_mark_buffer_dirty(right);
2827 btrfs_mark_buffer_dirty(l); 3616 btrfs_mark_buffer_dirty(l);
@@ -3004,7 +3793,7 @@ again:
3004 3793
3005 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 3794 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
3006 root->root_key.objectid, 3795 root->root_key.objectid,
3007 &disk_key, 0, l->start, 0, 0); 3796 &disk_key, 0, l->start, 0);
3008 if (IS_ERR(right)) 3797 if (IS_ERR(right))
3009 return PTR_ERR(right); 3798 return PTR_ERR(right);
3010 3799
@@ -3028,7 +3817,7 @@ again:
3028 if (mid <= slot) { 3817 if (mid <= slot) {
3029 btrfs_set_header_nritems(right, 0); 3818 btrfs_set_header_nritems(right, 0);
3030 insert_ptr(trans, root, path, &disk_key, right->start, 3819 insert_ptr(trans, root, path, &disk_key, right->start,
3031 path->slots[1] + 1, 1); 3820 path->slots[1] + 1, 1, 0);
3032 btrfs_tree_unlock(path->nodes[0]); 3821 btrfs_tree_unlock(path->nodes[0]);
3033 free_extent_buffer(path->nodes[0]); 3822 free_extent_buffer(path->nodes[0]);
3034 path->nodes[0] = right; 3823 path->nodes[0] = right;
@@ -3037,7 +3826,7 @@ again:
3037 } else { 3826 } else {
3038 btrfs_set_header_nritems(right, 0); 3827 btrfs_set_header_nritems(right, 0);
3039 insert_ptr(trans, root, path, &disk_key, right->start, 3828 insert_ptr(trans, root, path, &disk_key, right->start,
3040 path->slots[1], 1); 3829 path->slots[1], 1, 0);
3041 btrfs_tree_unlock(path->nodes[0]); 3830 btrfs_tree_unlock(path->nodes[0]);
3042 free_extent_buffer(path->nodes[0]); 3831 free_extent_buffer(path->nodes[0]);
3043 path->nodes[0] = right; 3832 path->nodes[0] = right;
@@ -3749,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3749 * empty a node. 4538 * empty a node.
3750 */ 4539 */
3751static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4540static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3752 struct btrfs_path *path, int level, int slot) 4541 struct btrfs_path *path, int level, int slot,
4542 int tree_mod_log)
3753{ 4543{
3754 struct extent_buffer *parent = path->nodes[level]; 4544 struct extent_buffer *parent = path->nodes[level];
3755 u32 nritems; 4545 u32 nritems;
4546 int ret;
3756 4547
3757 nritems = btrfs_header_nritems(parent); 4548 nritems = btrfs_header_nritems(parent);
3758 if (slot != nritems - 1) { 4549 if (slot != nritems - 1) {
4550 if (tree_mod_log && level)
4551 tree_mod_log_eb_move(root->fs_info, parent, slot,
4552 slot + 1, nritems - slot - 1);
3759 memmove_extent_buffer(parent, 4553 memmove_extent_buffer(parent,
3760 btrfs_node_key_ptr_offset(slot), 4554 btrfs_node_key_ptr_offset(slot),
3761 btrfs_node_key_ptr_offset(slot + 1), 4555 btrfs_node_key_ptr_offset(slot + 1),
3762 sizeof(struct btrfs_key_ptr) * 4556 sizeof(struct btrfs_key_ptr) *
3763 (nritems - slot - 1)); 4557 (nritems - slot - 1));
4558 } else if (tree_mod_log && level) {
4559 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4560 MOD_LOG_KEY_REMOVE);
4561 BUG_ON(ret < 0);
3764 } 4562 }
4563
3765 nritems--; 4564 nritems--;
3766 btrfs_set_header_nritems(parent, nritems); 4565 btrfs_set_header_nritems(parent, nritems);
3767 if (nritems == 0 && parent == root->node) { 4566 if (nritems == 0 && parent == root->node) {
@@ -3793,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
3793 struct extent_buffer *leaf) 4592 struct extent_buffer *leaf)
3794{ 4593{
3795 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4594 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3796 del_ptr(trans, root, path, 1, path->slots[1]); 4595 del_ptr(trans, root, path, 1, path->slots[1], 1);
3797 4596
3798 /* 4597 /*
3799 * btrfs_free_extent is expensive, we want to make sure we 4598 * btrfs_free_extent is expensive, we want to make sure we
@@ -3804,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
3804 root_sub_used(root, leaf->len); 4603 root_sub_used(root, leaf->len);
3805 4604
3806 extent_buffer_get(leaf); 4605 extent_buffer_get(leaf);
3807 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); 4606 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3808 free_extent_buffer_stale(leaf); 4607 free_extent_buffer_stale(leaf);
3809} 4608}
3810/* 4609/*
@@ -4271,7 +5070,7 @@ again:
4271 next = c; 5070 next = c;
4272 next_rw_lock = path->locks[level]; 5071 next_rw_lock = path->locks[level];
4273 ret = read_block_for_search(NULL, root, path, &next, level, 5072 ret = read_block_for_search(NULL, root, path, &next, level,
4274 slot, &key); 5073 slot, &key, 0);
4275 if (ret == -EAGAIN) 5074 if (ret == -EAGAIN)
4276 goto again; 5075 goto again;
4277 5076
@@ -4308,7 +5107,7 @@ again:
4308 break; 5107 break;
4309 5108
4310 ret = read_block_for_search(NULL, root, path, &next, level, 5109 ret = read_block_for_search(NULL, root, path, &next, level,
4311 0, &key); 5110 0, &key, 0);
4312 if (ret == -EAGAIN) 5111 if (ret == -EAGAIN)
4313 goto again; 5112 goto again;
4314 5113
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd72331d600..0236d03c6732 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
173#define BTRFS_FT_XATTR 8 173#define BTRFS_FT_XATTR 8
174#define BTRFS_FT_MAX 9 174#define BTRFS_FT_MAX 9
175 175
176/* ioprio of readahead is set to idle */
177#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
178
176/* 179/*
177 * The key defines the order in the tree, and so it also defines (optimal) 180 * The key defines the order in the tree, and so it also defines (optimal)
178 * block layout. 181 * block layout.
@@ -823,6 +826,14 @@ struct btrfs_csum_item {
823 u8 csum; 826 u8 csum;
824} __attribute__ ((__packed__)); 827} __attribute__ ((__packed__));
825 828
829struct btrfs_dev_stats_item {
830 /*
831 * grow this item struct at the end for future enhancements and keep
832 * the existing values unchanged
833 */
834 __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
835} __attribute__ ((__packed__));
836
826/* different types of block groups (and chunks) */ 837/* different types of block groups (and chunks) */
827#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) 838#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
828#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) 839#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1129,6 +1140,15 @@ struct btrfs_fs_info {
1129 spinlock_t delayed_iput_lock; 1140 spinlock_t delayed_iput_lock;
1130 struct list_head delayed_iputs; 1141 struct list_head delayed_iputs;
1131 1142
1143 /* this protects tree_mod_seq_list */
1144 spinlock_t tree_mod_seq_lock;
1145 atomic_t tree_mod_seq;
1146 struct list_head tree_mod_seq_list;
1147
1148 /* this protects tree_mod_log */
1149 rwlock_t tree_mod_log_lock;
1150 struct rb_root tree_mod_log;
1151
1132 atomic_t nr_async_submits; 1152 atomic_t nr_async_submits;
1133 atomic_t async_submit_draining; 1153 atomic_t async_submit_draining;
1134 atomic_t nr_async_bios; 1154 atomic_t nr_async_bios;
@@ -1375,7 +1395,7 @@ struct btrfs_root {
1375 struct list_head root_list; 1395 struct list_head root_list;
1376 1396
1377 spinlock_t orphan_lock; 1397 spinlock_t orphan_lock;
1378 struct list_head orphan_list; 1398 atomic_t orphan_inodes;
1379 struct btrfs_block_rsv *orphan_block_rsv; 1399 struct btrfs_block_rsv *orphan_block_rsv;
1380 int orphan_item_inserted; 1400 int orphan_item_inserted;
1381 int orphan_cleanup_state; 1401 int orphan_cleanup_state;
@@ -1508,6 +1528,12 @@ struct btrfs_ioctl_defrag_range_args {
1508#define BTRFS_BALANCE_ITEM_KEY 248 1528#define BTRFS_BALANCE_ITEM_KEY 248
1509 1529
1510/* 1530/*
1531 * Persistantly stores the io stats in the device tree.
1532 * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
1533 */
1534#define BTRFS_DEV_STATS_KEY 249
1535
1536/*
1511 * string items are for debugging. They just store a short string of 1537 * string items are for debugging. They just store a short string of
1512 * data in the FS 1538 * data in the FS
1513 */ 1539 */
@@ -2415,6 +2441,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
2415 return btrfs_item_size(eb, e) - offset; 2441 return btrfs_item_size(eb, e) - offset;
2416} 2442}
2417 2443
2444/* btrfs_dev_stats_item */
2445static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
2446 struct btrfs_dev_stats_item *ptr,
2447 int index)
2448{
2449 u64 val;
2450
2451 read_extent_buffer(eb, &val,
2452 offsetof(struct btrfs_dev_stats_item, values) +
2453 ((unsigned long)ptr) + (index * sizeof(u64)),
2454 sizeof(val));
2455 return val;
2456}
2457
2458static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
2459 struct btrfs_dev_stats_item *ptr,
2460 int index, u64 val)
2461{
2462 write_extent_buffer(eb, &val,
2463 offsetof(struct btrfs_dev_stats_item, values) +
2464 ((unsigned long)ptr) + (index * sizeof(u64)),
2465 sizeof(val));
2466}
2467
2418static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2468static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2419{ 2469{
2420 return sb->s_fs_info; 2470 return sb->s_fs_info;
@@ -2496,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2496 struct btrfs_root *root, u32 blocksize, 2546 struct btrfs_root *root, u32 blocksize,
2497 u64 parent, u64 root_objectid, 2547 u64 parent, u64 root_objectid,
2498 struct btrfs_disk_key *key, int level, 2548 struct btrfs_disk_key *key, int level,
2499 u64 hint, u64 empty_size, int for_cow); 2549 u64 hint, u64 empty_size);
2500void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2550void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2501 struct btrfs_root *root, 2551 struct btrfs_root *root,
2502 struct extent_buffer *buf, 2552 struct extent_buffer *buf,
2503 u64 parent, int last_ref, int for_cow); 2553 u64 parent, int last_ref);
2504struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2554struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2505 struct btrfs_root *root, 2555 struct btrfs_root *root,
2506 u64 bytenr, u32 blocksize, 2556 u64 bytenr, u32 blocksize,
@@ -2659,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
2659int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root 2709int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2660 *root, struct btrfs_key *key, struct btrfs_path *p, int 2710 *root, struct btrfs_key *key, struct btrfs_path *p, int
2661 ins_len, int cow); 2711 ins_len, int cow);
2712int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2713 struct btrfs_path *p, u64 time_seq);
2662int btrfs_realloc_node(struct btrfs_trans_handle *trans, 2714int btrfs_realloc_node(struct btrfs_trans_handle *trans,
2663 struct btrfs_root *root, struct extent_buffer *parent, 2715 struct btrfs_root *root, struct extent_buffer *parent,
2664 int start_slot, int cache_only, u64 *last_ret, 2716 int start_slot, int cache_only, u64 *last_ret,
@@ -2922,7 +2974,6 @@ int btrfs_readpage(struct file *file, struct page *page);
2922void btrfs_evict_inode(struct inode *inode); 2974void btrfs_evict_inode(struct inode *inode);
2923int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); 2975int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2924int btrfs_dirty_inode(struct inode *inode); 2976int btrfs_dirty_inode(struct inode *inode);
2925int btrfs_update_time(struct file *file);
2926struct inode *btrfs_alloc_inode(struct super_block *sb); 2977struct inode *btrfs_alloc_inode(struct super_block *sb);
2927void btrfs_destroy_inode(struct inode *inode); 2978void btrfs_destroy_inode(struct inode *inode);
2928int btrfs_drop_inode(struct inode *inode); 2979int btrfs_drop_inode(struct inode *inode);
@@ -3098,4 +3149,23 @@ void btrfs_reada_detach(void *handle);
3098int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 3149int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
3099 u64 start, int err); 3150 u64 start, int err);
3100 3151
3152/* delayed seq elem */
3153struct seq_list {
3154 struct list_head list;
3155 u64 seq;
3156 u32 flags;
3157};
3158
3159void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
3160 struct seq_list *elem);
3161void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
3162 struct seq_list *elem);
3163
3164static inline int is_fstree(u64 rootid)
3165{
3166 if (rootid == BTRFS_FS_TREE_OBJECTID ||
3167 (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
3168 return 1;
3169 return 0;
3170}
3101#endif 3171#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 03e3748d84d0..c18d0442ae6d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata(
669 return ret; 669 return ret;
670 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 670 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
671 spin_lock(&BTRFS_I(inode)->lock); 671 spin_lock(&BTRFS_I(inode)->lock);
672 if (BTRFS_I(inode)->delalloc_meta_reserved) { 672 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
673 BTRFS_I(inode)->delalloc_meta_reserved = 0; 673 &BTRFS_I(inode)->runtime_flags)) {
674 spin_unlock(&BTRFS_I(inode)->lock); 674 spin_unlock(&BTRFS_I(inode)->lock);
675 release = true; 675 release = true;
676 goto migrate; 676 goto migrate;
@@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1706 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); 1706 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
1707 btrfs_set_stack_inode_generation(inode_item, 1707 btrfs_set_stack_inode_generation(inode_item,
1708 BTRFS_I(inode)->generation); 1708 BTRFS_I(inode)->generation);
1709 btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); 1709 btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
1710 btrfs_set_stack_inode_transid(inode_item, trans->transid); 1710 btrfs_set_stack_inode_transid(inode_item, trans->transid);
1711 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); 1711 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
1712 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); 1712 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1754 set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); 1754 set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
1755 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); 1755 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1756 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); 1756 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
1757 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); 1757 inode->i_version = btrfs_stack_inode_sequence(inode_item);
1758 inode->i_rdev = 0; 1758 inode->i_rdev = 0;
1759 *rdev = btrfs_stack_inode_rdev(inode_item); 1759 *rdev = btrfs_stack_inode_rdev(inode_item);
1760 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); 1760 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 69f22e3ab3bc..13ae7b04790e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
525 ref->is_head = 0; 525 ref->is_head = 0;
526 ref->in_tree = 1; 526 ref->in_tree = 1;
527 527
528 if (need_ref_seq(for_cow, ref_root)) 528 if (is_fstree(ref_root))
529 seq = inc_delayed_seq(delayed_refs); 529 seq = inc_delayed_seq(delayed_refs);
530 ref->seq = seq; 530 ref->seq = seq;
531 531
@@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
584 ref->is_head = 0; 584 ref->is_head = 0;
585 ref->in_tree = 1; 585 ref->in_tree = 1;
586 586
587 if (need_ref_seq(for_cow, ref_root)) 587 if (is_fstree(ref_root))
588 seq = inc_delayed_seq(delayed_refs); 588 seq = inc_delayed_seq(delayed_refs);
589 ref->seq = seq; 589 ref->seq = seq;
590 590
@@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
659 num_bytes, parent, ref_root, level, action, 659 num_bytes, parent, ref_root, level, action,
660 for_cow); 660 for_cow);
661 if (!need_ref_seq(for_cow, ref_root) && 661 if (!is_fstree(ref_root) &&
662 waitqueue_active(&delayed_refs->seq_wait)) 662 waitqueue_active(&delayed_refs->seq_wait))
663 wake_up(&delayed_refs->seq_wait); 663 wake_up(&delayed_refs->seq_wait);
664 spin_unlock(&delayed_refs->lock); 664 spin_unlock(&delayed_refs->lock);
665
665 return 0; 666 return 0;
666} 667}
667 668
@@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
706 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 707 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
707 num_bytes, parent, ref_root, owner, offset, 708 num_bytes, parent, ref_root, owner, offset,
708 action, for_cow); 709 action, for_cow);
709 if (!need_ref_seq(for_cow, ref_root) && 710 if (!is_fstree(ref_root) &&
710 waitqueue_active(&delayed_refs->seq_wait)) 711 waitqueue_active(&delayed_refs->seq_wait))
711 wake_up(&delayed_refs->seq_wait); 712 wake_up(&delayed_refs->seq_wait);
712 spin_unlock(&delayed_refs->lock); 713 spin_unlock(&delayed_refs->lock);
714
713 return 0; 715 return 0;
714} 716}
715 717
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d8f244d94925..413927fb9957 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
196 struct list_head *cluster, u64 search_start); 196 struct list_head *cluster, u64 search_start);
197 197
198struct seq_list {
199 struct list_head list;
200 u64 seq;
201};
202
203static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) 198static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
204{ 199{
205 assert_spin_locked(&delayed_refs->lock); 200 assert_spin_locked(&delayed_refs->lock);
@@ -230,25 +225,6 @@ int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
230 u64 seq); 225 u64 seq);
231 226
232/* 227/*
233 * delayed refs with a ref_seq > 0 must be held back during backref walking.
234 * this only applies to items in one of the fs-trees. for_cow items never need
235 * to be held back, so they won't get a ref_seq number.
236 */
237static inline int need_ref_seq(int for_cow, u64 rootid)
238{
239 if (for_cow)
240 return 0;
241
242 if (rootid == BTRFS_FS_TREE_OBJECTID)
243 return 1;
244
245 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
246 return 1;
247
248 return 0;
249}
250
251/*
252 * a node might live in a head or a regular ref, this lets you 228 * a node might live in a head or a regular ref, this lets you
253 * test for the proper type to use. 229 * test for the proper type to use.
254 */ 230 */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e1fe74a2ce16..7ae51decf6d3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1153 root->orphan_block_rsv = NULL; 1153 root->orphan_block_rsv = NULL;
1154 1154
1155 INIT_LIST_HEAD(&root->dirty_list); 1155 INIT_LIST_HEAD(&root->dirty_list);
1156 INIT_LIST_HEAD(&root->orphan_list);
1157 INIT_LIST_HEAD(&root->root_list); 1156 INIT_LIST_HEAD(&root->root_list);
1158 spin_lock_init(&root->orphan_lock); 1157 spin_lock_init(&root->orphan_lock);
1159 spin_lock_init(&root->inode_lock); 1158 spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1166 atomic_set(&root->log_commit[0], 0); 1165 atomic_set(&root->log_commit[0], 0);
1167 atomic_set(&root->log_commit[1], 0); 1166 atomic_set(&root->log_commit[1], 0);
1168 atomic_set(&root->log_writers, 0); 1167 atomic_set(&root->log_writers, 0);
1168 atomic_set(&root->orphan_inodes, 0);
1169 root->log_batch = 0; 1169 root->log_batch = 0;
1170 root->log_transid = 0; 1170 root->log_transid = 0;
1171 root->last_log_commit = 0; 1171 root->last_log_commit = 0;
@@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1252 1252
1253 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1253 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1254 BTRFS_TREE_LOG_OBJECTID, NULL, 1254 BTRFS_TREE_LOG_OBJECTID, NULL,
1255 0, 0, 0, 0); 1255 0, 0, 0);
1256 if (IS_ERR(leaf)) { 1256 if (IS_ERR(leaf)) {
1257 kfree(root); 1257 kfree(root);
1258 return ERR_CAST(leaf); 1258 return ERR_CAST(leaf);
@@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb,
1914 spin_lock_init(&fs_info->delayed_iput_lock); 1914 spin_lock_init(&fs_info->delayed_iput_lock);
1915 spin_lock_init(&fs_info->defrag_inodes_lock); 1915 spin_lock_init(&fs_info->defrag_inodes_lock);
1916 spin_lock_init(&fs_info->free_chunk_lock); 1916 spin_lock_init(&fs_info->free_chunk_lock);
1917 spin_lock_init(&fs_info->tree_mod_seq_lock);
1918 rwlock_init(&fs_info->tree_mod_log_lock);
1917 mutex_init(&fs_info->reloc_mutex); 1919 mutex_init(&fs_info->reloc_mutex);
1918 1920
1919 init_completion(&fs_info->kobj_unregister); 1921 init_completion(&fs_info->kobj_unregister);
1920 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1922 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1921 INIT_LIST_HEAD(&fs_info->space_info); 1923 INIT_LIST_HEAD(&fs_info->space_info);
1924 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
1922 btrfs_mapping_init(&fs_info->mapping_tree); 1925 btrfs_mapping_init(&fs_info->mapping_tree);
1923 btrfs_init_block_rsv(&fs_info->global_block_rsv); 1926 btrfs_init_block_rsv(&fs_info->global_block_rsv);
1924 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); 1927 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
@@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb,
1931 atomic_set(&fs_info->async_submit_draining, 0); 1934 atomic_set(&fs_info->async_submit_draining, 0);
1932 atomic_set(&fs_info->nr_async_bios, 0); 1935 atomic_set(&fs_info->nr_async_bios, 0);
1933 atomic_set(&fs_info->defrag_running, 0); 1936 atomic_set(&fs_info->defrag_running, 0);
1937 atomic_set(&fs_info->tree_mod_seq, 0);
1934 fs_info->sb = sb; 1938 fs_info->sb = sb;
1935 fs_info->max_inline = 8192 * 1024; 1939 fs_info->max_inline = 8192 * 1024;
1936 fs_info->metadata_ratio = 0; 1940 fs_info->metadata_ratio = 0;
1937 fs_info->defrag_inodes = RB_ROOT; 1941 fs_info->defrag_inodes = RB_ROOT;
1938 fs_info->trans_no_join = 0; 1942 fs_info->trans_no_join = 0;
1939 fs_info->free_chunk_space = 0; 1943 fs_info->free_chunk_space = 0;
1944 fs_info->tree_mod_log = RB_ROOT;
1940 1945
1941 /* readahead state */ 1946 /* readahead state */
1942 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 1947 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -2001,7 +2006,8 @@ int open_ctree(struct super_block *sb,
2001 BTRFS_I(fs_info->btree_inode)->root = tree_root; 2006 BTRFS_I(fs_info->btree_inode)->root = tree_root;
2002 memset(&BTRFS_I(fs_info->btree_inode)->location, 0, 2007 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2003 sizeof(struct btrfs_key)); 2008 sizeof(struct btrfs_key));
2004 BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; 2009 set_bit(BTRFS_INODE_DUMMY,
2010 &BTRFS_I(fs_info->btree_inode)->runtime_flags);
2005 insert_inode_hash(fs_info->btree_inode); 2011 insert_inode_hash(fs_info->btree_inode);
2006 2012
2007 spin_lock_init(&fs_info->block_group_cache_lock); 2013 spin_lock_init(&fs_info->block_group_cache_lock);
@@ -2353,6 +2359,13 @@ retry_root_backup:
2353 fs_info->generation = generation; 2359 fs_info->generation = generation;
2354 fs_info->last_trans_committed = generation; 2360 fs_info->last_trans_committed = generation;
2355 2361
2362 ret = btrfs_init_dev_stats(fs_info);
2363 if (ret) {
2364 printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
2365 ret);
2366 goto fail_block_groups;
2367 }
2368
2356 ret = btrfs_init_space_info(fs_info); 2369 ret = btrfs_init_space_info(fs_info);
2357 if (ret) { 2370 if (ret) {
2358 printk(KERN_ERR "Failed to initial space info: %d\n", ret); 2371 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2556,18 +2569,19 @@ recovery_tree_root:
2556 2569
2557static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2570static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2558{ 2571{
2559 char b[BDEVNAME_SIZE];
2560
2561 if (uptodate) { 2572 if (uptodate) {
2562 set_buffer_uptodate(bh); 2573 set_buffer_uptodate(bh);
2563 } else { 2574 } else {
2575 struct btrfs_device *device = (struct btrfs_device *)
2576 bh->b_private;
2577
2564 printk_ratelimited(KERN_WARNING "lost page write due to " 2578 printk_ratelimited(KERN_WARNING "lost page write due to "
2565 "I/O error on %s\n", 2579 "I/O error on %s\n", device->name);
2566 bdevname(bh->b_bdev, b));
2567 /* note, we dont' set_buffer_write_io_error because we have 2580 /* note, we dont' set_buffer_write_io_error because we have
2568 * our own ways of dealing with the IO errors 2581 * our own ways of dealing with the IO errors
2569 */ 2582 */
2570 clear_buffer_uptodate(bh); 2583 clear_buffer_uptodate(bh);
2584 btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
2571 } 2585 }
2572 unlock_buffer(bh); 2586 unlock_buffer(bh);
2573 put_bh(bh); 2587 put_bh(bh);
@@ -2682,6 +2696,7 @@ static int write_dev_supers(struct btrfs_device *device,
2682 set_buffer_uptodate(bh); 2696 set_buffer_uptodate(bh);
2683 lock_buffer(bh); 2697 lock_buffer(bh);
2684 bh->b_end_io = btrfs_end_buffer_write_sync; 2698 bh->b_end_io = btrfs_end_buffer_write_sync;
2699 bh->b_private = device;
2685 } 2700 }
2686 2701
2687 /* 2702 /*
@@ -2740,6 +2755,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2740 } 2755 }
2741 if (!bio_flagged(bio, BIO_UPTODATE)) { 2756 if (!bio_flagged(bio, BIO_UPTODATE)) {
2742 ret = -EIO; 2757 ret = -EIO;
2758 if (!bio_flagged(bio, BIO_EOPNOTSUPP))
2759 btrfs_dev_stat_inc_and_print(device,
2760 BTRFS_DEV_STAT_FLUSH_ERRS);
2743 } 2761 }
2744 2762
2745 /* drop the reference from the wait == 0 run */ 2763 /* drop the reference from the wait == 0 run */
@@ -2902,19 +2920,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
2902 return ret; 2920 return ret;
2903} 2921}
2904 2922
2905/* Kill all outstanding I/O */
2906void btrfs_abort_devices(struct btrfs_root *root)
2907{
2908 struct list_head *head;
2909 struct btrfs_device *dev;
2910 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2911 head = &root->fs_info->fs_devices->devices;
2912 list_for_each_entry_rcu(dev, head, dev_list) {
2913 blk_abort_queue(dev->bdev->bd_disk->queue);
2914 }
2915 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2916}
2917
2918void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 2923void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2919{ 2924{
2920 spin_lock(&fs_info->fs_roots_radix_lock); 2925 spin_lock(&fs_info->fs_roots_radix_lock);
@@ -3671,17 +3676,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3671 return 0; 3676 return 0;
3672} 3677}
3673 3678
3674static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
3675 u64 start, u64 end,
3676 struct extent_state *state)
3677{
3678 struct super_block *sb = page->mapping->host->i_sb;
3679 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3680 btrfs_error(fs_info, -EIO,
3681 "Error occured while writing out btree at %llu", start);
3682 return -EIO;
3683}
3684
3685static struct extent_io_ops btree_extent_io_ops = { 3679static struct extent_io_ops btree_extent_io_ops = {
3686 .write_cache_pages_lock_hook = btree_lock_page_hook, 3680 .write_cache_pages_lock_hook = btree_lock_page_hook,
3687 .readpage_end_io_hook = btree_readpage_end_io_hook, 3681 .readpage_end_io_hook = btree_readpage_end_io_hook,
@@ -3689,5 +3683,4 @@ static struct extent_io_ops btree_extent_io_ops = {
3689 .submit_bio_hook = btree_submit_bio_hook, 3683 .submit_bio_hook = btree_submit_bio_hook,
3690 /* note we're sharing with inode.c for the merge bio hook */ 3684 /* note we're sharing with inode.c for the merge bio hook */
3691 .merge_bio_hook = btrfs_merge_bio_hook, 3685 .merge_bio_hook = btrfs_merge_bio_hook,
3692 .writepage_io_failed_hook = btree_writepage_io_failed_hook,
3693}; 3686};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index ab1830aaf0ed..05b3fab39f7e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
89int btrfs_cleanup_transaction(struct btrfs_root *root); 89int btrfs_cleanup_transaction(struct btrfs_root *root);
90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, 90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
91 struct btrfs_root *root); 91 struct btrfs_root *root);
92void btrfs_abort_devices(struct btrfs_root *root);
93 92
94#ifdef CONFIG_DEBUG_LOCK_ALLOC 93#ifdef CONFIG_DEBUG_LOCK_ALLOC
95void btrfs_init_lockdep(void); 94void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index e887ee62b6d4..614f34a899c2 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -13,15 +13,14 @@
13 parent_root_objectid) / 4) 13 parent_root_objectid) / 4)
14#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) 14#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
15 15
16static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, 16static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
17 int connectable) 17 struct inode *parent)
18{ 18{
19 struct btrfs_fid *fid = (struct btrfs_fid *)fh; 19 struct btrfs_fid *fid = (struct btrfs_fid *)fh;
20 struct inode *inode = dentry->d_inode;
21 int len = *max_len; 20 int len = *max_len;
22 int type; 21 int type;
23 22
24 if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { 23 if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
25 *max_len = BTRFS_FID_SIZE_CONNECTABLE; 24 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
26 return 255; 25 return 255;
27 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { 26 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
@@ -36,19 +35,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
36 fid->root_objectid = BTRFS_I(inode)->root->objectid; 35 fid->root_objectid = BTRFS_I(inode)->root->objectid;
37 fid->gen = inode->i_generation; 36 fid->gen = inode->i_generation;
38 37
39 if (connectable && !S_ISDIR(inode->i_mode)) { 38 if (parent) {
40 struct inode *parent;
41 u64 parent_root_id; 39 u64 parent_root_id;
42 40
43 spin_lock(&dentry->d_lock);
44
45 parent = dentry->d_parent->d_inode;
46 fid->parent_objectid = BTRFS_I(parent)->location.objectid; 41 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
47 fid->parent_gen = parent->i_generation; 42 fid->parent_gen = parent->i_generation;
48 parent_root_id = BTRFS_I(parent)->root->objectid; 43 parent_root_id = BTRFS_I(parent)->root->objectid;
49 44
50 spin_unlock(&dentry->d_lock);
51
52 if (parent_root_id != fid->root_objectid) { 45 if (parent_root_id != fid->root_objectid) {
53 fid->parent_root_objectid = parent_root_id; 46 fid->parent_root_objectid = parent_root_id;
54 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; 47 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 49fd7b66d57b..4b5a1e1bdefb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3578,7 +3578,7 @@ again:
3578 space_info->chunk_alloc = 0; 3578 space_info->chunk_alloc = 0;
3579 spin_unlock(&space_info->lock); 3579 spin_unlock(&space_info->lock);
3580out: 3580out:
3581 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3581 mutex_unlock(&fs_info->chunk_mutex);
3582 return ret; 3582 return ret;
3583} 3583}
3584 3584
@@ -4355,10 +4355,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)
4355 BTRFS_I(inode)->outstanding_extents--; 4355 BTRFS_I(inode)->outstanding_extents--;
4356 4356
4357 if (BTRFS_I(inode)->outstanding_extents == 0 && 4357 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4358 BTRFS_I(inode)->delalloc_meta_reserved) { 4358 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4359 &BTRFS_I(inode)->runtime_flags))
4359 drop_inode_space = 1; 4360 drop_inode_space = 1;
4360 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4361 }
4362 4361
4363 /* 4362 /*
4364 * If we have more or the same amount of outsanding extents than we have 4363 * If we have more or the same amount of outsanding extents than we have
@@ -4465,7 +4464,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4465 * Add an item to reserve for updating the inode when we complete the 4464 * Add an item to reserve for updating the inode when we complete the
4466 * delalloc io. 4465 * delalloc io.
4467 */ 4466 */
4468 if (!BTRFS_I(inode)->delalloc_meta_reserved) { 4467 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4468 &BTRFS_I(inode)->runtime_flags)) {
4469 nr_extents++; 4469 nr_extents++;
4470 extra_reserve = 1; 4470 extra_reserve = 1;
4471 } 4471 }
@@ -4511,7 +4511,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4511 4511
4512 spin_lock(&BTRFS_I(inode)->lock); 4512 spin_lock(&BTRFS_I(inode)->lock);
4513 if (extra_reserve) { 4513 if (extra_reserve) {
4514 BTRFS_I(inode)->delalloc_meta_reserved = 1; 4514 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4515 &BTRFS_I(inode)->runtime_flags);
4515 nr_extents--; 4516 nr_extents--;
4516 } 4517 }
4517 BTRFS_I(inode)->reserved_extents += nr_extents; 4518 BTRFS_I(inode)->reserved_extents += nr_extents;
@@ -5217,7 +5218,7 @@ out:
5217void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5218void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5218 struct btrfs_root *root, 5219 struct btrfs_root *root,
5219 struct extent_buffer *buf, 5220 struct extent_buffer *buf,
5220 u64 parent, int last_ref, int for_cow) 5221 u64 parent, int last_ref)
5221{ 5222{
5222 struct btrfs_block_group_cache *cache = NULL; 5223 struct btrfs_block_group_cache *cache = NULL;
5223 int ret; 5224 int ret;
@@ -5227,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5227 buf->start, buf->len, 5228 buf->start, buf->len,
5228 parent, root->root_key.objectid, 5229 parent, root->root_key.objectid,
5229 btrfs_header_level(buf), 5230 btrfs_header_level(buf),
5230 BTRFS_DROP_DELAYED_REF, NULL, for_cow); 5231 BTRFS_DROP_DELAYED_REF, NULL, 0);
5231 BUG_ON(ret); /* -ENOMEM */ 5232 BUG_ON(ret); /* -ENOMEM */
5232 } 5233 }
5233 5234
@@ -6249,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6249 struct btrfs_root *root, u32 blocksize, 6250 struct btrfs_root *root, u32 blocksize,
6250 u64 parent, u64 root_objectid, 6251 u64 parent, u64 root_objectid,
6251 struct btrfs_disk_key *key, int level, 6252 struct btrfs_disk_key *key, int level,
6252 u64 hint, u64 empty_size, int for_cow) 6253 u64 hint, u64 empty_size)
6253{ 6254{
6254 struct btrfs_key ins; 6255 struct btrfs_key ins;
6255 struct btrfs_block_rsv *block_rsv; 6256 struct btrfs_block_rsv *block_rsv;
@@ -6297,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6297 ins.objectid, 6298 ins.objectid,
6298 ins.offset, parent, root_objectid, 6299 ins.offset, parent, root_objectid,
6299 level, BTRFS_ADD_DELAYED_EXTENT, 6300 level, BTRFS_ADD_DELAYED_EXTENT,
6300 extent_op, for_cow); 6301 extent_op, 0);
6301 BUG_ON(ret); /* -ENOMEM */ 6302 BUG_ON(ret); /* -ENOMEM */
6302 } 6303 }
6303 return buf; 6304 return buf;
@@ -6715,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6715 btrfs_header_owner(path->nodes[level + 1])); 6716 btrfs_header_owner(path->nodes[level + 1]));
6716 } 6717 }
6717 6718
6718 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); 6719 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6719out: 6720out:
6720 wc->refs[level] = 0; 6721 wc->refs[level] = 0;
6721 wc->flags[level] = 0; 6722 wc->flags[level] = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c9018a05036e..2c8f7b204617 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -186,7 +186,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
186 return parent; 186 return parent;
187 } 187 }
188 188
189 entry = rb_entry(node, struct tree_entry, rb_node);
190 rb_link_node(node, parent, p); 189 rb_link_node(node, parent, p);
191 rb_insert_color(node, root); 190 rb_insert_color(node, root);
192 return NULL; 191 return NULL;
@@ -413,7 +412,7 @@ static struct extent_state *next_state(struct extent_state *state)
413 412
414/* 413/*
415 * utility function to clear some bits in an extent state struct. 414 * utility function to clear some bits in an extent state struct.
416 * it will optionally wake up any one waiting on this state (wake == 1) 415 * it will optionally wake up any one waiting on this state (wake == 1).
417 * 416 *
418 * If no bits are set on the state struct after clearing things, the 417 * If no bits are set on the state struct after clearing things, the
419 * struct is freed and removed from the tree 418 * struct is freed and removed from the tree
@@ -570,10 +569,8 @@ hit_next:
570 if (err) 569 if (err)
571 goto out; 570 goto out;
572 if (state->end <= end) { 571 if (state->end <= end) {
573 clear_state_bit(tree, state, &bits, wake); 572 state = clear_state_bit(tree, state, &bits, wake);
574 if (last_end == (u64)-1) 573 goto next;
575 goto out;
576 start = last_end + 1;
577 } 574 }
578 goto search_again; 575 goto search_again;
579 } 576 }
@@ -781,7 +778,6 @@ hit_next:
781 * Just lock what we found and keep going 778 * Just lock what we found and keep going
782 */ 779 */
783 if (state->start == start && state->end <= end) { 780 if (state->start == start && state->end <= end) {
784 struct rb_node *next_node;
785 if (state->state & exclusive_bits) { 781 if (state->state & exclusive_bits) {
786 *failed_start = state->start; 782 *failed_start = state->start;
787 err = -EEXIST; 783 err = -EEXIST;
@@ -789,20 +785,15 @@ hit_next:
789 } 785 }
790 786
791 set_state_bits(tree, state, &bits); 787 set_state_bits(tree, state, &bits);
792
793 cache_state(state, cached_state); 788 cache_state(state, cached_state);
794 merge_state(tree, state); 789 merge_state(tree, state);
795 if (last_end == (u64)-1) 790 if (last_end == (u64)-1)
796 goto out; 791 goto out;
797
798 start = last_end + 1; 792 start = last_end + 1;
799 next_node = rb_next(&state->rb_node); 793 state = next_state(state);
800 if (next_node && start < end && prealloc && !need_resched()) { 794 if (start < end && state && state->start == start &&
801 state = rb_entry(next_node, struct extent_state, 795 !need_resched())
802 rb_node); 796 goto hit_next;
803 if (state->start == start)
804 goto hit_next;
805 }
806 goto search_again; 797 goto search_again;
807 } 798 }
808 799
@@ -845,6 +836,10 @@ hit_next:
845 if (last_end == (u64)-1) 836 if (last_end == (u64)-1)
846 goto out; 837 goto out;
847 start = last_end + 1; 838 start = last_end + 1;
839 state = next_state(state);
840 if (start < end && state && state->start == start &&
841 !need_resched())
842 goto hit_next;
848 } 843 }
849 goto search_again; 844 goto search_again;
850 } 845 }
@@ -994,21 +989,14 @@ hit_next:
994 * Just lock what we found and keep going 989 * Just lock what we found and keep going
995 */ 990 */
996 if (state->start == start && state->end <= end) { 991 if (state->start == start && state->end <= end) {
997 struct rb_node *next_node;
998
999 set_state_bits(tree, state, &bits); 992 set_state_bits(tree, state, &bits);
1000 clear_state_bit(tree, state, &clear_bits, 0); 993 state = clear_state_bit(tree, state, &clear_bits, 0);
1001 if (last_end == (u64)-1) 994 if (last_end == (u64)-1)
1002 goto out; 995 goto out;
1003
1004 start = last_end + 1; 996 start = last_end + 1;
1005 next_node = rb_next(&state->rb_node); 997 if (start < end && state && state->start == start &&
1006 if (next_node && start < end && prealloc && !need_resched()) { 998 !need_resched())
1007 state = rb_entry(next_node, struct extent_state, 999 goto hit_next;
1008 rb_node);
1009 if (state->start == start)
1010 goto hit_next;
1011 }
1012 goto search_again; 1000 goto search_again;
1013 } 1001 }
1014 1002
@@ -1042,10 +1030,13 @@ hit_next:
1042 goto out; 1030 goto out;
1043 if (state->end <= end) { 1031 if (state->end <= end) {
1044 set_state_bits(tree, state, &bits); 1032 set_state_bits(tree, state, &bits);
1045 clear_state_bit(tree, state, &clear_bits, 0); 1033 state = clear_state_bit(tree, state, &clear_bits, 0);
1046 if (last_end == (u64)-1) 1034 if (last_end == (u64)-1)
1047 goto out; 1035 goto out;
1048 start = last_end + 1; 1036 start = last_end + 1;
1037 if (start < end && state && state->start == start &&
1038 !need_resched())
1039 goto hit_next;
1049 } 1040 }
1050 goto search_again; 1041 goto search_again;
1051 } 1042 }
@@ -1173,9 +1164,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1173 cached_state, mask); 1164 cached_state, mask);
1174} 1165}
1175 1166
1176static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 1167int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1177 u64 end, struct extent_state **cached_state, 1168 struct extent_state **cached_state, gfp_t mask)
1178 gfp_t mask)
1179{ 1169{
1180 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 1170 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1181 cached_state, mask); 1171 cached_state, mask);
@@ -1293,7 +1283,7 @@ out:
1293 * returned if we find something, and *start_ret and *end_ret are 1283 * returned if we find something, and *start_ret and *end_ret are
1294 * set to reflect the state struct that was found. 1284 * set to reflect the state struct that was found.
1295 * 1285 *
1296 * If nothing was found, 1 is returned, < 0 on error 1286 * If nothing was found, 1 is returned. If found something, return 0.
1297 */ 1287 */
1298int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1288int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1299 u64 *start_ret, u64 *end_ret, int bits) 1289 u64 *start_ret, u64 *end_ret, int bits)
@@ -1923,6 +1913,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1923 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1913 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1924 /* try to remap that extent elsewhere? */ 1914 /* try to remap that extent elsewhere? */
1925 bio_put(bio); 1915 bio_put(bio);
1916 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
1926 return -EIO; 1917 return -EIO;
1927 } 1918 }
1928 1919
@@ -2222,17 +2213,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2222 uptodate = 0; 2213 uptodate = 0;
2223 } 2214 }
2224 2215
2225 if (!uptodate && tree->ops &&
2226 tree->ops->writepage_io_failed_hook) {
2227 ret = tree->ops->writepage_io_failed_hook(NULL, page,
2228 start, end, NULL);
2229 /* Writeback already completed */
2230 if (ret == 0)
2231 return 1;
2232 }
2233
2234 if (!uptodate) { 2216 if (!uptodate) {
2235 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
2236 ClearPageUptodate(page); 2217 ClearPageUptodate(page);
2237 SetPageError(page); 2218 SetPageError(page);
2238 } 2219 }
@@ -2347,10 +2328,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2347 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 2328 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2348 ret = tree->ops->readpage_end_io_hook(page, start, end, 2329 ret = tree->ops->readpage_end_io_hook(page, start, end,
2349 state, mirror); 2330 state, mirror);
2350 if (ret) 2331 if (ret) {
2332 /* no IO indicated but software detected errors
2333 * in the block, either checksum errors or
2334 * issues with the contents */
2335 struct btrfs_root *root =
2336 BTRFS_I(page->mapping->host)->root;
2337 struct btrfs_device *device;
2338
2351 uptodate = 0; 2339 uptodate = 0;
2352 else 2340 device = btrfs_find_device_for_logical(
2341 root, start, mirror);
2342 if (device)
2343 btrfs_dev_stat_inc_and_print(device,
2344 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2345 } else {
2353 clean_io_failure(start, page); 2346 clean_io_failure(start, page);
2347 }
2354 } 2348 }
2355 2349
2356 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { 2350 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
@@ -3164,7 +3158,7 @@ static int write_one_eb(struct extent_buffer *eb,
3164 u64 offset = eb->start; 3158 u64 offset = eb->start;
3165 unsigned long i, num_pages; 3159 unsigned long i, num_pages;
3166 int rw = (epd->sync_io ? WRITE_SYNC : WRITE); 3160 int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3167 int ret; 3161 int ret = 0;
3168 3162
3169 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3163 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3170 num_pages = num_extent_pages(eb->start, eb->len); 3164 num_pages = num_extent_pages(eb->start, eb->len);
@@ -3930,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3930 eb->start = start; 3924 eb->start = start;
3931 eb->len = len; 3925 eb->len = len;
3932 eb->tree = tree; 3926 eb->tree = tree;
3927 eb->bflags = 0;
3933 rwlock_init(&eb->lock); 3928 rwlock_init(&eb->lock);
3934 atomic_set(&eb->write_locks, 0); 3929 atomic_set(&eb->write_locks, 0);
3935 atomic_set(&eb->read_locks, 0); 3930 atomic_set(&eb->read_locks, 0);
@@ -3967,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3967 return eb; 3962 return eb;
3968} 3963}
3969 3964
3965struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
3966{
3967 unsigned long i;
3968 struct page *p;
3969 struct extent_buffer *new;
3970 unsigned long num_pages = num_extent_pages(src->start, src->len);
3971
3972 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
3973 if (new == NULL)
3974 return NULL;
3975
3976 for (i = 0; i < num_pages; i++) {
3977 p = alloc_page(GFP_ATOMIC);
3978 BUG_ON(!p);
3979 attach_extent_buffer_page(new, p);
3980 WARN_ON(PageDirty(p));
3981 SetPageUptodate(p);
3982 new->pages[i] = p;
3983 }
3984
3985 copy_extent_buffer(new, src, 0, 0, src->len);
3986 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
3987 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
3988
3989 return new;
3990}
3991
3992struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
3993{
3994 struct extent_buffer *eb;
3995 unsigned long num_pages = num_extent_pages(0, len);
3996 unsigned long i;
3997
3998 eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
3999 if (!eb)
4000 return NULL;
4001
4002 for (i = 0; i < num_pages; i++) {
4003 eb->pages[i] = alloc_page(GFP_ATOMIC);
4004 if (!eb->pages[i])
4005 goto err;
4006 }
4007 set_extent_buffer_uptodate(eb);
4008 btrfs_set_header_nritems(eb, 0);
4009 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4010
4011 return eb;
4012err:
4013 for (i--; i > 0; i--)
4014 __free_page(eb->pages[i]);
4015 __free_extent_buffer(eb);
4016 return NULL;
4017}
4018
3970static int extent_buffer_under_io(struct extent_buffer *eb) 4019static int extent_buffer_under_io(struct extent_buffer *eb)
3971{ 4020{
3972 return (atomic_read(&eb->io_pages) || 4021 return (atomic_read(&eb->io_pages) ||
@@ -3981,18 +4030,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3981 unsigned long start_idx) 4030 unsigned long start_idx)
3982{ 4031{
3983 unsigned long index; 4032 unsigned long index;
4033 unsigned long num_pages;
3984 struct page *page; 4034 struct page *page;
4035 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
3985 4036
3986 BUG_ON(extent_buffer_under_io(eb)); 4037 BUG_ON(extent_buffer_under_io(eb));
3987 4038
3988 index = num_extent_pages(eb->start, eb->len); 4039 num_pages = num_extent_pages(eb->start, eb->len);
4040 index = start_idx + num_pages;
3989 if (start_idx >= index) 4041 if (start_idx >= index)
3990 return; 4042 return;
3991 4043
3992 do { 4044 do {
3993 index--; 4045 index--;
3994 page = extent_buffer_page(eb, index); 4046 page = extent_buffer_page(eb, index);
3995 if (page) { 4047 if (page && mapped) {
3996 spin_lock(&page->mapping->private_lock); 4048 spin_lock(&page->mapping->private_lock);
3997 /* 4049 /*
3998 * We do this since we'll remove the pages after we've 4050 * We do this since we'll remove the pages after we've
@@ -4017,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4017 } 4069 }
4018 spin_unlock(&page->mapping->private_lock); 4070 spin_unlock(&page->mapping->private_lock);
4019 4071
4072 }
4073 if (page) {
4020 /* One for when we alloced the page */ 4074 /* One for when we alloced the page */
4021 page_cache_release(page); 4075 page_cache_release(page);
4022 } 4076 }
@@ -4235,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4235{ 4289{
4236 WARN_ON(atomic_read(&eb->refs) == 0); 4290 WARN_ON(atomic_read(&eb->refs) == 0);
4237 if (atomic_dec_and_test(&eb->refs)) { 4291 if (atomic_dec_and_test(&eb->refs)) {
4238 struct extent_io_tree *tree = eb->tree; 4292 if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
4293 spin_unlock(&eb->refs_lock);
4294 } else {
4295 struct extent_io_tree *tree = eb->tree;
4239 4296
4240 spin_unlock(&eb->refs_lock); 4297 spin_unlock(&eb->refs_lock);
4241 4298
4242 spin_lock(&tree->buffer_lock); 4299 spin_lock(&tree->buffer_lock);
4243 radix_tree_delete(&tree->buffer, 4300 radix_tree_delete(&tree->buffer,
4244 eb->start >> PAGE_CACHE_SHIFT); 4301 eb->start >> PAGE_CACHE_SHIFT);
4245 spin_unlock(&tree->buffer_lock); 4302 spin_unlock(&tree->buffer_lock);
4303 }
4246 4304
4247 /* Should be safe to release our pages at this point */ 4305 /* Should be safe to release our pages at this point */
4248 btrfs_release_extent_buffer_page(eb, 0); 4306 btrfs_release_extent_buffer_page(eb, 0);
@@ -4260,6 +4318,10 @@ void free_extent_buffer(struct extent_buffer *eb)
4260 4318
4261 spin_lock(&eb->refs_lock); 4319 spin_lock(&eb->refs_lock);
4262 if (atomic_read(&eb->refs) == 2 && 4320 if (atomic_read(&eb->refs) == 2 &&
4321 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
4322 atomic_dec(&eb->refs);
4323
4324 if (atomic_read(&eb->refs) == 2 &&
4263 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 4325 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
4264 !extent_buffer_under_io(eb) && 4326 !extent_buffer_under_io(eb) &&
4265 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4327 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b516c3b8dec6..25900af5b15d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -39,6 +39,7 @@
39#define EXTENT_BUFFER_STALE 6 39#define EXTENT_BUFFER_STALE 6
40#define EXTENT_BUFFER_WRITEBACK 7 40#define EXTENT_BUFFER_WRITEBACK 7
41#define EXTENT_BUFFER_IOERR 8 41#define EXTENT_BUFFER_IOERR 8
42#define EXTENT_BUFFER_DUMMY 9
42 43
43/* these are flags for extent_clear_unlock_delalloc */ 44/* these are flags for extent_clear_unlock_delalloc */
44#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 45#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -75,9 +76,6 @@ struct extent_io_ops {
75 unsigned long bio_flags); 76 unsigned long bio_flags);
76 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 77 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
77 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); 78 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
78 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
79 u64 start, u64 end,
80 struct extent_state *state);
81 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, 79 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
82 struct extent_state *state, int mirror); 80 struct extent_state *state, int mirror);
83 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 81 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
@@ -225,6 +223,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
225 struct extent_state **cached_state, gfp_t mask); 223 struct extent_state **cached_state, gfp_t mask);
226int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 224int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
227 struct extent_state **cached_state, gfp_t mask); 225 struct extent_state **cached_state, gfp_t mask);
226int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
227 struct extent_state **cached_state, gfp_t mask);
228int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 228int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
229 gfp_t mask); 229 gfp_t mask);
230int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 230int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -265,6 +265,8 @@ void set_page_extent_mapped(struct page *page);
265 265
266struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 266struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
267 u64 start, unsigned long len); 267 u64 start, unsigned long len);
268struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
269struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
268struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 270struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
269 u64 start, unsigned long len); 271 u64 start, unsigned long len);
270void free_extent_buffer(struct extent_buffer *eb); 272void free_extent_buffer(struct extent_buffer *eb);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 53bf2d764bbc..70dc8ca73e25 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -65,6 +65,21 @@ struct inode_defrag {
65 int cycled; 65 int cycled;
66}; 66};
67 67
68static int __compare_inode_defrag(struct inode_defrag *defrag1,
69 struct inode_defrag *defrag2)
70{
71 if (defrag1->root > defrag2->root)
72 return 1;
73 else if (defrag1->root < defrag2->root)
74 return -1;
75 else if (defrag1->ino > defrag2->ino)
76 return 1;
77 else if (defrag1->ino < defrag2->ino)
78 return -1;
79 else
80 return 0;
81}
82
68/* pop a record for an inode into the defrag tree. The lock 83/* pop a record for an inode into the defrag tree. The lock
69 * must be held already 84 * must be held already
70 * 85 *
@@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
81 struct inode_defrag *entry; 96 struct inode_defrag *entry;
82 struct rb_node **p; 97 struct rb_node **p;
83 struct rb_node *parent = NULL; 98 struct rb_node *parent = NULL;
99 int ret;
84 100
85 p = &root->fs_info->defrag_inodes.rb_node; 101 p = &root->fs_info->defrag_inodes.rb_node;
86 while (*p) { 102 while (*p) {
87 parent = *p; 103 parent = *p;
88 entry = rb_entry(parent, struct inode_defrag, rb_node); 104 entry = rb_entry(parent, struct inode_defrag, rb_node);
89 105
90 if (defrag->ino < entry->ino) 106 ret = __compare_inode_defrag(defrag, entry);
107 if (ret < 0)
91 p = &parent->rb_left; 108 p = &parent->rb_left;
92 else if (defrag->ino > entry->ino) 109 else if (ret > 0)
93 p = &parent->rb_right; 110 p = &parent->rb_right;
94 else { 111 else {
95 /* if we're reinserting an entry for 112 /* if we're reinserting an entry for
@@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
103 goto exists; 120 goto exists;
104 } 121 }
105 } 122 }
106 BTRFS_I(inode)->in_defrag = 1; 123 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
107 rb_link_node(&defrag->rb_node, parent, p); 124 rb_link_node(&defrag->rb_node, parent, p);
108 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 125 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
109 return; 126 return;
@@ -131,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
131 if (btrfs_fs_closing(root->fs_info)) 148 if (btrfs_fs_closing(root->fs_info))
132 return 0; 149 return 0;
133 150
134 if (BTRFS_I(inode)->in_defrag) 151 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
135 return 0; 152 return 0;
136 153
137 if (trans) 154 if (trans)
@@ -148,7 +165,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
148 defrag->root = root->root_key.objectid; 165 defrag->root = root->root_key.objectid;
149 166
150 spin_lock(&root->fs_info->defrag_inodes_lock); 167 spin_lock(&root->fs_info->defrag_inodes_lock);
151 if (!BTRFS_I(inode)->in_defrag) 168 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
152 __btrfs_add_inode_defrag(inode, defrag); 169 __btrfs_add_inode_defrag(inode, defrag);
153 else 170 else
154 kfree(defrag); 171 kfree(defrag);
@@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
159/* 176/*
160 * must be called with the defrag_inodes lock held 177 * must be called with the defrag_inodes lock held
161 */ 178 */
162struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, 179struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
180 u64 root, u64 ino,
163 struct rb_node **next) 181 struct rb_node **next)
164{ 182{
165 struct inode_defrag *entry = NULL; 183 struct inode_defrag *entry = NULL;
184 struct inode_defrag tmp;
166 struct rb_node *p; 185 struct rb_node *p;
167 struct rb_node *parent = NULL; 186 struct rb_node *parent = NULL;
187 int ret;
188
189 tmp.ino = ino;
190 tmp.root = root;
168 191
169 p = info->defrag_inodes.rb_node; 192 p = info->defrag_inodes.rb_node;
170 while (p) { 193 while (p) {
171 parent = p; 194 parent = p;
172 entry = rb_entry(parent, struct inode_defrag, rb_node); 195 entry = rb_entry(parent, struct inode_defrag, rb_node);
173 196
174 if (ino < entry->ino) 197 ret = __compare_inode_defrag(&tmp, entry);
198 if (ret < 0)
175 p = parent->rb_left; 199 p = parent->rb_left;
176 else if (ino > entry->ino) 200 else if (ret > 0)
177 p = parent->rb_right; 201 p = parent->rb_right;
178 else 202 else
179 return entry; 203 return entry;
180 } 204 }
181 205
182 if (next) { 206 if (next) {
183 while (parent && ino > entry->ino) { 207 while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
184 parent = rb_next(parent); 208 parent = rb_next(parent);
185 entry = rb_entry(parent, struct inode_defrag, rb_node); 209 entry = rb_entry(parent, struct inode_defrag, rb_node);
186 } 210 }
@@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
202 struct btrfs_key key; 226 struct btrfs_key key;
203 struct btrfs_ioctl_defrag_range_args range; 227 struct btrfs_ioctl_defrag_range_args range;
204 u64 first_ino = 0; 228 u64 first_ino = 0;
229 u64 root_objectid = 0;
205 int num_defrag; 230 int num_defrag;
206 int defrag_batch = 1024; 231 int defrag_batch = 1024;
207 232
@@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
214 n = NULL; 239 n = NULL;
215 240
216 /* find an inode to defrag */ 241 /* find an inode to defrag */
217 defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); 242 defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
243 first_ino, &n);
218 if (!defrag) { 244 if (!defrag) {
219 if (n) 245 if (n) {
220 defrag = rb_entry(n, struct inode_defrag, rb_node); 246 defrag = rb_entry(n, struct inode_defrag,
221 else if (first_ino) { 247 rb_node);
248 } else if (root_objectid || first_ino) {
249 root_objectid = 0;
222 first_ino = 0; 250 first_ino = 0;
223 continue; 251 continue;
224 } else { 252 } else {
@@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
228 256
229 /* remove it from the rbtree */ 257 /* remove it from the rbtree */
230 first_ino = defrag->ino + 1; 258 first_ino = defrag->ino + 1;
259 root_objectid = defrag->root;
231 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); 260 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
232 261
233 if (btrfs_fs_closing(fs_info)) 262 if (btrfs_fs_closing(fs_info))
@@ -252,7 +281,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
252 goto next; 281 goto next;
253 282
254 /* do a chunk of defrag */ 283 /* do a chunk of defrag */
255 BTRFS_I(inode)->in_defrag = 0; 284 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
256 range.start = defrag->last_offset; 285 range.start = defrag->last_offset;
257 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 286 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
258 defrag_batch); 287 defrag_batch);
@@ -1404,12 +1433,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1404 goto out; 1433 goto out;
1405 } 1434 }
1406 1435
1407 err = btrfs_update_time(file); 1436 err = file_update_time(file);
1408 if (err) { 1437 if (err) {
1409 mutex_unlock(&inode->i_mutex); 1438 mutex_unlock(&inode->i_mutex);
1410 goto out; 1439 goto out;
1411 } 1440 }
1412 BTRFS_I(inode)->sequence++;
1413 1441
1414 start_pos = round_down(pos, root->sectorsize); 1442 start_pos = round_down(pos, root->sectorsize);
1415 if (start_pos > i_size_read(inode)) { 1443 if (start_pos > i_size_read(inode)) {
@@ -1466,8 +1494,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1466 * flush down new bytes that may have been written if the 1494 * flush down new bytes that may have been written if the
1467 * application were using truncate to replace a file in place. 1495 * application were using truncate to replace a file in place.
1468 */ 1496 */
1469 if (BTRFS_I(inode)->ordered_data_close) { 1497 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1470 BTRFS_I(inode)->ordered_data_close = 0; 1498 &BTRFS_I(inode)->runtime_flags)) {
1471 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); 1499 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1472 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 1500 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1473 filemap_flush(inode->i_mapping); 1501 filemap_flush(inode->i_mapping);
@@ -1498,14 +1526,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1498 1526
1499 trace_btrfs_sync_file(file, datasync); 1527 trace_btrfs_sync_file(file, datasync);
1500 1528
1501 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1502 if (ret)
1503 return ret;
1504 mutex_lock(&inode->i_mutex); 1529 mutex_lock(&inode->i_mutex);
1505 1530
1506 /* we wait first, since the writeback may change the inode */ 1531 /*
1532 * we wait first, since the writeback may change the inode, also wait
1533 * ordered range does a filemape_write_and_wait_range which is why we
1534 * don't do it above like other file systems.
1535 */
1507 root->log_batch++; 1536 root->log_batch++;
1508 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1537 btrfs_wait_ordered_range(inode, start, end);
1509 root->log_batch++; 1538 root->log_batch++;
1510 1539
1511 /* 1540 /*
@@ -1523,7 +1552,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1523 * syncing 1552 * syncing
1524 */ 1553 */
1525 smp_mb(); 1554 smp_mb();
1526 if (BTRFS_I(inode)->last_trans <= 1555 if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1556 BTRFS_I(inode)->last_trans <=
1527 root->fs_info->last_trans_committed) { 1557 root->fs_info->last_trans_committed) {
1528 BTRFS_I(inode)->last_trans = 0; 1558 BTRFS_I(inode)->last_trans = 0;
1529 mutex_unlock(&inode->i_mutex); 1559 mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 202008ec367d..81296c57405a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -33,6 +33,8 @@
33 33
34static int link_free_space(struct btrfs_free_space_ctl *ctl, 34static int link_free_space(struct btrfs_free_space_ctl *ctl,
35 struct btrfs_free_space *info); 35 struct btrfs_free_space *info);
36static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
37 struct btrfs_free_space *info);
36 38
37static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 39static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
38 struct btrfs_path *path, 40 struct btrfs_path *path,
@@ -75,7 +77,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
75 return ERR_PTR(-ENOENT); 77 return ERR_PTR(-ENOENT);
76 } 78 }
77 79
78 inode->i_mapping->flags &= ~__GFP_FS; 80 mapping_set_gfp_mask(inode->i_mapping,
81 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
79 82
80 return inode; 83 return inode;
81} 84}
@@ -365,7 +368,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
365 368
366static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) 369static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
367{ 370{
368 u64 *val; 371 __le64 *val;
369 372
370 io_ctl_map_page(io_ctl, 1); 373 io_ctl_map_page(io_ctl, 1);
371 374
@@ -388,7 +391,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
388 391
389static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) 392static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
390{ 393{
391 u64 *gen; 394 __le64 *gen;
392 395
393 /* 396 /*
394 * Skip the crc area. If we don't check crcs then we just have a 64bit 397 * Skip the crc area. If we don't check crcs then we just have a 64bit
@@ -584,6 +587,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
584 return 0; 587 return 0;
585} 588}
586 589
590/*
591 * Since we attach pinned extents after the fact we can have contiguous sections
592 * of free space that are split up in entries. This poses a problem with the
593 * tree logging stuff since it could have allocated across what appears to be 2
594 * entries since we would have merged the entries when adding the pinned extents
595 * back to the free space cache. So run through the space cache that we just
596 * loaded and merge contiguous entries. This will make the log replay stuff not
597 * blow up and it will make for nicer allocator behavior.
598 */
599static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
600{
601 struct btrfs_free_space *e, *prev = NULL;
602 struct rb_node *n;
603
604again:
605 spin_lock(&ctl->tree_lock);
606 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
607 e = rb_entry(n, struct btrfs_free_space, offset_index);
608 if (!prev)
609 goto next;
610 if (e->bitmap || prev->bitmap)
611 goto next;
612 if (prev->offset + prev->bytes == e->offset) {
613 unlink_free_space(ctl, prev);
614 unlink_free_space(ctl, e);
615 prev->bytes += e->bytes;
616 kmem_cache_free(btrfs_free_space_cachep, e);
617 link_free_space(ctl, prev);
618 prev = NULL;
619 spin_unlock(&ctl->tree_lock);
620 goto again;
621 }
622next:
623 prev = e;
624 }
625 spin_unlock(&ctl->tree_lock);
626}
627
587int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 628int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
588 struct btrfs_free_space_ctl *ctl, 629 struct btrfs_free_space_ctl *ctl,
589 struct btrfs_path *path, u64 offset) 630 struct btrfs_path *path, u64 offset)
@@ -726,6 +767,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
726 } 767 }
727 768
728 io_ctl_drop_pages(&io_ctl); 769 io_ctl_drop_pages(&io_ctl);
770 merge_space_tree(ctl);
729 ret = 1; 771 ret = 1;
730out: 772out:
731 io_ctl_free(&io_ctl); 773 io_ctl_free(&io_ctl);
@@ -972,9 +1014,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
972 goto out; 1014 goto out;
973 1015
974 1016
975 ret = filemap_write_and_wait(inode->i_mapping); 1017 btrfs_wait_ordered_range(inode, 0, (u64)-1);
976 if (ret)
977 goto out;
978 1018
979 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1019 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
980 key.offset = offset; 1020 key.offset = offset;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ceb7b9c9edcc..f6ab6f5e635a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
89 89
90static int btrfs_setsize(struct inode *inode, loff_t newsize); 90static int btrfs_setsize(struct inode *inode, loff_t newsize);
91static int btrfs_truncate(struct inode *inode); 91static int btrfs_truncate(struct inode *inode);
92static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 92static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
93static noinline int cow_file_range(struct inode *inode, 93static noinline int cow_file_range(struct inode *inode,
94 struct page *locked_page, 94 struct page *locked_page,
95 u64 start, u64 end, int *page_started, 95 u64 start, u64 end, int *page_started,
@@ -257,10 +257,13 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
257 ret = insert_inline_extent(trans, root, inode, start, 257 ret = insert_inline_extent(trans, root, inode, start,
258 inline_len, compressed_size, 258 inline_len, compressed_size,
259 compress_type, compressed_pages); 259 compress_type, compressed_pages);
260 if (ret) { 260 if (ret && ret != -ENOSPC) {
261 btrfs_abort_transaction(trans, root, ret); 261 btrfs_abort_transaction(trans, root, ret);
262 return ret; 262 return ret;
263 } else if (ret == -ENOSPC) {
264 return 1;
263 } 265 }
266
264 btrfs_delalloc_release_metadata(inode, end + 1 - start); 267 btrfs_delalloc_release_metadata(inode, end + 1 - start);
265 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 268 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
266 return 0; 269 return 0;
@@ -1572,11 +1575,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1572 if (btrfs_is_free_space_inode(root, inode)) 1575 if (btrfs_is_free_space_inode(root, inode))
1573 metadata = 2; 1576 metadata = 2;
1574 1577
1575 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1576 if (ret)
1577 return ret;
1578
1579 if (!(rw & REQ_WRITE)) { 1578 if (!(rw & REQ_WRITE)) {
1579 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1580 if (ret)
1581 return ret;
1582
1580 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1583 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1581 return btrfs_submit_compressed_read(inode, bio, 1584 return btrfs_submit_compressed_read(inode, bio,
1582 mirror_num, bio_flags); 1585 mirror_num, bio_flags);
@@ -1815,25 +1818,24 @@ out:
1815 * an ordered extent if the range of bytes in the file it covers are 1818 * an ordered extent if the range of bytes in the file it covers are
1816 * fully written. 1819 * fully written.
1817 */ 1820 */
1818static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1821static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1819{ 1822{
1823 struct inode *inode = ordered_extent->inode;
1820 struct btrfs_root *root = BTRFS_I(inode)->root; 1824 struct btrfs_root *root = BTRFS_I(inode)->root;
1821 struct btrfs_trans_handle *trans = NULL; 1825 struct btrfs_trans_handle *trans = NULL;
1822 struct btrfs_ordered_extent *ordered_extent = NULL;
1823 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1826 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1824 struct extent_state *cached_state = NULL; 1827 struct extent_state *cached_state = NULL;
1825 int compress_type = 0; 1828 int compress_type = 0;
1826 int ret; 1829 int ret;
1827 bool nolock; 1830 bool nolock;
1828 1831
1829 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1830 end - start + 1);
1831 if (!ret)
1832 return 0;
1833 BUG_ON(!ordered_extent); /* Logic error */
1834
1835 nolock = btrfs_is_free_space_inode(root, inode); 1832 nolock = btrfs_is_free_space_inode(root, inode);
1836 1833
1834 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
1835 ret = -EIO;
1836 goto out;
1837 }
1838
1837 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1839 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1838 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1840 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
1839 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1841 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
@@ -1889,12 +1891,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1889 ordered_extent->file_offset, 1891 ordered_extent->file_offset,
1890 ordered_extent->len); 1892 ordered_extent->len);
1891 } 1893 }
1892 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1894
1893 ordered_extent->file_offset +
1894 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1895 if (ret < 0) { 1895 if (ret < 0) {
1896 btrfs_abort_transaction(trans, root, ret); 1896 btrfs_abort_transaction(trans, root, ret);
1897 goto out; 1897 goto out_unlock;
1898 } 1898 }
1899 1899
1900 add_pending_csums(trans, inode, ordered_extent->file_offset, 1900 add_pending_csums(trans, inode, ordered_extent->file_offset,
@@ -1905,10 +1905,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1905 ret = btrfs_update_inode_fallback(trans, root, inode); 1905 ret = btrfs_update_inode_fallback(trans, root, inode);
1906 if (ret) { /* -ENOMEM or corruption */ 1906 if (ret) { /* -ENOMEM or corruption */
1907 btrfs_abort_transaction(trans, root, ret); 1907 btrfs_abort_transaction(trans, root, ret);
1908 goto out; 1908 goto out_unlock;
1909 } 1909 }
1910 } 1910 }
1911 ret = 0; 1911 ret = 0;
1912out_unlock:
1913 unlock_extent_cached(io_tree, ordered_extent->file_offset,
1914 ordered_extent->file_offset +
1915 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1912out: 1916out:
1913 if (root != root->fs_info->tree_root) 1917 if (root != root->fs_info->tree_root)
1914 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1918 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
@@ -1919,26 +1923,57 @@ out:
1919 btrfs_end_transaction(trans, root); 1923 btrfs_end_transaction(trans, root);
1920 } 1924 }
1921 1925
1926 if (ret)
1927 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
1928 ordered_extent->file_offset +
1929 ordered_extent->len - 1, NULL, GFP_NOFS);
1930
1931 /*
1932 * This needs to be dont to make sure anybody waiting knows we are done
1933 * upating everything for this ordered extent.
1934 */
1935 btrfs_remove_ordered_extent(inode, ordered_extent);
1936
1922 /* once for us */ 1937 /* once for us */
1923 btrfs_put_ordered_extent(ordered_extent); 1938 btrfs_put_ordered_extent(ordered_extent);
1924 /* once for the tree */ 1939 /* once for the tree */
1925 btrfs_put_ordered_extent(ordered_extent); 1940 btrfs_put_ordered_extent(ordered_extent);
1926 1941
1927 return 0; 1942 return ret;
1928out_unlock: 1943}
1929 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1944
1930 ordered_extent->file_offset + 1945static void finish_ordered_fn(struct btrfs_work *work)
1931 ordered_extent->len - 1, &cached_state, GFP_NOFS); 1946{
1932 goto out; 1947 struct btrfs_ordered_extent *ordered_extent;
1948 ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
1949 btrfs_finish_ordered_io(ordered_extent);
1933} 1950}
1934 1951
1935static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1952static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1936 struct extent_state *state, int uptodate) 1953 struct extent_state *state, int uptodate)
1937{ 1954{
1955 struct inode *inode = page->mapping->host;
1956 struct btrfs_root *root = BTRFS_I(inode)->root;
1957 struct btrfs_ordered_extent *ordered_extent = NULL;
1958 struct btrfs_workers *workers;
1959
1938 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 1960 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
1939 1961
1940 ClearPagePrivate2(page); 1962 ClearPagePrivate2(page);
1941 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1963 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1964 end - start + 1, uptodate))
1965 return 0;
1966
1967 ordered_extent->work.func = finish_ordered_fn;
1968 ordered_extent->work.flags = 0;
1969
1970 if (btrfs_is_free_space_inode(root, inode))
1971 workers = &root->fs_info->endio_freespace_worker;
1972 else
1973 workers = &root->fs_info->endio_write_workers;
1974 btrfs_queue_worker(workers, &ordered_extent->work);
1975
1976 return 0;
1942} 1977}
1943 1978
1944/* 1979/*
@@ -2072,12 +2107,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2072 struct btrfs_block_rsv *block_rsv; 2107 struct btrfs_block_rsv *block_rsv;
2073 int ret; 2108 int ret;
2074 2109
2075 if (!list_empty(&root->orphan_list) || 2110 if (atomic_read(&root->orphan_inodes) ||
2076 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2111 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2077 return; 2112 return;
2078 2113
2079 spin_lock(&root->orphan_lock); 2114 spin_lock(&root->orphan_lock);
2080 if (!list_empty(&root->orphan_list)) { 2115 if (atomic_read(&root->orphan_inodes)) {
2081 spin_unlock(&root->orphan_lock); 2116 spin_unlock(&root->orphan_lock);
2082 return; 2117 return;
2083 } 2118 }
@@ -2134,8 +2169,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2134 block_rsv = NULL; 2169 block_rsv = NULL;
2135 } 2170 }
2136 2171
2137 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2172 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2138 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2173 &BTRFS_I(inode)->runtime_flags)) {
2139#if 0 2174#if 0
2140 /* 2175 /*
2141 * For proper ENOSPC handling, we should do orphan 2176 * For proper ENOSPC handling, we should do orphan
@@ -2148,12 +2183,12 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2148 insert = 1; 2183 insert = 1;
2149#endif 2184#endif
2150 insert = 1; 2185 insert = 1;
2186 atomic_dec(&root->orphan_inodes);
2151 } 2187 }
2152 2188
2153 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2189 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2154 BTRFS_I(inode)->orphan_meta_reserved = 1; 2190 &BTRFS_I(inode)->runtime_flags))
2155 reserve = 1; 2191 reserve = 1;
2156 }
2157 spin_unlock(&root->orphan_lock); 2192 spin_unlock(&root->orphan_lock);
2158 2193
2159 /* grab metadata reservation from transaction handle */ 2194 /* grab metadata reservation from transaction handle */
@@ -2166,6 +2201,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2166 if (insert >= 1) { 2201 if (insert >= 1) {
2167 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2202 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2168 if (ret && ret != -EEXIST) { 2203 if (ret && ret != -EEXIST) {
2204 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2205 &BTRFS_I(inode)->runtime_flags);
2169 btrfs_abort_transaction(trans, root, ret); 2206 btrfs_abort_transaction(trans, root, ret);
2170 return ret; 2207 return ret;
2171 } 2208 }
@@ -2196,15 +2233,13 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2196 int ret = 0; 2233 int ret = 0;
2197 2234
2198 spin_lock(&root->orphan_lock); 2235 spin_lock(&root->orphan_lock);
2199 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2236 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2200 list_del_init(&BTRFS_I(inode)->i_orphan); 2237 &BTRFS_I(inode)->runtime_flags))
2201 delete_item = 1; 2238 delete_item = 1;
2202 }
2203 2239
2204 if (BTRFS_I(inode)->orphan_meta_reserved) { 2240 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2205 BTRFS_I(inode)->orphan_meta_reserved = 0; 2241 &BTRFS_I(inode)->runtime_flags))
2206 release_rsv = 1; 2242 release_rsv = 1;
2207 }
2208 spin_unlock(&root->orphan_lock); 2243 spin_unlock(&root->orphan_lock);
2209 2244
2210 if (trans && delete_item) { 2245 if (trans && delete_item) {
@@ -2212,8 +2247,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2212 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2247 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
2213 } 2248 }
2214 2249
2215 if (release_rsv) 2250 if (release_rsv) {
2216 btrfs_orphan_release_metadata(inode); 2251 btrfs_orphan_release_metadata(inode);
2252 atomic_dec(&root->orphan_inodes);
2253 }
2217 2254
2218 return 0; 2255 return 0;
2219} 2256}
@@ -2341,6 +2378,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2341 ret = PTR_ERR(trans); 2378 ret = PTR_ERR(trans);
2342 goto out; 2379 goto out;
2343 } 2380 }
2381 printk(KERN_ERR "auto deleting %Lu\n",
2382 found_key.objectid);
2344 ret = btrfs_del_orphan_item(trans, root, 2383 ret = btrfs_del_orphan_item(trans, root,
2345 found_key.objectid); 2384 found_key.objectid);
2346 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2385 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
@@ -2352,9 +2391,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2352 * add this inode to the orphan list so btrfs_orphan_del does 2391 * add this inode to the orphan list so btrfs_orphan_del does
2353 * the proper thing when we hit it 2392 * the proper thing when we hit it
2354 */ 2393 */
2355 spin_lock(&root->orphan_lock); 2394 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2356 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2395 &BTRFS_I(inode)->runtime_flags);
2357 spin_unlock(&root->orphan_lock);
2358 2396
2359 /* if we have links, this was a truncate, lets do that */ 2397 /* if we have links, this was a truncate, lets do that */
2360 if (inode->i_nlink) { 2398 if (inode->i_nlink) {
@@ -2510,7 +2548,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
2510 2548
2511 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2549 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2512 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2550 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2513 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); 2551 inode->i_version = btrfs_inode_sequence(leaf, inode_item);
2514 inode->i_generation = BTRFS_I(inode)->generation; 2552 inode->i_generation = BTRFS_I(inode)->generation;
2515 inode->i_rdev = 0; 2553 inode->i_rdev = 0;
2516 rdev = btrfs_inode_rdev(leaf, inode_item); 2554 rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2594,7 +2632,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2594 2632
2595 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2633 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2596 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2634 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2597 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); 2635 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2598 btrfs_set_inode_transid(leaf, item, trans->transid); 2636 btrfs_set_inode_transid(leaf, item, trans->transid);
2599 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2637 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2600 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2638 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
@@ -2752,6 +2790,8 @@ err:
2752 goto out; 2790 goto out;
2753 2791
2754 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2792 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2793 inode_inc_iversion(inode);
2794 inode_inc_iversion(dir);
2755 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2795 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2756 btrfs_update_inode(trans, root, dir); 2796 btrfs_update_inode(trans, root, dir);
2757out: 2797out:
@@ -3089,6 +3129,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3089 } 3129 }
3090 3130
3091 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3131 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3132 inode_inc_iversion(dir);
3092 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3133 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3093 ret = btrfs_update_inode(trans, root, dir); 3134 ret = btrfs_update_inode(trans, root, dir);
3094 if (ret) 3135 if (ret)
@@ -3607,7 +3648,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3607 * any new writes get down to disk quickly. 3648 * any new writes get down to disk quickly.
3608 */ 3649 */
3609 if (newsize == 0) 3650 if (newsize == 0)
3610 BTRFS_I(inode)->ordered_data_close = 1; 3651 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
3652 &BTRFS_I(inode)->runtime_flags);
3611 3653
3612 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3654 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3613 truncate_setsize(inode, newsize); 3655 truncate_setsize(inode, newsize);
@@ -3638,6 +3680,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3638 3680
3639 if (attr->ia_valid) { 3681 if (attr->ia_valid) {
3640 setattr_copy(inode, attr); 3682 setattr_copy(inode, attr);
3683 inode_inc_iversion(inode);
3641 err = btrfs_dirty_inode(inode); 3684 err = btrfs_dirty_inode(inode);
3642 3685
3643 if (!err && attr->ia_valid & ATTR_MODE) 3686 if (!err && attr->ia_valid & ATTR_MODE)
@@ -3671,7 +3714,8 @@ void btrfs_evict_inode(struct inode *inode)
3671 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3714 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3672 3715
3673 if (root->fs_info->log_root_recovering) { 3716 if (root->fs_info->log_root_recovering) {
3674 BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); 3717 BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3718 &BTRFS_I(inode)->runtime_flags));
3675 goto no_delete; 3719 goto no_delete;
3676 } 3720 }
3677 3721
@@ -4066,7 +4110,7 @@ static struct inode *new_simple_dir(struct super_block *s,
4066 4110
4067 BTRFS_I(inode)->root = root; 4111 BTRFS_I(inode)->root = root;
4068 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4112 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
4069 BTRFS_I(inode)->dummy_inode = 1; 4113 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
4070 4114
4071 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4115 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
4072 inode->i_op = &btrfs_dir_ro_inode_operations; 4116 inode->i_op = &btrfs_dir_ro_inode_operations;
@@ -4370,7 +4414,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4370 int ret = 0; 4414 int ret = 0;
4371 bool nolock = false; 4415 bool nolock = false;
4372 4416
4373 if (BTRFS_I(inode)->dummy_inode) 4417 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4374 return 0; 4418 return 0;
4375 4419
4376 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) 4420 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
@@ -4403,7 +4447,7 @@ int btrfs_dirty_inode(struct inode *inode)
4403 struct btrfs_trans_handle *trans; 4447 struct btrfs_trans_handle *trans;
4404 int ret; 4448 int ret;
4405 4449
4406 if (BTRFS_I(inode)->dummy_inode) 4450 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4407 return 0; 4451 return 0;
4408 4452
4409 trans = btrfs_join_transaction(root); 4453 trans = btrfs_join_transaction(root);
@@ -4431,46 +4475,18 @@ int btrfs_dirty_inode(struct inode *inode)
4431 * This is a copy of file_update_time. We need this so we can return error on 4475 * This is a copy of file_update_time. We need this so we can return error on
4432 * ENOSPC for updating the inode in the case of file write and mmap writes. 4476 * ENOSPC for updating the inode in the case of file write and mmap writes.
4433 */ 4477 */
4434int btrfs_update_time(struct file *file) 4478static int btrfs_update_time(struct inode *inode, struct timespec *now,
4479 int flags)
4435{ 4480{
4436 struct inode *inode = file->f_path.dentry->d_inode; 4481 if (flags & S_VERSION)
4437 struct timespec now;
4438 int ret;
4439 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
4440
4441 /* First try to exhaust all avenues to not sync */
4442 if (IS_NOCMTIME(inode))
4443 return 0;
4444
4445 now = current_fs_time(inode->i_sb);
4446 if (!timespec_equal(&inode->i_mtime, &now))
4447 sync_it = S_MTIME;
4448
4449 if (!timespec_equal(&inode->i_ctime, &now))
4450 sync_it |= S_CTIME;
4451
4452 if (IS_I_VERSION(inode))
4453 sync_it |= S_VERSION;
4454
4455 if (!sync_it)
4456 return 0;
4457
4458 /* Finally allowed to write? Takes lock. */
4459 if (mnt_want_write_file(file))
4460 return 0;
4461
4462 /* Only change inode inside the lock region */
4463 if (sync_it & S_VERSION)
4464 inode_inc_iversion(inode); 4482 inode_inc_iversion(inode);
4465 if (sync_it & S_CTIME) 4483 if (flags & S_CTIME)
4466 inode->i_ctime = now; 4484 inode->i_ctime = *now;
4467 if (sync_it & S_MTIME) 4485 if (flags & S_MTIME)
4468 inode->i_mtime = now; 4486 inode->i_mtime = *now;
4469 ret = btrfs_dirty_inode(inode); 4487 if (flags & S_ATIME)
4470 if (!ret) 4488 inode->i_atime = *now;
4471 mark_inode_dirty_sync(inode); 4489 return btrfs_dirty_inode(inode);
4472 mnt_drop_write(file->f_path.mnt);
4473 return ret;
4474} 4490}
4475 4491
4476/* 4492/*
@@ -4730,6 +4746,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4730 4746
4731 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4747 btrfs_i_size_write(parent_inode, parent_inode->i_size +
4732 name_len * 2); 4748 name_len * 2);
4749 inode_inc_iversion(parent_inode);
4733 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4750 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
4734 ret = btrfs_update_inode(trans, root, parent_inode); 4751 ret = btrfs_update_inode(trans, root, parent_inode);
4735 if (ret) 4752 if (ret)
@@ -4937,6 +4954,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4937 } 4954 }
4938 4955
4939 btrfs_inc_nlink(inode); 4956 btrfs_inc_nlink(inode);
4957 inode_inc_iversion(inode);
4940 inode->i_ctime = CURRENT_TIME; 4958 inode->i_ctime = CURRENT_TIME;
4941 ihold(inode); 4959 ihold(inode);
4942 4960
@@ -5903,9 +5921,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5903 struct btrfs_dio_private *dip = bio->bi_private; 5921 struct btrfs_dio_private *dip = bio->bi_private;
5904 struct inode *inode = dip->inode; 5922 struct inode *inode = dip->inode;
5905 struct btrfs_root *root = BTRFS_I(inode)->root; 5923 struct btrfs_root *root = BTRFS_I(inode)->root;
5906 struct btrfs_trans_handle *trans;
5907 struct btrfs_ordered_extent *ordered = NULL; 5924 struct btrfs_ordered_extent *ordered = NULL;
5908 struct extent_state *cached_state = NULL;
5909 u64 ordered_offset = dip->logical_offset; 5925 u64 ordered_offset = dip->logical_offset;
5910 u64 ordered_bytes = dip->bytes; 5926 u64 ordered_bytes = dip->bytes;
5911 int ret; 5927 int ret;
@@ -5915,73 +5931,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5915again: 5931again:
5916 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 5932 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5917 &ordered_offset, 5933 &ordered_offset,
5918 ordered_bytes); 5934 ordered_bytes, !err);
5919 if (!ret) 5935 if (!ret)
5920 goto out_test; 5936 goto out_test;
5921 5937
5922 BUG_ON(!ordered); 5938 ordered->work.func = finish_ordered_fn;
5923 5939 ordered->work.flags = 0;
5924 trans = btrfs_join_transaction(root); 5940 btrfs_queue_worker(&root->fs_info->endio_write_workers,
5925 if (IS_ERR(trans)) { 5941 &ordered->work);
5926 err = -ENOMEM;
5927 goto out;
5928 }
5929 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5930
5931 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5932 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5933 if (!ret)
5934 err = btrfs_update_inode_fallback(trans, root, inode);
5935 goto out;
5936 }
5937
5938 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5939 ordered->file_offset + ordered->len - 1, 0,
5940 &cached_state);
5941
5942 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5943 ret = btrfs_mark_extent_written(trans, inode,
5944 ordered->file_offset,
5945 ordered->file_offset +
5946 ordered->len);
5947 if (ret) {
5948 err = ret;
5949 goto out_unlock;
5950 }
5951 } else {
5952 ret = insert_reserved_file_extent(trans, inode,
5953 ordered->file_offset,
5954 ordered->start,
5955 ordered->disk_len,
5956 ordered->len,
5957 ordered->len,
5958 0, 0, 0,
5959 BTRFS_FILE_EXTENT_REG);
5960 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5961 ordered->file_offset, ordered->len);
5962 if (ret) {
5963 err = ret;
5964 WARN_ON(1);
5965 goto out_unlock;
5966 }
5967 }
5968
5969 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5970 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5971 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5972 btrfs_update_inode_fallback(trans, root, inode);
5973 ret = 0;
5974out_unlock:
5975 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5976 ordered->file_offset + ordered->len - 1,
5977 &cached_state, GFP_NOFS);
5978out:
5979 btrfs_delalloc_release_metadata(inode, ordered->len);
5980 btrfs_end_transaction(trans, root);
5981 ordered_offset = ordered->file_offset + ordered->len;
5982 btrfs_put_ordered_extent(ordered);
5983 btrfs_put_ordered_extent(ordered);
5984
5985out_test: 5942out_test:
5986 /* 5943 /*
5987 * our bio might span multiple ordered extents. If we haven't 5944 * our bio might span multiple ordered extents. If we haven't
@@ -5990,12 +5947,12 @@ out_test:
5990 if (ordered_offset < dip->logical_offset + dip->bytes) { 5947 if (ordered_offset < dip->logical_offset + dip->bytes) {
5991 ordered_bytes = dip->logical_offset + dip->bytes - 5948 ordered_bytes = dip->logical_offset + dip->bytes -
5992 ordered_offset; 5949 ordered_offset;
5950 ordered = NULL;
5993 goto again; 5951 goto again;
5994 } 5952 }
5995out_done: 5953out_done:
5996 bio->bi_private = dip->private; 5954 bio->bi_private = dip->private;
5997 5955
5998 kfree(dip->csums);
5999 kfree(dip); 5956 kfree(dip);
6000 5957
6001 /* If we had an error make sure to clear the uptodate flag */ 5958 /* If we had an error make sure to clear the uptodate flag */
@@ -6063,9 +6020,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6063 int ret; 6020 int ret;
6064 6021
6065 bio_get(bio); 6022 bio_get(bio);
6066 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6023
6067 if (ret) 6024 if (!write) {
6068 goto err; 6025 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
6026 if (ret)
6027 goto err;
6028 }
6069 6029
6070 if (skip_sum) 6030 if (skip_sum)
6071 goto map; 6031 goto map;
@@ -6485,13 +6445,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
6485 6445
6486static void btrfs_invalidatepage(struct page *page, unsigned long offset) 6446static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6487{ 6447{
6448 struct inode *inode = page->mapping->host;
6488 struct extent_io_tree *tree; 6449 struct extent_io_tree *tree;
6489 struct btrfs_ordered_extent *ordered; 6450 struct btrfs_ordered_extent *ordered;
6490 struct extent_state *cached_state = NULL; 6451 struct extent_state *cached_state = NULL;
6491 u64 page_start = page_offset(page); 6452 u64 page_start = page_offset(page);
6492 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 6453 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
6493 6454
6494
6495 /* 6455 /*
6496 * we have the page locked, so new writeback can't start, 6456 * we have the page locked, so new writeback can't start,
6497 * and the dirty bit won't be cleared while we are here. 6457 * and the dirty bit won't be cleared while we are here.
@@ -6501,13 +6461,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6501 */ 6461 */
6502 wait_on_page_writeback(page); 6462 wait_on_page_writeback(page);
6503 6463
6504 tree = &BTRFS_I(page->mapping->host)->io_tree; 6464 tree = &BTRFS_I(inode)->io_tree;
6505 if (offset) { 6465 if (offset) {
6506 btrfs_releasepage(page, GFP_NOFS); 6466 btrfs_releasepage(page, GFP_NOFS);
6507 return; 6467 return;
6508 } 6468 }
6509 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 6469 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
6510 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 6470 ordered = btrfs_lookup_ordered_extent(inode,
6511 page_offset(page)); 6471 page_offset(page));
6512 if (ordered) { 6472 if (ordered) {
6513 /* 6473 /*
@@ -6522,9 +6482,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6522 * whoever cleared the private bit is responsible 6482 * whoever cleared the private bit is responsible
6523 * for the finish_ordered_io 6483 * for the finish_ordered_io
6524 */ 6484 */
6525 if (TestClearPagePrivate2(page)) { 6485 if (TestClearPagePrivate2(page) &&
6526 btrfs_finish_ordered_io(page->mapping->host, 6486 btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
6527 page_start, page_end); 6487 PAGE_CACHE_SIZE, 1)) {
6488 btrfs_finish_ordered_io(ordered);
6528 } 6489 }
6529 btrfs_put_ordered_extent(ordered); 6490 btrfs_put_ordered_extent(ordered);
6530 cached_state = NULL; 6491 cached_state = NULL;
@@ -6576,7 +6537,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6576 6537
6577 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6538 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6578 if (!ret) { 6539 if (!ret) {
6579 ret = btrfs_update_time(vma->vm_file); 6540 ret = file_update_time(vma->vm_file);
6580 reserved = 1; 6541 reserved = 1;
6581 } 6542 }
6582 if (ret) { 6543 if (ret) {
@@ -6771,7 +6732,8 @@ static int btrfs_truncate(struct inode *inode)
6771 * using truncate to replace the contents of the file will 6732 * using truncate to replace the contents of the file will
6772 * end up with a zero length file after a crash. 6733 * end up with a zero length file after a crash.
6773 */ 6734 */
6774 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) 6735 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
6736 &BTRFS_I(inode)->runtime_flags))
6775 btrfs_add_ordered_operation(trans, root, inode); 6737 btrfs_add_ordered_operation(trans, root, inode);
6776 6738
6777 while (1) { 6739 while (1) {
@@ -6894,7 +6856,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6894 ei->root = NULL; 6856 ei->root = NULL;
6895 ei->space_info = NULL; 6857 ei->space_info = NULL;
6896 ei->generation = 0; 6858 ei->generation = 0;
6897 ei->sequence = 0;
6898 ei->last_trans = 0; 6859 ei->last_trans = 0;
6899 ei->last_sub_trans = 0; 6860 ei->last_sub_trans = 0;
6900 ei->logged_trans = 0; 6861 ei->logged_trans = 0;
@@ -6909,11 +6870,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6909 ei->outstanding_extents = 0; 6870 ei->outstanding_extents = 0;
6910 ei->reserved_extents = 0; 6871 ei->reserved_extents = 0;
6911 6872
6912 ei->ordered_data_close = 0; 6873 ei->runtime_flags = 0;
6913 ei->orphan_meta_reserved = 0;
6914 ei->dummy_inode = 0;
6915 ei->in_defrag = 0;
6916 ei->delalloc_meta_reserved = 0;
6917 ei->force_compress = BTRFS_COMPRESS_NONE; 6874 ei->force_compress = BTRFS_COMPRESS_NONE;
6918 6875
6919 ei->delayed_node = NULL; 6876 ei->delayed_node = NULL;
@@ -6927,7 +6884,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6927 mutex_init(&ei->log_mutex); 6884 mutex_init(&ei->log_mutex);
6928 mutex_init(&ei->delalloc_mutex); 6885 mutex_init(&ei->delalloc_mutex);
6929 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6886 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6930 INIT_LIST_HEAD(&ei->i_orphan);
6931 INIT_LIST_HEAD(&ei->delalloc_inodes); 6887 INIT_LIST_HEAD(&ei->delalloc_inodes);
6932 INIT_LIST_HEAD(&ei->ordered_operations); 6888 INIT_LIST_HEAD(&ei->ordered_operations);
6933 RB_CLEAR_NODE(&ei->rb_node); 6889 RB_CLEAR_NODE(&ei->rb_node);
@@ -6972,13 +6928,12 @@ void btrfs_destroy_inode(struct inode *inode)
6972 spin_unlock(&root->fs_info->ordered_extent_lock); 6928 spin_unlock(&root->fs_info->ordered_extent_lock);
6973 } 6929 }
6974 6930
6975 spin_lock(&root->orphan_lock); 6931 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
6976 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6932 &BTRFS_I(inode)->runtime_flags)) {
6977 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", 6933 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
6978 (unsigned long long)btrfs_ino(inode)); 6934 (unsigned long long)btrfs_ino(inode));
6979 list_del_init(&BTRFS_I(inode)->i_orphan); 6935 atomic_dec(&root->orphan_inodes);
6980 } 6936 }
6981 spin_unlock(&root->orphan_lock);
6982 6937
6983 while (1) { 6938 while (1) {
6984 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6939 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -7193,6 +7148,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7193 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) 7148 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
7194 btrfs_add_ordered_operation(trans, root, old_inode); 7149 btrfs_add_ordered_operation(trans, root, old_inode);
7195 7150
7151 inode_inc_iversion(old_dir);
7152 inode_inc_iversion(new_dir);
7153 inode_inc_iversion(old_inode);
7196 old_dir->i_ctime = old_dir->i_mtime = ctime; 7154 old_dir->i_ctime = old_dir->i_mtime = ctime;
7197 new_dir->i_ctime = new_dir->i_mtime = ctime; 7155 new_dir->i_ctime = new_dir->i_mtime = ctime;
7198 old_inode->i_ctime = ctime; 7156 old_inode->i_ctime = ctime;
@@ -7219,6 +7177,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7219 } 7177 }
7220 7178
7221 if (new_inode) { 7179 if (new_inode) {
7180 inode_inc_iversion(new_inode);
7222 new_inode->i_ctime = CURRENT_TIME; 7181 new_inode->i_ctime = CURRENT_TIME;
7223 if (unlikely(btrfs_ino(new_inode) == 7182 if (unlikely(btrfs_ino(new_inode) ==
7224 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 7183 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -7490,6 +7449,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7490 cur_offset += ins.offset; 7449 cur_offset += ins.offset;
7491 *alloc_hint = ins.objectid + ins.offset; 7450 *alloc_hint = ins.objectid + ins.offset;
7492 7451
7452 inode_inc_iversion(inode);
7493 inode->i_ctime = CURRENT_TIME; 7453 inode->i_ctime = CURRENT_TIME;
7494 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 7454 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
7495 if (!(mode & FALLOC_FL_KEEP_SIZE) && 7455 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -7647,6 +7607,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
7647 .permission = btrfs_permission, 7607 .permission = btrfs_permission,
7648 .fiemap = btrfs_fiemap, 7608 .fiemap = btrfs_fiemap,
7649 .get_acl = btrfs_get_acl, 7609 .get_acl = btrfs_get_acl,
7610 .update_time = btrfs_update_time,
7650}; 7611};
7651static const struct inode_operations btrfs_special_inode_operations = { 7612static const struct inode_operations btrfs_special_inode_operations = {
7652 .getattr = btrfs_getattr, 7613 .getattr = btrfs_getattr,
@@ -7657,6 +7618,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
7657 .listxattr = btrfs_listxattr, 7618 .listxattr = btrfs_listxattr,
7658 .removexattr = btrfs_removexattr, 7619 .removexattr = btrfs_removexattr,
7659 .get_acl = btrfs_get_acl, 7620 .get_acl = btrfs_get_acl,
7621 .update_time = btrfs_update_time,
7660}; 7622};
7661static const struct inode_operations btrfs_symlink_inode_operations = { 7623static const struct inode_operations btrfs_symlink_inode_operations = {
7662 .readlink = generic_readlink, 7624 .readlink = generic_readlink,
@@ -7670,6 +7632,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7670 .listxattr = btrfs_listxattr, 7632 .listxattr = btrfs_listxattr,
7671 .removexattr = btrfs_removexattr, 7633 .removexattr = btrfs_removexattr,
7672 .get_acl = btrfs_get_acl, 7634 .get_acl = btrfs_get_acl,
7635 .update_time = btrfs_update_time,
7673}; 7636};
7674 7637
7675const struct dentry_operations btrfs_dentry_operations = { 7638const struct dentry_operations btrfs_dentry_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 14f8e1faa46e..24b776c08d99 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -261,6 +261,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
261 } 261 }
262 262
263 btrfs_update_iflags(inode); 263 btrfs_update_iflags(inode);
264 inode_inc_iversion(inode);
264 inode->i_ctime = CURRENT_TIME; 265 inode->i_ctime = CURRENT_TIME;
265 ret = btrfs_update_inode(trans, root, inode); 266 ret = btrfs_update_inode(trans, root, inode);
266 267
@@ -367,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root,
367 return PTR_ERR(trans); 368 return PTR_ERR(trans);
368 369
369 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 370 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
370 0, objectid, NULL, 0, 0, 0, 0); 371 0, objectid, NULL, 0, 0, 0);
371 if (IS_ERR(leaf)) { 372 if (IS_ERR(leaf)) {
372 ret = PTR_ERR(leaf); 373 ret = PTR_ERR(leaf);
373 goto fail; 374 goto fail;
@@ -2262,10 +2263,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2262 di_args->bytes_used = dev->bytes_used; 2263 di_args->bytes_used = dev->bytes_used;
2263 di_args->total_bytes = dev->total_bytes; 2264 di_args->total_bytes = dev->total_bytes;
2264 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 2265 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2265 if (dev->name) 2266 if (dev->name) {
2266 strncpy(di_args->path, dev->name, sizeof(di_args->path)); 2267 strncpy(di_args->path, dev->name, sizeof(di_args->path));
2267 else 2268 di_args->path[sizeof(di_args->path) - 1] = 0;
2269 } else {
2268 di_args->path[0] = '\0'; 2270 di_args->path[0] = '\0';
2271 }
2269 2272
2270out: 2273out:
2271 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) 2274 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
@@ -2622,6 +2625,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2622 btrfs_mark_buffer_dirty(leaf); 2625 btrfs_mark_buffer_dirty(leaf);
2623 btrfs_release_path(path); 2626 btrfs_release_path(path);
2624 2627
2628 inode_inc_iversion(inode);
2625 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2629 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2626 2630
2627 /* 2631 /*
@@ -2914,7 +2918,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2914 up_read(&info->groups_sem); 2918 up_read(&info->groups_sem);
2915 } 2919 }
2916 2920
2917 user_dest = (struct btrfs_ioctl_space_info *) 2921 user_dest = (struct btrfs_ioctl_space_info __user *)
2918 (arg + sizeof(struct btrfs_ioctl_space_args)); 2922 (arg + sizeof(struct btrfs_ioctl_space_args));
2919 2923
2920 if (copy_to_user(user_dest, dest_orig, alloc_size)) 2924 if (copy_to_user(user_dest, dest_orig, alloc_size))
@@ -3042,6 +3046,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
3042 return ret; 3046 return ret;
3043} 3047}
3044 3048
3049static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3050 void __user *arg, int reset_after_read)
3051{
3052 struct btrfs_ioctl_get_dev_stats *sa;
3053 int ret;
3054
3055 if (reset_after_read && !capable(CAP_SYS_ADMIN))
3056 return -EPERM;
3057
3058 sa = memdup_user(arg, sizeof(*sa));
3059 if (IS_ERR(sa))
3060 return PTR_ERR(sa);
3061
3062 ret = btrfs_get_dev_stats(root, sa, reset_after_read);
3063
3064 if (copy_to_user(arg, sa, sizeof(*sa)))
3065 ret = -EFAULT;
3066
3067 kfree(sa);
3068 return ret;
3069}
3070
3045static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3071static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3046{ 3072{
3047 int ret = 0; 3073 int ret = 0;
@@ -3212,8 +3238,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3212 } 3238 }
3213} 3239}
3214 3240
3215static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) 3241static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3216{ 3242{
3243 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3217 struct btrfs_fs_info *fs_info = root->fs_info; 3244 struct btrfs_fs_info *fs_info = root->fs_info;
3218 struct btrfs_ioctl_balance_args *bargs; 3245 struct btrfs_ioctl_balance_args *bargs;
3219 struct btrfs_balance_control *bctl; 3246 struct btrfs_balance_control *bctl;
@@ -3225,6 +3252,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
3225 if (fs_info->sb->s_flags & MS_RDONLY) 3252 if (fs_info->sb->s_flags & MS_RDONLY)
3226 return -EROFS; 3253 return -EROFS;
3227 3254
3255 ret = mnt_want_write(file->f_path.mnt);
3256 if (ret)
3257 return ret;
3258
3228 mutex_lock(&fs_info->volume_mutex); 3259 mutex_lock(&fs_info->volume_mutex);
3229 mutex_lock(&fs_info->balance_mutex); 3260 mutex_lock(&fs_info->balance_mutex);
3230 3261
@@ -3291,6 +3322,7 @@ out_bargs:
3291out: 3322out:
3292 mutex_unlock(&fs_info->balance_mutex); 3323 mutex_unlock(&fs_info->balance_mutex);
3293 mutex_unlock(&fs_info->volume_mutex); 3324 mutex_unlock(&fs_info->volume_mutex);
3325 mnt_drop_write(file->f_path.mnt);
3294 return ret; 3326 return ret;
3295} 3327}
3296 3328
@@ -3386,7 +3418,7 @@ long btrfs_ioctl(struct file *file, unsigned int
3386 case BTRFS_IOC_DEV_INFO: 3418 case BTRFS_IOC_DEV_INFO:
3387 return btrfs_ioctl_dev_info(root, argp); 3419 return btrfs_ioctl_dev_info(root, argp);
3388 case BTRFS_IOC_BALANCE: 3420 case BTRFS_IOC_BALANCE:
3389 return btrfs_ioctl_balance(root, NULL); 3421 return btrfs_ioctl_balance(file, NULL);
3390 case BTRFS_IOC_CLONE: 3422 case BTRFS_IOC_CLONE:
3391 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 3423 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
3392 case BTRFS_IOC_CLONE_RANGE: 3424 case BTRFS_IOC_CLONE_RANGE:
@@ -3419,11 +3451,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3419 case BTRFS_IOC_SCRUB_PROGRESS: 3451 case BTRFS_IOC_SCRUB_PROGRESS:
3420 return btrfs_ioctl_scrub_progress(root, argp); 3452 return btrfs_ioctl_scrub_progress(root, argp);
3421 case BTRFS_IOC_BALANCE_V2: 3453 case BTRFS_IOC_BALANCE_V2:
3422 return btrfs_ioctl_balance(root, argp); 3454 return btrfs_ioctl_balance(file, argp);
3423 case BTRFS_IOC_BALANCE_CTL: 3455 case BTRFS_IOC_BALANCE_CTL:
3424 return btrfs_ioctl_balance_ctl(root, arg); 3456 return btrfs_ioctl_balance_ctl(root, arg);
3425 case BTRFS_IOC_BALANCE_PROGRESS: 3457 case BTRFS_IOC_BALANCE_PROGRESS:
3426 return btrfs_ioctl_balance_progress(root, argp); 3458 return btrfs_ioctl_balance_progress(root, argp);
3459 case BTRFS_IOC_GET_DEV_STATS:
3460 return btrfs_ioctl_get_dev_stats(root, argp, 0);
3461 case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
3462 return btrfs_ioctl_get_dev_stats(root, argp, 1);
3427 } 3463 }
3428 3464
3429 return -ENOTTY; 3465 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 086e6bdae1c4..497c530724cf 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -266,6 +266,35 @@ struct btrfs_ioctl_logical_ino_args {
266 __u64 inodes; 266 __u64 inodes;
267}; 267};
268 268
269enum btrfs_dev_stat_values {
270 /* disk I/O failure stats */
271 BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
272 BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
273 BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
274
275 /* stats for indirect indications for I/O failures */
276 BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
277 * contents is illegal: this is an
278 * indication that the block was damaged
279 * during read or write, or written to
280 * wrong location or read from wrong
281 * location */
282 BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
283 * been written */
284
285 BTRFS_DEV_STAT_VALUES_MAX
286};
287
288struct btrfs_ioctl_get_dev_stats {
289 __u64 devid; /* in */
290 __u64 nr_items; /* in/out */
291
292 /* out values: */
293 __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
294
295 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
296};
297
269#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 298#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
270 struct btrfs_ioctl_vol_args) 299 struct btrfs_ioctl_vol_args)
271#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 300#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -330,5 +359,9 @@ struct btrfs_ioctl_logical_ino_args {
330 struct btrfs_ioctl_ino_path_args) 359 struct btrfs_ioctl_ino_path_args)
331#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 360#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
332 struct btrfs_ioctl_ino_path_args) 361 struct btrfs_ioctl_ino_path_args)
362#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
363 struct btrfs_ioctl_get_dev_stats)
364#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
365 struct btrfs_ioctl_get_dev_stats)
333 366
334#endif 367#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index bbf6d0d9aebe..9e138cdc36c5 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
196 entry->len = len; 196 entry->len = len;
197 entry->disk_len = disk_len; 197 entry->disk_len = disk_len;
198 entry->bytes_left = len; 198 entry->bytes_left = len;
199 entry->inode = inode; 199 entry->inode = igrab(inode);
200 entry->compress_type = compress_type; 200 entry->compress_type = compress_type;
201 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 201 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
202 set_bit(type, &entry->flags); 202 set_bit(type, &entry->flags);
@@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
212 212
213 trace_btrfs_ordered_extent_add(inode, entry); 213 trace_btrfs_ordered_extent_add(inode, entry);
214 214
215 spin_lock(&tree->lock); 215 spin_lock_irq(&tree->lock);
216 node = tree_insert(&tree->tree, file_offset, 216 node = tree_insert(&tree->tree, file_offset,
217 &entry->rb_node); 217 &entry->rb_node);
218 if (node) 218 if (node)
219 ordered_data_tree_panic(inode, -EEXIST, file_offset); 219 ordered_data_tree_panic(inode, -EEXIST, file_offset);
220 spin_unlock(&tree->lock); 220 spin_unlock_irq(&tree->lock);
221 221
222 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 222 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
223 list_add_tail(&entry->root_extent_list, 223 list_add_tail(&entry->root_extent_list,
@@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode,
264 struct btrfs_ordered_inode_tree *tree; 264 struct btrfs_ordered_inode_tree *tree;
265 265
266 tree = &BTRFS_I(inode)->ordered_tree; 266 tree = &BTRFS_I(inode)->ordered_tree;
267 spin_lock(&tree->lock); 267 spin_lock_irq(&tree->lock);
268 list_add_tail(&sum->list, &entry->list); 268 list_add_tail(&sum->list, &entry->list);
269 spin_unlock(&tree->lock); 269 spin_unlock_irq(&tree->lock);
270} 270}
271 271
272/* 272/*
@@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode,
283 */ 283 */
284int btrfs_dec_test_first_ordered_pending(struct inode *inode, 284int btrfs_dec_test_first_ordered_pending(struct inode *inode,
285 struct btrfs_ordered_extent **cached, 285 struct btrfs_ordered_extent **cached,
286 u64 *file_offset, u64 io_size) 286 u64 *file_offset, u64 io_size, int uptodate)
287{ 287{
288 struct btrfs_ordered_inode_tree *tree; 288 struct btrfs_ordered_inode_tree *tree;
289 struct rb_node *node; 289 struct rb_node *node;
290 struct btrfs_ordered_extent *entry = NULL; 290 struct btrfs_ordered_extent *entry = NULL;
291 int ret; 291 int ret;
292 unsigned long flags;
292 u64 dec_end; 293 u64 dec_end;
293 u64 dec_start; 294 u64 dec_start;
294 u64 to_dec; 295 u64 to_dec;
295 296
296 tree = &BTRFS_I(inode)->ordered_tree; 297 tree = &BTRFS_I(inode)->ordered_tree;
297 spin_lock(&tree->lock); 298 spin_lock_irqsave(&tree->lock, flags);
298 node = tree_search(tree, *file_offset); 299 node = tree_search(tree, *file_offset);
299 if (!node) { 300 if (!node) {
300 ret = 1; 301 ret = 1;
@@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
323 (unsigned long long)to_dec); 324 (unsigned long long)to_dec);
324 } 325 }
325 entry->bytes_left -= to_dec; 326 entry->bytes_left -= to_dec;
327 if (!uptodate)
328 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
329
326 if (entry->bytes_left == 0) 330 if (entry->bytes_left == 0)
327 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 331 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
328 else 332 else
@@ -332,7 +336,7 @@ out:
332 *cached = entry; 336 *cached = entry;
333 atomic_inc(&entry->refs); 337 atomic_inc(&entry->refs);
334 } 338 }
335 spin_unlock(&tree->lock); 339 spin_unlock_irqrestore(&tree->lock, flags);
336 return ret == 0; 340 return ret == 0;
337} 341}
338 342
@@ -347,15 +351,21 @@ out:
347 */ 351 */
348int btrfs_dec_test_ordered_pending(struct inode *inode, 352int btrfs_dec_test_ordered_pending(struct inode *inode,
349 struct btrfs_ordered_extent **cached, 353 struct btrfs_ordered_extent **cached,
350 u64 file_offset, u64 io_size) 354 u64 file_offset, u64 io_size, int uptodate)
351{ 355{
352 struct btrfs_ordered_inode_tree *tree; 356 struct btrfs_ordered_inode_tree *tree;
353 struct rb_node *node; 357 struct rb_node *node;
354 struct btrfs_ordered_extent *entry = NULL; 358 struct btrfs_ordered_extent *entry = NULL;
359 unsigned long flags;
355 int ret; 360 int ret;
356 361
357 tree = &BTRFS_I(inode)->ordered_tree; 362 tree = &BTRFS_I(inode)->ordered_tree;
358 spin_lock(&tree->lock); 363 spin_lock_irqsave(&tree->lock, flags);
364 if (cached && *cached) {
365 entry = *cached;
366 goto have_entry;
367 }
368
359 node = tree_search(tree, file_offset); 369 node = tree_search(tree, file_offset);
360 if (!node) { 370 if (!node) {
361 ret = 1; 371 ret = 1;
@@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
363 } 373 }
364 374
365 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 375 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
376have_entry:
366 if (!offset_in_entry(entry, file_offset)) { 377 if (!offset_in_entry(entry, file_offset)) {
367 ret = 1; 378 ret = 1;
368 goto out; 379 goto out;
@@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
374 (unsigned long long)io_size); 385 (unsigned long long)io_size);
375 } 386 }
376 entry->bytes_left -= io_size; 387 entry->bytes_left -= io_size;
388 if (!uptodate)
389 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
390
377 if (entry->bytes_left == 0) 391 if (entry->bytes_left == 0)
378 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 392 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
379 else 393 else
@@ -383,7 +397,7 @@ out:
383 *cached = entry; 397 *cached = entry;
384 atomic_inc(&entry->refs); 398 atomic_inc(&entry->refs);
385 } 399 }
386 spin_unlock(&tree->lock); 400 spin_unlock_irqrestore(&tree->lock, flags);
387 return ret == 0; 401 return ret == 0;
388} 402}
389 403
@@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
399 trace_btrfs_ordered_extent_put(entry->inode, entry); 413 trace_btrfs_ordered_extent_put(entry->inode, entry);
400 414
401 if (atomic_dec_and_test(&entry->refs)) { 415 if (atomic_dec_and_test(&entry->refs)) {
416 if (entry->inode)
417 btrfs_add_delayed_iput(entry->inode);
402 while (!list_empty(&entry->list)) { 418 while (!list_empty(&entry->list)) {
403 cur = entry->list.next; 419 cur = entry->list.next;
404 sum = list_entry(cur, struct btrfs_ordered_sum, list); 420 sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
411 427
412/* 428/*
413 * remove an ordered extent from the tree. No references are dropped 429 * remove an ordered extent from the tree. No references are dropped
414 * and you must wake_up entry->wait. You must hold the tree lock 430 * and waiters are woken up.
415 * while you call this function.
416 */ 431 */
417static void __btrfs_remove_ordered_extent(struct inode *inode, 432void btrfs_remove_ordered_extent(struct inode *inode,
418 struct btrfs_ordered_extent *entry) 433 struct btrfs_ordered_extent *entry)
419{ 434{
420 struct btrfs_ordered_inode_tree *tree; 435 struct btrfs_ordered_inode_tree *tree;
421 struct btrfs_root *root = BTRFS_I(inode)->root; 436 struct btrfs_root *root = BTRFS_I(inode)->root;
422 struct rb_node *node; 437 struct rb_node *node;
423 438
424 tree = &BTRFS_I(inode)->ordered_tree; 439 tree = &BTRFS_I(inode)->ordered_tree;
440 spin_lock_irq(&tree->lock);
425 node = &entry->rb_node; 441 node = &entry->rb_node;
426 rb_erase(node, &tree->tree); 442 rb_erase(node, &tree->tree);
427 tree->last = NULL; 443 tree->last = NULL;
428 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 444 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
445 spin_unlock_irq(&tree->lock);
429 446
430 spin_lock(&root->fs_info->ordered_extent_lock); 447 spin_lock(&root->fs_info->ordered_extent_lock);
431 list_del_init(&entry->root_extent_list); 448 list_del_init(&entry->root_extent_list);
@@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode,
442 list_del_init(&BTRFS_I(inode)->ordered_operations); 459 list_del_init(&BTRFS_I(inode)->ordered_operations);
443 } 460 }
444 spin_unlock(&root->fs_info->ordered_extent_lock); 461 spin_unlock(&root->fs_info->ordered_extent_lock);
445}
446
447/*
448 * remove an ordered extent from the tree. No references are dropped
449 * but any waiters are woken.
450 */
451void btrfs_remove_ordered_extent(struct inode *inode,
452 struct btrfs_ordered_extent *entry)
453{
454 struct btrfs_ordered_inode_tree *tree;
455
456 tree = &BTRFS_I(inode)->ordered_tree;
457 spin_lock(&tree->lock);
458 __btrfs_remove_ordered_extent(inode, entry);
459 spin_unlock(&tree->lock);
460 wake_up(&entry->wait); 462 wake_up(&entry->wait);
461} 463}
462 464
@@ -621,19 +623,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
621 if (orig_end > INT_LIMIT(loff_t)) 623 if (orig_end > INT_LIMIT(loff_t))
622 orig_end = INT_LIMIT(loff_t); 624 orig_end = INT_LIMIT(loff_t);
623 } 625 }
624again: 626
625 /* start IO across the range first to instantiate any delalloc 627 /* start IO across the range first to instantiate any delalloc
626 * extents 628 * extents
627 */ 629 */
628 filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 630 filemap_write_and_wait_range(inode->i_mapping, start, orig_end);
629
630 /* The compression code will leave pages locked but return from
631 * writepage without setting the page writeback. Starting again
632 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
633 */
634 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
635
636 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
637 631
638 end = orig_end; 632 end = orig_end;
639 found = 0; 633 found = 0;
@@ -657,11 +651,6 @@ again:
657 break; 651 break;
658 end--; 652 end--;
659 } 653 }
660 if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
661 EXTENT_DELALLOC, 0, NULL)) {
662 schedule_timeout(1);
663 goto again;
664 }
665} 654}
666 655
667/* 656/*
@@ -676,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
676 struct btrfs_ordered_extent *entry = NULL; 665 struct btrfs_ordered_extent *entry = NULL;
677 666
678 tree = &BTRFS_I(inode)->ordered_tree; 667 tree = &BTRFS_I(inode)->ordered_tree;
679 spin_lock(&tree->lock); 668 spin_lock_irq(&tree->lock);
680 node = tree_search(tree, file_offset); 669 node = tree_search(tree, file_offset);
681 if (!node) 670 if (!node)
682 goto out; 671 goto out;
@@ -687,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
687 if (entry) 676 if (entry)
688 atomic_inc(&entry->refs); 677 atomic_inc(&entry->refs);
689out: 678out:
690 spin_unlock(&tree->lock); 679 spin_unlock_irq(&tree->lock);
691 return entry; 680 return entry;
692} 681}
693 682
@@ -703,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
703 struct btrfs_ordered_extent *entry = NULL; 692 struct btrfs_ordered_extent *entry = NULL;
704 693
705 tree = &BTRFS_I(inode)->ordered_tree; 694 tree = &BTRFS_I(inode)->ordered_tree;
706 spin_lock(&tree->lock); 695 spin_lock_irq(&tree->lock);
707 node = tree_search(tree, file_offset); 696 node = tree_search(tree, file_offset);
708 if (!node) { 697 if (!node) {
709 node = tree_search(tree, file_offset + len); 698 node = tree_search(tree, file_offset + len);
@@ -728,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
728out: 717out:
729 if (entry) 718 if (entry)
730 atomic_inc(&entry->refs); 719 atomic_inc(&entry->refs);
731 spin_unlock(&tree->lock); 720 spin_unlock_irq(&tree->lock);
732 return entry; 721 return entry;
733} 722}
734 723
@@ -744,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
744 struct btrfs_ordered_extent *entry = NULL; 733 struct btrfs_ordered_extent *entry = NULL;
745 734
746 tree = &BTRFS_I(inode)->ordered_tree; 735 tree = &BTRFS_I(inode)->ordered_tree;
747 spin_lock(&tree->lock); 736 spin_lock_irq(&tree->lock);
748 node = tree_search(tree, file_offset); 737 node = tree_search(tree, file_offset);
749 if (!node) 738 if (!node)
750 goto out; 739 goto out;
@@ -752,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
752 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 741 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
753 atomic_inc(&entry->refs); 742 atomic_inc(&entry->refs);
754out: 743out:
755 spin_unlock(&tree->lock); 744 spin_unlock_irq(&tree->lock);
756 return entry; 745 return entry;
757} 746}
758 747
@@ -764,7 +753,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
764 struct btrfs_ordered_extent *ordered) 753 struct btrfs_ordered_extent *ordered)
765{ 754{
766 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 755 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
767 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
768 u64 disk_i_size; 756 u64 disk_i_size;
769 u64 new_i_size; 757 u64 new_i_size;
770 u64 i_size_test; 758 u64 i_size_test;
@@ -779,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
779 else 767 else
780 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 768 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
781 769
782 spin_lock(&tree->lock); 770 spin_lock_irq(&tree->lock);
783 disk_i_size = BTRFS_I(inode)->disk_i_size; 771 disk_i_size = BTRFS_I(inode)->disk_i_size;
784 772
785 /* truncate file */ 773 /* truncate file */
@@ -798,14 +786,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
798 } 786 }
799 787
800 /* 788 /*
801 * we can't update the disk_isize if there are delalloc bytes
802 * between disk_i_size and this ordered extent
803 */
804 if (test_range_bit(io_tree, disk_i_size, offset - 1,
805 EXTENT_DELALLOC, 0, NULL)) {
806 goto out;
807 }
808 /*
809 * walk backward from this ordered extent to disk_i_size. 789 * walk backward from this ordered extent to disk_i_size.
810 * if we find an ordered extent then we can't update disk i_size 790 * if we find an ordered extent then we can't update disk i_size
811 * yet 791 * yet
@@ -825,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
825 } 805 }
826 node = prev; 806 node = prev;
827 } 807 }
828 while (node) { 808 for (; node; node = rb_prev(node)) {
829 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 809 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
810
811 /* We treat this entry as if it doesnt exist */
812 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
813 continue;
830 if (test->file_offset + test->len <= disk_i_size) 814 if (test->file_offset + test->len <= disk_i_size)
831 break; 815 break;
832 if (test->file_offset >= i_size) 816 if (test->file_offset >= i_size)
833 break; 817 break;
834 if (test->file_offset >= disk_i_size) 818 if (test->file_offset >= disk_i_size)
835 goto out; 819 goto out;
836 node = rb_prev(node);
837 } 820 }
838 new_i_size = min_t(u64, offset, i_size); 821 new_i_size = min_t(u64, offset, i_size);
839 822
@@ -851,43 +834,49 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
851 else 834 else
852 node = rb_first(&tree->tree); 835 node = rb_first(&tree->tree);
853 } 836 }
854 i_size_test = 0; 837
855 if (node) { 838 /*
856 /* 839 * We are looking for an area between our current extent and the next
857 * do we have an area where IO might have finished 840 * ordered extent to update the i_size to. There are 3 cases here
858 * between our ordered extent and the next one. 841 *
859 */ 842 * 1) We don't actually have anything and we can update to i_size.
843 * 2) We have stuff but they already did their i_size update so again we
844 * can just update to i_size.
845 * 3) We have an outstanding ordered extent so the most we can update
846 * our disk_i_size to is the start of the next offset.
847 */
848 i_size_test = i_size;
849 for (; node; node = rb_next(node)) {
860 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 850 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
861 if (test->file_offset > offset) 851
852 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
853 continue;
854 if (test->file_offset > offset) {
862 i_size_test = test->file_offset; 855 i_size_test = test->file_offset;
863 } else { 856 break;
864 i_size_test = i_size; 857 }
865 } 858 }
866 859
867 /* 860 /*
868 * i_size_test is the end of a region after this ordered 861 * i_size_test is the end of a region after this ordered
869 * extent where there are no ordered extents. As long as there 862 * extent where there are no ordered extents, we can safely set
870 * are no delalloc bytes in this area, it is safe to update 863 * disk_i_size to this.
871 * disk_i_size to the end of the region.
872 */ 864 */
873 if (i_size_test > offset && 865 if (i_size_test > offset)
874 !test_range_bit(io_tree, offset, i_size_test - 1,
875 EXTENT_DELALLOC, 0, NULL)) {
876 new_i_size = min_t(u64, i_size_test, i_size); 866 new_i_size = min_t(u64, i_size_test, i_size);
877 }
878 BTRFS_I(inode)->disk_i_size = new_i_size; 867 BTRFS_I(inode)->disk_i_size = new_i_size;
879 ret = 0; 868 ret = 0;
880out: 869out:
881 /* 870 /*
882 * we need to remove the ordered extent with the tree lock held 871 * We need to do this because we can't remove ordered extents until
883 * so that other people calling this function don't find our fully 872 * after the i_disk_size has been updated and then the inode has been
884 * processed ordered entry and skip updating the i_size 873 * updated to reflect the change, so we need to tell anybody who finds
874 * this ordered extent that we've already done all the real work, we
875 * just haven't completed all the other work.
885 */ 876 */
886 if (ordered) 877 if (ordered)
887 __btrfs_remove_ordered_extent(inode, ordered); 878 set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
888 spin_unlock(&tree->lock); 879 spin_unlock_irq(&tree->lock);
889 if (ordered)
890 wake_up(&ordered->wait);
891 return ret; 880 return ret;
892} 881}
893 882
@@ -912,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
912 if (!ordered) 901 if (!ordered)
913 return 1; 902 return 1;
914 903
915 spin_lock(&tree->lock); 904 spin_lock_irq(&tree->lock);
916 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 905 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
917 if (disk_bytenr >= ordered_sum->bytenr) { 906 if (disk_bytenr >= ordered_sum->bytenr) {
918 num_sectors = ordered_sum->len / sectorsize; 907 num_sectors = ordered_sum->len / sectorsize;
@@ -927,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
927 } 916 }
928 } 917 }
929out: 918out:
930 spin_unlock(&tree->lock); 919 spin_unlock_irq(&tree->lock);
931 btrfs_put_ordered_extent(ordered); 920 btrfs_put_ordered_extent(ordered);
932 return ret; 921 return ret;
933} 922}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c355ad4dc1a6..e03c560d2997 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -74,6 +74,12 @@ struct btrfs_ordered_sum {
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ 75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76 76
77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
78
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent
80 * has done its due diligence in updating
81 * the isize. */
82
77struct btrfs_ordered_extent { 83struct btrfs_ordered_extent {
78 /* logical offset in the file */ 84 /* logical offset in the file */
79 u64 file_offset; 85 u64 file_offset;
@@ -113,6 +119,8 @@ struct btrfs_ordered_extent {
113 119
114 /* a per root list of all the pending ordered extents */ 120 /* a per root list of all the pending ordered extents */
115 struct list_head root_extent_list; 121 struct list_head root_extent_list;
122
123 struct btrfs_work work;
116}; 124};
117 125
118 126
@@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,
143 struct btrfs_ordered_extent *entry); 151 struct btrfs_ordered_extent *entry);
144int btrfs_dec_test_ordered_pending(struct inode *inode, 152int btrfs_dec_test_ordered_pending(struct inode *inode,
145 struct btrfs_ordered_extent **cached, 153 struct btrfs_ordered_extent **cached,
146 u64 file_offset, u64 io_size); 154 u64 file_offset, u64 io_size, int uptodate);
147int btrfs_dec_test_first_ordered_pending(struct inode *inode, 155int btrfs_dec_test_first_ordered_pending(struct inode *inode,
148 struct btrfs_ordered_extent **cached, 156 struct btrfs_ordered_extent **cached,
149 u64 *file_offset, u64 io_size); 157 u64 *file_offset, u64 io_size,
158 int uptodate);
150int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 159int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
151 u64 start, u64 len, u64 disk_len, int type); 160 u64 start, u64 len, u64 disk_len, int type);
152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 161int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f38e452486b8..5e23684887eb 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
294 btrfs_dev_extent_chunk_offset(l, dev_extent), 294 btrfs_dev_extent_chunk_offset(l, dev_extent),
295 (unsigned long long) 295 (unsigned long long)
296 btrfs_dev_extent_length(l, dev_extent)); 296 btrfs_dev_extent_length(l, dev_extent));
297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n");
299 break;
297 }; 300 };
298 } 301 }
299} 302}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ac5d01085884..48a4882d8ad5 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work)
718{ 718{
719 struct reada_machine_work *rmw; 719 struct reada_machine_work *rmw;
720 struct btrfs_fs_info *fs_info; 720 struct btrfs_fs_info *fs_info;
721 int old_ioprio;
721 722
722 rmw = container_of(work, struct reada_machine_work, work); 723 rmw = container_of(work, struct reada_machine_work, work);
723 fs_info = rmw->fs_info; 724 fs_info = rmw->fs_info;
724 725
725 kfree(rmw); 726 kfree(rmw);
726 727
728 old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
729 task_nice_ioprio(current));
730 set_task_ioprio(current, BTRFS_IOPRIO_READA);
727 __reada_start_machine(fs_info); 731 __reada_start_machine(fs_info);
732 set_task_ioprio(current, old_ioprio);
728} 733}
729 734
730static void __reada_start_machine(struct btrfs_fs_info *fs_info) 735static void __reada_start_machine(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2f3d6f917fb3..a38cfa4f251e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -50,7 +50,7 @@ struct scrub_dev;
50struct scrub_page { 50struct scrub_page {
51 struct scrub_block *sblock; 51 struct scrub_block *sblock;
52 struct page *page; 52 struct page *page;
53 struct block_device *bdev; 53 struct btrfs_device *dev;
54 u64 flags; /* extent flags */ 54 u64 flags; /* extent flags */
55 u64 generation; 55 u64 generation;
56 u64 logical; 56 u64 logical;
@@ -86,6 +86,7 @@ struct scrub_block {
86 unsigned int header_error:1; 86 unsigned int header_error:1;
87 unsigned int checksum_error:1; 87 unsigned int checksum_error:1;
88 unsigned int no_io_error_seen:1; 88 unsigned int no_io_error_seen:1;
89 unsigned int generation_error:1; /* also sets header_error */
89 }; 90 };
90}; 91};
91 92
@@ -675,6 +676,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
675 sdev->stat.read_errors++; 676 sdev->stat.read_errors++;
676 sdev->stat.uncorrectable_errors++; 677 sdev->stat.uncorrectable_errors++;
677 spin_unlock(&sdev->stat_lock); 678 spin_unlock(&sdev->stat_lock);
679 btrfs_dev_stat_inc_and_print(sdev->dev,
680 BTRFS_DEV_STAT_READ_ERRS);
678 goto out; 681 goto out;
679 } 682 }
680 683
@@ -686,6 +689,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
686 sdev->stat.read_errors++; 689 sdev->stat.read_errors++;
687 sdev->stat.uncorrectable_errors++; 690 sdev->stat.uncorrectable_errors++;
688 spin_unlock(&sdev->stat_lock); 691 spin_unlock(&sdev->stat_lock);
692 btrfs_dev_stat_inc_and_print(sdev->dev,
693 BTRFS_DEV_STAT_READ_ERRS);
689 goto out; 694 goto out;
690 } 695 }
691 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 696 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
@@ -699,6 +704,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
699 sdev->stat.read_errors++; 704 sdev->stat.read_errors++;
700 sdev->stat.uncorrectable_errors++; 705 sdev->stat.uncorrectable_errors++;
701 spin_unlock(&sdev->stat_lock); 706 spin_unlock(&sdev->stat_lock);
707 btrfs_dev_stat_inc_and_print(sdev->dev,
708 BTRFS_DEV_STAT_READ_ERRS);
702 goto out; 709 goto out;
703 } 710 }
704 711
@@ -725,12 +732,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
725 spin_unlock(&sdev->stat_lock); 732 spin_unlock(&sdev->stat_lock);
726 if (__ratelimit(&_rs)) 733 if (__ratelimit(&_rs))
727 scrub_print_warning("i/o error", sblock_to_check); 734 scrub_print_warning("i/o error", sblock_to_check);
735 btrfs_dev_stat_inc_and_print(sdev->dev,
736 BTRFS_DEV_STAT_READ_ERRS);
728 } else if (sblock_bad->checksum_error) { 737 } else if (sblock_bad->checksum_error) {
729 spin_lock(&sdev->stat_lock); 738 spin_lock(&sdev->stat_lock);
730 sdev->stat.csum_errors++; 739 sdev->stat.csum_errors++;
731 spin_unlock(&sdev->stat_lock); 740 spin_unlock(&sdev->stat_lock);
732 if (__ratelimit(&_rs)) 741 if (__ratelimit(&_rs))
733 scrub_print_warning("checksum error", sblock_to_check); 742 scrub_print_warning("checksum error", sblock_to_check);
743 btrfs_dev_stat_inc_and_print(sdev->dev,
744 BTRFS_DEV_STAT_CORRUPTION_ERRS);
734 } else if (sblock_bad->header_error) { 745 } else if (sblock_bad->header_error) {
735 spin_lock(&sdev->stat_lock); 746 spin_lock(&sdev->stat_lock);
736 sdev->stat.verify_errors++; 747 sdev->stat.verify_errors++;
@@ -738,6 +749,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
738 if (__ratelimit(&_rs)) 749 if (__ratelimit(&_rs))
739 scrub_print_warning("checksum/header error", 750 scrub_print_warning("checksum/header error",
740 sblock_to_check); 751 sblock_to_check);
752 if (sblock_bad->generation_error)
753 btrfs_dev_stat_inc_and_print(sdev->dev,
754 BTRFS_DEV_STAT_GENERATION_ERRS);
755 else
756 btrfs_dev_stat_inc_and_print(sdev->dev,
757 BTRFS_DEV_STAT_CORRUPTION_ERRS);
741 } 758 }
742 759
743 if (sdev->readonly) 760 if (sdev->readonly)
@@ -998,8 +1015,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
998 page = sblock->pagev + page_index; 1015 page = sblock->pagev + page_index;
999 page->logical = logical; 1016 page->logical = logical;
1000 page->physical = bbio->stripes[mirror_index].physical; 1017 page->physical = bbio->stripes[mirror_index].physical;
1001 /* for missing devices, bdev is NULL */ 1018 /* for missing devices, dev->bdev is NULL */
1002 page->bdev = bbio->stripes[mirror_index].dev->bdev; 1019 page->dev = bbio->stripes[mirror_index].dev;
1003 page->mirror_num = mirror_index + 1; 1020 page->mirror_num = mirror_index + 1;
1004 page->page = alloc_page(GFP_NOFS); 1021 page->page = alloc_page(GFP_NOFS);
1005 if (!page->page) { 1022 if (!page->page) {
@@ -1043,7 +1060,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1043 struct scrub_page *page = sblock->pagev + page_num; 1060 struct scrub_page *page = sblock->pagev + page_num;
1044 DECLARE_COMPLETION_ONSTACK(complete); 1061 DECLARE_COMPLETION_ONSTACK(complete);
1045 1062
1046 if (page->bdev == NULL) { 1063 if (page->dev->bdev == NULL) {
1047 page->io_error = 1; 1064 page->io_error = 1;
1048 sblock->no_io_error_seen = 0; 1065 sblock->no_io_error_seen = 0;
1049 continue; 1066 continue;
@@ -1053,7 +1070,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1053 bio = bio_alloc(GFP_NOFS, 1); 1070 bio = bio_alloc(GFP_NOFS, 1);
1054 if (!bio) 1071 if (!bio)
1055 return -EIO; 1072 return -EIO;
1056 bio->bi_bdev = page->bdev; 1073 bio->bi_bdev = page->dev->bdev;
1057 bio->bi_sector = page->physical >> 9; 1074 bio->bi_sector = page->physical >> 9;
1058 bio->bi_end_io = scrub_complete_bio_end_io; 1075 bio->bi_end_io = scrub_complete_bio_end_io;
1059 bio->bi_private = &complete; 1076 bio->bi_private = &complete;
@@ -1102,11 +1119,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1102 h = (struct btrfs_header *)mapped_buffer; 1119 h = (struct btrfs_header *)mapped_buffer;
1103 1120
1104 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1121 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
1105 generation != le64_to_cpu(h->generation) ||
1106 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1122 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1107 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1123 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1108 BTRFS_UUID_SIZE)) 1124 BTRFS_UUID_SIZE)) {
1109 sblock->header_error = 1; 1125 sblock->header_error = 1;
1126 } else if (generation != le64_to_cpu(h->generation)) {
1127 sblock->header_error = 1;
1128 sblock->generation_error = 1;
1129 }
1110 csum = h->csum; 1130 csum = h->csum;
1111 } else { 1131 } else {
1112 if (!have_csum) 1132 if (!have_csum)
@@ -1182,7 +1202,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1182 bio = bio_alloc(GFP_NOFS, 1); 1202 bio = bio_alloc(GFP_NOFS, 1);
1183 if (!bio) 1203 if (!bio)
1184 return -EIO; 1204 return -EIO;
1185 bio->bi_bdev = page_bad->bdev; 1205 bio->bi_bdev = page_bad->dev->bdev;
1186 bio->bi_sector = page_bad->physical >> 9; 1206 bio->bi_sector = page_bad->physical >> 9;
1187 bio->bi_end_io = scrub_complete_bio_end_io; 1207 bio->bi_end_io = scrub_complete_bio_end_io;
1188 bio->bi_private = &complete; 1208 bio->bi_private = &complete;
@@ -1196,6 +1216,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1196 1216
1197 /* this will also unplug the queue */ 1217 /* this will also unplug the queue */
1198 wait_for_completion(&complete); 1218 wait_for_completion(&complete);
1219 if (!bio_flagged(bio, BIO_UPTODATE)) {
1220 btrfs_dev_stat_inc_and_print(page_bad->dev,
1221 BTRFS_DEV_STAT_WRITE_ERRS);
1222 bio_put(bio);
1223 return -EIO;
1224 }
1199 bio_put(bio); 1225 bio_put(bio);
1200 } 1226 }
1201 1227
@@ -1352,7 +1378,8 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1352 u64 mapped_size; 1378 u64 mapped_size;
1353 void *p; 1379 void *p;
1354 u32 crc = ~(u32)0; 1380 u32 crc = ~(u32)0;
1355 int fail = 0; 1381 int fail_gen = 0;
1382 int fail_cor = 0;
1356 u64 len; 1383 u64 len;
1357 int index; 1384 int index;
1358 1385
@@ -1363,13 +1390,13 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1363 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1390 memcpy(on_disk_csum, s->csum, sdev->csum_size);
1364 1391
1365 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1392 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
1366 ++fail; 1393 ++fail_cor;
1367 1394
1368 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1395 if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
1369 ++fail; 1396 ++fail_gen;
1370 1397
1371 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1398 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1372 ++fail; 1399 ++fail_cor;
1373 1400
1374 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 1401 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1375 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1402 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1394,9 +1421,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1394 1421
1395 btrfs_csum_final(crc, calculated_csum); 1422 btrfs_csum_final(crc, calculated_csum);
1396 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1423 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
1397 ++fail; 1424 ++fail_cor;
1398 1425
1399 if (fail) { 1426 if (fail_cor + fail_gen) {
1400 /* 1427 /*
1401 * if we find an error in a super block, we just report it. 1428 * if we find an error in a super block, we just report it.
1402 * They will get written with the next transaction commit 1429 * They will get written with the next transaction commit
@@ -1405,9 +1432,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1405 spin_lock(&sdev->stat_lock); 1432 spin_lock(&sdev->stat_lock);
1406 ++sdev->stat.super_errors; 1433 ++sdev->stat.super_errors;
1407 spin_unlock(&sdev->stat_lock); 1434 spin_unlock(&sdev->stat_lock);
1435 if (fail_cor)
1436 btrfs_dev_stat_inc_and_print(sdev->dev,
1437 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1438 else
1439 btrfs_dev_stat_inc_and_print(sdev->dev,
1440 BTRFS_DEV_STAT_GENERATION_ERRS);
1408 } 1441 }
1409 1442
1410 return fail; 1443 return fail_cor + fail_gen;
1411} 1444}
1412 1445
1413static void scrub_block_get(struct scrub_block *sblock) 1446static void scrub_block_get(struct scrub_block *sblock)
@@ -1551,7 +1584,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1551 return -ENOMEM; 1584 return -ENOMEM;
1552 } 1585 }
1553 spage->sblock = sblock; 1586 spage->sblock = sblock;
1554 spage->bdev = sdev->dev->bdev; 1587 spage->dev = sdev->dev;
1555 spage->flags = flags; 1588 spage->flags = flags;
1556 spage->generation = gen; 1589 spage->generation = gen;
1557 spage->logical = logical; 1590 spage->logical = logical;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c5f8fca4195f..96eb9fef7bd2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -188,7 +188,8 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
188 va_start(args, fmt); 188 va_start(args, fmt);
189 189
190 if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { 190 if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
191 strncpy(lvl, fmt, 3); 191 memcpy(lvl, fmt, 3);
192 lvl[3] = '\0';
192 fmt += 3; 193 fmt += 3;
193 type = logtypes[fmt[1] - '0']; 194 type = logtypes[fmt[1] - '0'];
194 } else 195 } else
@@ -435,11 +436,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
435 case Opt_thread_pool: 436 case Opt_thread_pool:
436 intarg = 0; 437 intarg = 0;
437 match_int(&args[0], &intarg); 438 match_int(&args[0], &intarg);
438 if (intarg) { 439 if (intarg)
439 info->thread_pool_size = intarg; 440 info->thread_pool_size = intarg;
440 printk(KERN_INFO "btrfs: thread pool %d\n",
441 info->thread_pool_size);
442 }
443 break; 441 break;
444 case Opt_max_inline: 442 case Opt_max_inline:
445 num = match_strdup(&args[0]); 443 num = match_strdup(&args[0]);
@@ -769,7 +767,7 @@ static int btrfs_fill_super(struct super_block *sb,
769#ifdef CONFIG_BTRFS_FS_POSIX_ACL 767#ifdef CONFIG_BTRFS_FS_POSIX_ACL
770 sb->s_flags |= MS_POSIXACL; 768 sb->s_flags |= MS_POSIXACL;
771#endif 769#endif
772 770 sb->s_flags |= MS_I_VERSION;
773 err = open_ctree(sb, fs_devices, (char *)data); 771 err = open_ctree(sb, fs_devices, (char *)data);
774 if (err) { 772 if (err) {
775 printk("btrfs: open_ctree failed\n"); 773 printk("btrfs: open_ctree failed\n");
@@ -925,63 +923,48 @@ static inline int is_subvolume_inode(struct inode *inode)
925 */ 923 */
926static char *setup_root_args(char *args) 924static char *setup_root_args(char *args)
927{ 925{
928 unsigned copied = 0; 926 unsigned len = strlen(args) + 2 + 1;
929 unsigned len = strlen(args) + 2; 927 char *src, *dst, *buf;
930 char *pos;
931 char *ret;
932 928
933 /* 929 /*
934 * We need the same args as before, but minus 930 * We need the same args as before, but with this substitution:
935 * 931 * s!subvol=[^,]+!subvolid=0!
936 * subvol=a
937 *
938 * and add
939 *
940 * subvolid=0
941 * 932 *
942 * which is a difference of 2 characters, so we allocate strlen(args) + 933 * Since the replacement string is up to 2 bytes longer than the
943 * 2 characters. 934 * original, allocate strlen(args) + 2 + 1 bytes.
944 */ 935 */
945 ret = kzalloc(len * sizeof(char), GFP_NOFS);
946 if (!ret)
947 return NULL;
948 pos = strstr(args, "subvol=");
949 936
937 src = strstr(args, "subvol=");
950 /* This shouldn't happen, but just in case.. */ 938 /* This shouldn't happen, but just in case.. */
951 if (!pos) { 939 if (!src)
952 kfree(ret); 940 return NULL;
941
942 buf = dst = kmalloc(len, GFP_NOFS);
943 if (!buf)
953 return NULL; 944 return NULL;
954 }
955 945
956 /* 946 /*
957 * The subvol=<> arg is not at the front of the string, copy everybody 947 * If the subvol= arg is not at the start of the string,
958 * up to that into ret. 948 * copy whatever precedes it into buf.
959 */ 949 */
960 if (pos != args) { 950 if (src != args) {
961 *pos = '\0'; 951 *src++ = '\0';
962 strcpy(ret, args); 952 strcpy(buf, args);
963 copied += strlen(args); 953 dst += strlen(args);
964 pos++;
965 } 954 }
966 955
967 strncpy(ret + copied, "subvolid=0", len - copied); 956 strcpy(dst, "subvolid=0");
968 957 dst += strlen("subvolid=0");
969 /* Length of subvolid=0 */
970 copied += 10;
971 958
972 /* 959 /*
973 * If there is no , after the subvol= option then we know there's no 960 * If there is a "," after the original subvol=... string,
974 * other options and we can just return. 961 * copy that suffix into our buffer. Otherwise, we're done.
975 */ 962 */
976 pos = strchr(pos, ','); 963 src = strchr(src, ',');
977 if (!pos) 964 if (src)
978 return ret; 965 strcpy(dst, src);
979 966
980 /* Copy the rest of the arguments into our buffer */ 967 return buf;
981 strncpy(ret + copied, pos, len - copied);
982 copied += strlen(pos);
983
984 return ret;
985} 968}
986 969
987static struct dentry *mount_subvol(const char *subvol_name, int flags, 970static struct dentry *mount_subvol(const char *subvol_name, int flags,
@@ -1118,6 +1101,40 @@ error_fs_info:
1118 return ERR_PTR(error); 1101 return ERR_PTR(error);
1119} 1102}
1120 1103
1104static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
1105{
1106 spin_lock_irq(&workers->lock);
1107 workers->max_workers = new_limit;
1108 spin_unlock_irq(&workers->lock);
1109}
1110
1111static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1112 int new_pool_size, int old_pool_size)
1113{
1114 if (new_pool_size == old_pool_size)
1115 return;
1116
1117 fs_info->thread_pool_size = new_pool_size;
1118
1119 printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n",
1120 old_pool_size, new_pool_size);
1121
1122 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
1123 btrfs_set_max_workers(&fs_info->workers, new_pool_size);
1124 btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
1125 btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
1126 btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
1127 btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
1128 btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
1129 btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
1130 btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
1131 btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
1132 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1133 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1134 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1135 btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
1136}
1137
1121static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1138static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1122{ 1139{
1123 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 1140 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1137,6 +1154,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1137 goto restore; 1154 goto restore;
1138 } 1155 }
1139 1156
1157 btrfs_resize_thread_pool(fs_info,
1158 fs_info->thread_pool_size, old_thread_pool_size);
1159
1140 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1160 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1141 return 0; 1161 return 0;
1142 1162
@@ -1180,7 +1200,8 @@ restore:
1180 fs_info->compress_type = old_compress_type; 1200 fs_info->compress_type = old_compress_type;
1181 fs_info->max_inline = old_max_inline; 1201 fs_info->max_inline = old_max_inline;
1182 fs_info->alloc_start = old_alloc_start; 1202 fs_info->alloc_start = old_alloc_start;
1183 fs_info->thread_pool_size = old_thread_pool_size; 1203 btrfs_resize_thread_pool(fs_info,
1204 old_thread_pool_size, fs_info->thread_pool_size);
1184 fs_info->metadata_ratio = old_metadata_ratio; 1205 fs_info->metadata_ratio = old_metadata_ratio;
1185 return ret; 1206 return ret;
1186} 1207}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 36422254ef67..1791c6e3d834 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,6 +28,7 @@
28#include "locking.h" 28#include "locking.h"
29#include "tree-log.h" 29#include "tree-log.h"
30#include "inode-map.h" 30#include "inode-map.h"
31#include "volumes.h"
31 32
32#define BTRFS_ROOT_TRANS_TAG 0 33#define BTRFS_ROOT_TRANS_TAG 0
33 34
@@ -55,48 +56,49 @@ static noinline void switch_commit_root(struct btrfs_root *root)
55static noinline int join_transaction(struct btrfs_root *root, int nofail) 56static noinline int join_transaction(struct btrfs_root *root, int nofail)
56{ 57{
57 struct btrfs_transaction *cur_trans; 58 struct btrfs_transaction *cur_trans;
59 struct btrfs_fs_info *fs_info = root->fs_info;
58 60
59 spin_lock(&root->fs_info->trans_lock); 61 spin_lock(&fs_info->trans_lock);
60loop: 62loop:
61 /* The file system has been taken offline. No new transactions. */ 63 /* The file system has been taken offline. No new transactions. */
62 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 64 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
63 spin_unlock(&root->fs_info->trans_lock); 65 spin_unlock(&fs_info->trans_lock);
64 return -EROFS; 66 return -EROFS;
65 } 67 }
66 68
67 if (root->fs_info->trans_no_join) { 69 if (fs_info->trans_no_join) {
68 if (!nofail) { 70 if (!nofail) {
69 spin_unlock(&root->fs_info->trans_lock); 71 spin_unlock(&fs_info->trans_lock);
70 return -EBUSY; 72 return -EBUSY;
71 } 73 }
72 } 74 }
73 75
74 cur_trans = root->fs_info->running_transaction; 76 cur_trans = fs_info->running_transaction;
75 if (cur_trans) { 77 if (cur_trans) {
76 if (cur_trans->aborted) { 78 if (cur_trans->aborted) {
77 spin_unlock(&root->fs_info->trans_lock); 79 spin_unlock(&fs_info->trans_lock);
78 return cur_trans->aborted; 80 return cur_trans->aborted;
79 } 81 }
80 atomic_inc(&cur_trans->use_count); 82 atomic_inc(&cur_trans->use_count);
81 atomic_inc(&cur_trans->num_writers); 83 atomic_inc(&cur_trans->num_writers);
82 cur_trans->num_joined++; 84 cur_trans->num_joined++;
83 spin_unlock(&root->fs_info->trans_lock); 85 spin_unlock(&fs_info->trans_lock);
84 return 0; 86 return 0;
85 } 87 }
86 spin_unlock(&root->fs_info->trans_lock); 88 spin_unlock(&fs_info->trans_lock);
87 89
88 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 90 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
89 if (!cur_trans) 91 if (!cur_trans)
90 return -ENOMEM; 92 return -ENOMEM;
91 93
92 spin_lock(&root->fs_info->trans_lock); 94 spin_lock(&fs_info->trans_lock);
93 if (root->fs_info->running_transaction) { 95 if (fs_info->running_transaction) {
94 /* 96 /*
95 * someone started a transaction after we unlocked. Make sure 97 * someone started a transaction after we unlocked. Make sure
96 * to redo the trans_no_join checks above 98 * to redo the trans_no_join checks above
97 */ 99 */
98 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 100 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
99 cur_trans = root->fs_info->running_transaction; 101 cur_trans = fs_info->running_transaction;
100 goto loop; 102 goto loop;
101 } 103 }
102 104
@@ -121,20 +123,38 @@ loop:
121 cur_trans->delayed_refs.flushing = 0; 123 cur_trans->delayed_refs.flushing = 0;
122 cur_trans->delayed_refs.run_delayed_start = 0; 124 cur_trans->delayed_refs.run_delayed_start = 0;
123 cur_trans->delayed_refs.seq = 1; 125 cur_trans->delayed_refs.seq = 1;
126
127 /*
128 * although the tree mod log is per file system and not per transaction,
129 * the log must never go across transaction boundaries.
130 */
131 smp_mb();
132 if (!list_empty(&fs_info->tree_mod_seq_list)) {
133 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
134 "creating a fresh transaction\n");
135 WARN_ON(1);
136 }
137 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
138 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
139 "creating a fresh transaction\n");
140 WARN_ON(1);
141 }
142 atomic_set(&fs_info->tree_mod_seq, 0);
143
124 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); 144 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
125 spin_lock_init(&cur_trans->commit_lock); 145 spin_lock_init(&cur_trans->commit_lock);
126 spin_lock_init(&cur_trans->delayed_refs.lock); 146 spin_lock_init(&cur_trans->delayed_refs.lock);
127 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); 147 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
128 148
129 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 149 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
130 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 150 list_add_tail(&cur_trans->list, &fs_info->trans_list);
131 extent_io_tree_init(&cur_trans->dirty_pages, 151 extent_io_tree_init(&cur_trans->dirty_pages,
132 root->fs_info->btree_inode->i_mapping); 152 fs_info->btree_inode->i_mapping);
133 root->fs_info->generation++; 153 fs_info->generation++;
134 cur_trans->transid = root->fs_info->generation; 154 cur_trans->transid = fs_info->generation;
135 root->fs_info->running_transaction = cur_trans; 155 fs_info->running_transaction = cur_trans;
136 cur_trans->aborted = 0; 156 cur_trans->aborted = 0;
137 spin_unlock(&root->fs_info->trans_lock); 157 spin_unlock(&fs_info->trans_lock);
138 158
139 return 0; 159 return 0;
140} 160}
@@ -758,6 +778,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
758 if (ret) 778 if (ret)
759 return ret; 779 return ret;
760 780
781 ret = btrfs_run_dev_stats(trans, root->fs_info);
782 BUG_ON(ret);
783
761 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 784 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
762 next = fs_info->dirty_cowonly_roots.next; 785 next = fs_info->dirty_cowonly_roots.next;
763 list_del_init(next); 786 list_del_init(next);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index eb1ae908582c..2017d0ff511c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1628,7 +1628,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1628 int i; 1628 int i;
1629 int ret; 1629 int ret;
1630 1630
1631 btrfs_read_buffer(eb, gen); 1631 ret = btrfs_read_buffer(eb, gen);
1632 if (ret)
1633 return ret;
1632 1634
1633 level = btrfs_header_level(eb); 1635 level = btrfs_header_level(eb);
1634 1636
@@ -1749,7 +1751,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1749 1751
1750 path->slots[*level]++; 1752 path->slots[*level]++;
1751 if (wc->free) { 1753 if (wc->free) {
1752 btrfs_read_buffer(next, ptr_gen); 1754 ret = btrfs_read_buffer(next, ptr_gen);
1755 if (ret) {
1756 free_extent_buffer(next);
1757 return ret;
1758 }
1753 1759
1754 btrfs_tree_lock(next); 1760 btrfs_tree_lock(next);
1755 btrfs_set_lock_blocking(next); 1761 btrfs_set_lock_blocking(next);
@@ -1766,7 +1772,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1766 free_extent_buffer(next); 1772 free_extent_buffer(next);
1767 continue; 1773 continue;
1768 } 1774 }
1769 btrfs_read_buffer(next, ptr_gen); 1775 ret = btrfs_read_buffer(next, ptr_gen);
1776 if (ret) {
1777 free_extent_buffer(next);
1778 return ret;
1779 }
1770 1780
1771 WARN_ON(*level <= 0); 1781 WARN_ON(*level <= 0);
1772 if (path->nodes[*level-1]) 1782 if (path->nodes[*level-1])
@@ -2657,6 +2667,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2657 btrfs_release_path(path); 2667 btrfs_release_path(path);
2658 } 2668 }
2659 btrfs_release_path(path); 2669 btrfs_release_path(path);
2670 if (ret > 0)
2671 ret = 0;
2660 return ret; 2672 return ret;
2661} 2673}
2662 2674
@@ -3028,21 +3040,6 @@ out:
3028 return ret; 3040 return ret;
3029} 3041}
3030 3042
3031static int inode_in_log(struct btrfs_trans_handle *trans,
3032 struct inode *inode)
3033{
3034 struct btrfs_root *root = BTRFS_I(inode)->root;
3035 int ret = 0;
3036
3037 mutex_lock(&root->log_mutex);
3038 if (BTRFS_I(inode)->logged_trans == trans->transid &&
3039 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
3040 ret = 1;
3041 mutex_unlock(&root->log_mutex);
3042 return ret;
3043}
3044
3045
3046/* 3043/*
3047 * helper function around btrfs_log_inode to make sure newly created 3044 * helper function around btrfs_log_inode to make sure newly created
3048 * parent directories also end up in the log. A minimal inode and backref 3045 * parent directories also end up in the log. A minimal inode and backref
@@ -3083,7 +3080,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3083 if (ret) 3080 if (ret)
3084 goto end_no_trans; 3081 goto end_no_trans;
3085 3082
3086 if (inode_in_log(trans, inode)) { 3083 if (btrfs_inode_in_log(inode, trans->transid)) {
3087 ret = BTRFS_NO_LOG_SYNC; 3084 ret = BTRFS_NO_LOG_SYNC;
3088 goto end_no_trans; 3085 goto end_no_trans;
3089 } 3086 }
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 12f5147bd2b1..ab942f46b3dd 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -23,9 +23,9 @@
23 * 23 *
24 * ulist = ulist_alloc(); 24 * ulist = ulist_alloc();
25 * ulist_add(ulist, root); 25 * ulist_add(ulist, root);
26 * elem = NULL; 26 * ULIST_ITER_INIT(&uiter);
27 * 27 *
28 * while ((elem = ulist_next(ulist, elem)) { 28 * while ((elem = ulist_next(ulist, &uiter)) {
29 * for (all child nodes n in elem) 29 * for (all child nodes n in elem)
30 * ulist_add(ulist, n); 30 * ulist_add(ulist, n);
31 * do something useful with the node; 31 * do something useful with the node;
@@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit);
95 * 95 *
96 * The allocated ulist will be returned in an initialized state. 96 * The allocated ulist will be returned in an initialized state.
97 */ 97 */
98struct ulist *ulist_alloc(unsigned long gfp_mask) 98struct ulist *ulist_alloc(gfp_t gfp_mask)
99{ 99{
100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); 100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
101 101
@@ -144,13 +144,22 @@ EXPORT_SYMBOL(ulist_free);
144 * unaltered. 144 * unaltered.
145 */ 145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
147 unsigned long gfp_mask) 147 gfp_t gfp_mask)
148{
149 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
150}
151
152int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
153 unsigned long *old_aux, gfp_t gfp_mask)
148{ 154{
149 int i; 155 int i;
150 156
151 for (i = 0; i < ulist->nnodes; ++i) { 157 for (i = 0; i < ulist->nnodes; ++i) {
152 if (ulist->nodes[i].val == val) 158 if (ulist->nodes[i].val == val) {
159 if (old_aux)
160 *old_aux = ulist->nodes[i].aux;
153 return 0; 161 return 0;
162 }
154 } 163 }
155 164
156 if (ulist->nnodes >= ulist->nodes_alloced) { 165 if (ulist->nnodes >= ulist->nodes_alloced) {
@@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add);
188/** 197/**
189 * ulist_next - iterate ulist 198 * ulist_next - iterate ulist
190 * @ulist: ulist to iterate 199 * @ulist: ulist to iterate
191 * @prev: previously returned element or %NULL to start iteration 200 * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
192 * 201 *
193 * Note: locking must be provided by the caller. In case of rwlocks only read 202 * Note: locking must be provided by the caller. In case of rwlocks only read
194 * locking is needed 203 * locking is needed
195 * 204 *
196 * This function is used to iterate an ulist. The iteration is started with 205 * This function is used to iterate an ulist.
197 * @prev = %NULL. It returns the next element from the ulist or %NULL when the 206 * It returns the next element from the ulist or %NULL when the
198 * end is reached. No guarantee is made with respect to the order in which 207 * end is reached. No guarantee is made with respect to the order in which
199 * the elements are returned. They might neither be returned in order of 208 * the elements are returned. They might neither be returned in order of
200 * addition nor in ascending order. 209 * addition nor in ascending order.
201 * It is allowed to call ulist_add during an enumeration. Newly added items 210 * It is allowed to call ulist_add during an enumeration. Newly added items
202 * are guaranteed to show up in the running enumeration. 211 * are guaranteed to show up in the running enumeration.
203 */ 212 */
204struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) 213struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
205{ 214{
206 int next;
207
208 if (ulist->nnodes == 0) 215 if (ulist->nnodes == 0)
209 return NULL; 216 return NULL;
210 217 if (uiter->i < 0 || uiter->i >= ulist->nnodes)
211 if (!prev)
212 return &ulist->nodes[0];
213
214 next = (prev - ulist->nodes) + 1;
215 if (next < 0 || next >= ulist->nnodes)
216 return NULL; 218 return NULL;
217 219
218 return &ulist->nodes[next]; 220 return &ulist->nodes[uiter->i++];
219} 221}
220EXPORT_SYMBOL(ulist_next); 222EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 2e25dec58ec0..21bdc8ec8130 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -24,6 +24,10 @@
24 */ 24 */
25#define ULIST_SIZE 16 25#define ULIST_SIZE 16
26 26
27struct ulist_iterator {
28 int i;
29};
30
27/* 31/*
28 * element of the list 32 * element of the list
29 */ 33 */
@@ -59,10 +63,15 @@ struct ulist {
59void ulist_init(struct ulist *ulist); 63void ulist_init(struct ulist *ulist);
60void ulist_fini(struct ulist *ulist); 64void ulist_fini(struct ulist *ulist);
61void ulist_reinit(struct ulist *ulist); 65void ulist_reinit(struct ulist *ulist);
62struct ulist *ulist_alloc(unsigned long gfp_mask); 66struct ulist *ulist_alloc(gfp_t gfp_mask);
63void ulist_free(struct ulist *ulist); 67void ulist_free(struct ulist *ulist);
64int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 68int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
65 unsigned long gfp_mask); 69 gfp_t gfp_mask);
66struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); 70int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
71 unsigned long *old_aux, gfp_t gfp_mask);
72struct ulist_node *ulist_next(struct ulist *ulist,
73 struct ulist_iterator *uiter);
74
75#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)
67 76
68#endif 77#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1411b99555a4..7782020996fe 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h>
26#include <linux/kthread.h> 27#include <linux/kthread.h>
27#include <asm/div64.h> 28#include <asm/div64.h>
28#include "compat.h" 29#include "compat.h"
@@ -39,6 +40,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
39 struct btrfs_root *root, 40 struct btrfs_root *root,
40 struct btrfs_device *device); 41 struct btrfs_device *device);
41static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 42static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
43static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
44static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
42 45
43static DEFINE_MUTEX(uuid_mutex); 46static DEFINE_MUTEX(uuid_mutex);
44static LIST_HEAD(fs_uuids); 47static LIST_HEAD(fs_uuids);
@@ -361,6 +364,7 @@ static noinline int device_list_add(const char *path,
361 return -ENOMEM; 364 return -ENOMEM;
362 } 365 }
363 device->devid = devid; 366 device->devid = devid;
367 device->dev_stats_valid = 0;
364 device->work.func = pending_bios_fn; 368 device->work.func = pending_bios_fn;
365 memcpy(device->uuid, disk_super->dev_item.uuid, 369 memcpy(device->uuid, disk_super->dev_item.uuid,
366 BTRFS_UUID_SIZE); 370 BTRFS_UUID_SIZE);
@@ -1633,7 +1637,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1633 int ret = 0; 1637 int ret = 0;
1634 1638
1635 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1639 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1636 return -EINVAL; 1640 return -EROFS;
1637 1641
1638 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 1642 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1639 root->fs_info->bdev_holder); 1643 root->fs_info->bdev_holder);
@@ -4001,13 +4005,58 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4001 return 0; 4005 return 0;
4002} 4006}
4003 4007
4008static void *merge_stripe_index_into_bio_private(void *bi_private,
4009 unsigned int stripe_index)
4010{
4011 /*
4012 * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
4013 * at most 1.
4014 * The alternative solution (instead of stealing bits from the
4015 * pointer) would be to allocate an intermediate structure
4016 * that contains the old private pointer plus the stripe_index.
4017 */
4018 BUG_ON((((uintptr_t)bi_private) & 3) != 0);
4019 BUG_ON(stripe_index > 3);
4020 return (void *)(((uintptr_t)bi_private) | stripe_index);
4021}
4022
4023static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
4024{
4025 return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
4026}
4027
4028static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
4029{
4030 return (unsigned int)((uintptr_t)bi_private) & 3;
4031}
4032
4004static void btrfs_end_bio(struct bio *bio, int err) 4033static void btrfs_end_bio(struct bio *bio, int err)
4005{ 4034{
4006 struct btrfs_bio *bbio = bio->bi_private; 4035 struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
4007 int is_orig_bio = 0; 4036 int is_orig_bio = 0;
4008 4037
4009 if (err) 4038 if (err) {
4010 atomic_inc(&bbio->error); 4039 atomic_inc(&bbio->error);
4040 if (err == -EIO || err == -EREMOTEIO) {
4041 unsigned int stripe_index =
4042 extract_stripe_index_from_bio_private(
4043 bio->bi_private);
4044 struct btrfs_device *dev;
4045
4046 BUG_ON(stripe_index >= bbio->num_stripes);
4047 dev = bbio->stripes[stripe_index].dev;
4048 if (bio->bi_rw & WRITE)
4049 btrfs_dev_stat_inc(dev,
4050 BTRFS_DEV_STAT_WRITE_ERRS);
4051 else
4052 btrfs_dev_stat_inc(dev,
4053 BTRFS_DEV_STAT_READ_ERRS);
4054 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
4055 btrfs_dev_stat_inc(dev,
4056 BTRFS_DEV_STAT_FLUSH_ERRS);
4057 btrfs_dev_stat_print_on_error(dev);
4058 }
4059 }
4011 4060
4012 if (bio == bbio->orig_bio) 4061 if (bio == bbio->orig_bio)
4013 is_orig_bio = 1; 4062 is_orig_bio = 1;
@@ -4149,6 +4198,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4149 bio = first_bio; 4198 bio = first_bio;
4150 } 4199 }
4151 bio->bi_private = bbio; 4200 bio->bi_private = bbio;
4201 bio->bi_private = merge_stripe_index_into_bio_private(
4202 bio->bi_private, (unsigned int)dev_nr);
4152 bio->bi_end_io = btrfs_end_bio; 4203 bio->bi_end_io = btrfs_end_bio;
4153 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; 4204 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4154 dev = bbio->stripes[dev_nr].dev; 4205 dev = bbio->stripes[dev_nr].dev;
@@ -4509,6 +4560,28 @@ int btrfs_read_sys_array(struct btrfs_root *root)
4509 return ret; 4560 return ret;
4510} 4561}
4511 4562
4563struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
4564 u64 logical, int mirror_num)
4565{
4566 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4567 int ret;
4568 u64 map_length = 0;
4569 struct btrfs_bio *bbio = NULL;
4570 struct btrfs_device *device;
4571
4572 BUG_ON(mirror_num == 0);
4573 ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
4574 mirror_num);
4575 if (ret) {
4576 BUG_ON(bbio != NULL);
4577 return NULL;
4578 }
4579 BUG_ON(mirror_num != bbio->mirror_num);
4580 device = bbio->stripes[mirror_num - 1].dev;
4581 kfree(bbio);
4582 return device;
4583}
4584
4512int btrfs_read_chunk_tree(struct btrfs_root *root) 4585int btrfs_read_chunk_tree(struct btrfs_root *root)
4513{ 4586{
4514 struct btrfs_path *path; 4587 struct btrfs_path *path;
@@ -4583,3 +4656,230 @@ error:
4583 btrfs_free_path(path); 4656 btrfs_free_path(path);
4584 return ret; 4657 return ret;
4585} 4658}
4659
4660static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
4661{
4662 int i;
4663
4664 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4665 btrfs_dev_stat_reset(dev, i);
4666}
4667
4668int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
4669{
4670 struct btrfs_key key;
4671 struct btrfs_key found_key;
4672 struct btrfs_root *dev_root = fs_info->dev_root;
4673 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4674 struct extent_buffer *eb;
4675 int slot;
4676 int ret = 0;
4677 struct btrfs_device *device;
4678 struct btrfs_path *path = NULL;
4679 int i;
4680
4681 path = btrfs_alloc_path();
4682 if (!path) {
4683 ret = -ENOMEM;
4684 goto out;
4685 }
4686
4687 mutex_lock(&fs_devices->device_list_mutex);
4688 list_for_each_entry(device, &fs_devices->devices, dev_list) {
4689 int item_size;
4690 struct btrfs_dev_stats_item *ptr;
4691
4692 key.objectid = 0;
4693 key.type = BTRFS_DEV_STATS_KEY;
4694 key.offset = device->devid;
4695 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
4696 if (ret) {
4697 printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
4698 device->name, (unsigned long long)device->devid);
4699 __btrfs_reset_dev_stats(device);
4700 device->dev_stats_valid = 1;
4701 btrfs_release_path(path);
4702 continue;
4703 }
4704 slot = path->slots[0];
4705 eb = path->nodes[0];
4706 btrfs_item_key_to_cpu(eb, &found_key, slot);
4707 item_size = btrfs_item_size_nr(eb, slot);
4708
4709 ptr = btrfs_item_ptr(eb, slot,
4710 struct btrfs_dev_stats_item);
4711
4712 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4713 if (item_size >= (1 + i) * sizeof(__le64))
4714 btrfs_dev_stat_set(device, i,
4715 btrfs_dev_stats_value(eb, ptr, i));
4716 else
4717 btrfs_dev_stat_reset(device, i);
4718 }
4719
4720 device->dev_stats_valid = 1;
4721 btrfs_dev_stat_print_on_load(device);
4722 btrfs_release_path(path);
4723 }
4724 mutex_unlock(&fs_devices->device_list_mutex);
4725
4726out:
4727 btrfs_free_path(path);
4728 return ret < 0 ? ret : 0;
4729}
4730
4731static int update_dev_stat_item(struct btrfs_trans_handle *trans,
4732 struct btrfs_root *dev_root,
4733 struct btrfs_device *device)
4734{
4735 struct btrfs_path *path;
4736 struct btrfs_key key;
4737 struct extent_buffer *eb;
4738 struct btrfs_dev_stats_item *ptr;
4739 int ret;
4740 int i;
4741
4742 key.objectid = 0;
4743 key.type = BTRFS_DEV_STATS_KEY;
4744 key.offset = device->devid;
4745
4746 path = btrfs_alloc_path();
4747 BUG_ON(!path);
4748 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
4749 if (ret < 0) {
4750 printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
4751 ret, device->name);
4752 goto out;
4753 }
4754
4755 if (ret == 0 &&
4756 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
4757 /* need to delete old one and insert a new one */
4758 ret = btrfs_del_item(trans, dev_root, path);
4759 if (ret != 0) {
4760 printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
4761 device->name, ret);
4762 goto out;
4763 }
4764 ret = 1;
4765 }
4766
4767 if (ret == 1) {
4768 /* need to insert a new item */
4769 btrfs_release_path(path);
4770 ret = btrfs_insert_empty_item(trans, dev_root, path,
4771 &key, sizeof(*ptr));
4772 if (ret < 0) {
4773 printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
4774 device->name, ret);
4775 goto out;
4776 }
4777 }
4778
4779 eb = path->nodes[0];
4780 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
4781 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4782 btrfs_set_dev_stats_value(eb, ptr, i,
4783 btrfs_dev_stat_read(device, i));
4784 btrfs_mark_buffer_dirty(eb);
4785
4786out:
4787 btrfs_free_path(path);
4788 return ret;
4789}
4790
4791/*
4792 * called from commit_transaction. Writes all changed device stats to disk.
4793 */
4794int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
4795 struct btrfs_fs_info *fs_info)
4796{
4797 struct btrfs_root *dev_root = fs_info->dev_root;
4798 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4799 struct btrfs_device *device;
4800 int ret = 0;
4801
4802 mutex_lock(&fs_devices->device_list_mutex);
4803 list_for_each_entry(device, &fs_devices->devices, dev_list) {
4804 if (!device->dev_stats_valid || !device->dev_stats_dirty)
4805 continue;
4806
4807 ret = update_dev_stat_item(trans, dev_root, device);
4808 if (!ret)
4809 device->dev_stats_dirty = 0;
4810 }
4811 mutex_unlock(&fs_devices->device_list_mutex);
4812
4813 return ret;
4814}
4815
4816void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
4817{
4818 btrfs_dev_stat_inc(dev, index);
4819 btrfs_dev_stat_print_on_error(dev);
4820}
4821
4822void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
4823{
4824 if (!dev->dev_stats_valid)
4825 return;
4826 printk_ratelimited(KERN_ERR
4827 "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4828 dev->name,
4829 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
4830 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
4831 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
4832 btrfs_dev_stat_read(dev,
4833 BTRFS_DEV_STAT_CORRUPTION_ERRS),
4834 btrfs_dev_stat_read(dev,
4835 BTRFS_DEV_STAT_GENERATION_ERRS));
4836}
4837
4838static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4839{
4840 printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4841 dev->name,
4842 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
4843 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
4844 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
4845 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
4846 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
4847}
4848
4849int btrfs_get_dev_stats(struct btrfs_root *root,
4850 struct btrfs_ioctl_get_dev_stats *stats,
4851 int reset_after_read)
4852{
4853 struct btrfs_device *dev;
4854 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4855 int i;
4856
4857 mutex_lock(&fs_devices->device_list_mutex);
4858 dev = btrfs_find_device(root, stats->devid, NULL, NULL);
4859 mutex_unlock(&fs_devices->device_list_mutex);
4860
4861 if (!dev) {
4862 printk(KERN_WARNING
4863 "btrfs: get dev_stats failed, device not found\n");
4864 return -ENODEV;
4865 } else if (!dev->dev_stats_valid) {
4866 printk(KERN_WARNING
4867 "btrfs: get dev_stats failed, not yet valid\n");
4868 return -ENODEV;
4869 } else if (reset_after_read) {
4870 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4871 if (stats->nr_items > i)
4872 stats->values[i] =
4873 btrfs_dev_stat_read_and_reset(dev, i);
4874 else
4875 btrfs_dev_stat_reset(dev, i);
4876 }
4877 } else {
4878 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4879 if (stats->nr_items > i)
4880 stats->values[i] = btrfs_dev_stat_read(dev, i);
4881 }
4882 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
4883 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4884 return 0;
4885}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bb6b03f97aaa..3406a88ca83e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -22,6 +22,7 @@
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h> 23#include <linux/sort.h>
24#include "async-thread.h" 24#include "async-thread.h"
25#include "ioctl.h"
25 26
26#define BTRFS_STRIPE_LEN (64 * 1024) 27#define BTRFS_STRIPE_LEN (64 * 1024)
27 28
@@ -106,6 +107,11 @@ struct btrfs_device {
106 struct completion flush_wait; 107 struct completion flush_wait;
107 int nobarriers; 108 int nobarriers;
108 109
110 /* disk I/O failure stats. For detailed description refer to
111 * enum btrfs_dev_stat_values in ioctl.h */
112 int dev_stats_valid;
113 int dev_stats_dirty; /* counters need to be written to disk */
114 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
109}; 115};
110 116
111struct btrfs_fs_devices { 117struct btrfs_fs_devices {
@@ -281,4 +287,50 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
281int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 287int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
282int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 288int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
283 u64 *start, u64 *max_avail); 289 u64 *start, u64 *max_avail);
290struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
291 u64 logical, int mirror_num);
292void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
293void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
294int btrfs_get_dev_stats(struct btrfs_root *root,
295 struct btrfs_ioctl_get_dev_stats *stats,
296 int reset_after_read);
297int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
298int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
299 struct btrfs_fs_info *fs_info);
300
301static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
302 int index)
303{
304 atomic_inc(dev->dev_stat_values + index);
305 dev->dev_stats_dirty = 1;
306}
307
308static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
309 int index)
310{
311 return atomic_read(dev->dev_stat_values + index);
312}
313
314static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
315 int index)
316{
317 int ret;
318
319 ret = atomic_xchg(dev->dev_stat_values + index, 0);
320 dev->dev_stats_dirty = 1;
321 return ret;
322}
323
324static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
325 int index, unsigned long val)
326{
327 atomic_set(dev->dev_stat_values + index, val);
328 dev->dev_stats_dirty = 1;
329}
330
331static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
332 int index)
333{
334 btrfs_dev_stat_set(dev, index, 0);
335}
284#endif 336#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index e7a5659087e6..3f4e2d69e83a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
196 if (ret) 196 if (ret)
197 goto out; 197 goto out;
198 198
199 inode_inc_iversion(inode);
199 inode->i_ctime = CURRENT_TIME; 200 inode->i_ctime = CURRENT_TIME;
200 ret = btrfs_update_inode(trans, root, inode); 201 ret = btrfs_update_inode(trans, root, inode);
201 BUG_ON(ret); 202 BUG_ON(ret);
diff --git a/fs/buffer.c b/fs/buffer.c
index ad5938ca357c..838a9cf246bd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3152,7 +3152,7 @@ SYSCALL_DEFINE2(bdflush, int, func, long, data)
3152/* 3152/*
3153 * Buffer-head allocation 3153 * Buffer-head allocation
3154 */ 3154 */
3155static struct kmem_cache *bh_cachep; 3155static struct kmem_cache *bh_cachep __read_mostly;
3156 3156
3157/* 3157/*
3158 * Once the number of bh's in the machine exceeds this level, we start 3158 * Once the number of bh's in the machine exceeds this level, we start
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index fbb2a643ef10..8e1b60e557b6 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -40,38 +40,49 @@ struct ceph_nfs_confh {
40 u32 parent_name_hash; 40 u32 parent_name_hash;
41} __attribute__ ((packed)); 41} __attribute__ ((packed));
42 42
43static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, 43/*
44 int connectable) 44 * The presence of @parent_inode here tells us whether NFS wants a
 45 * connectable file handle. However, we want to make a connectable
46 * file handle unconditionally so that the MDS gets as much of a hint
 47 * as possible. That means we only use @parent_inode to indicate
48 * whether nfsd wants a connectable fh, and whether we should indicate
49 * failure from a too-small @max_len.
50 */
51static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
52 struct inode *parent_inode)
45{ 53{
46 int type; 54 int type;
47 struct ceph_nfs_fh *fh = (void *)rawfh; 55 struct ceph_nfs_fh *fh = (void *)rawfh;
48 struct ceph_nfs_confh *cfh = (void *)rawfh; 56 struct ceph_nfs_confh *cfh = (void *)rawfh;
49 struct dentry *parent;
50 struct inode *inode = dentry->d_inode;
51 int connected_handle_length = sizeof(*cfh)/4; 57 int connected_handle_length = sizeof(*cfh)/4;
52 int handle_length = sizeof(*fh)/4; 58 int handle_length = sizeof(*fh)/4;
59 struct dentry *dentry = d_find_alias(inode);
60 struct dentry *parent;
53 61
54 /* don't re-export snaps */ 62 /* don't re-export snaps */
55 if (ceph_snap(inode) != CEPH_NOSNAP) 63 if (ceph_snap(inode) != CEPH_NOSNAP)
56 return -EINVAL; 64 return -EINVAL;
57 65
58 spin_lock(&dentry->d_lock); 66 /* if we found an alias, generate a connectable fh */
59 parent = dentry->d_parent; 67 if (*max_len >= connected_handle_length && dentry) {
60 if (*max_len >= connected_handle_length) {
61 dout("encode_fh %p connectable\n", dentry); 68 dout("encode_fh %p connectable\n", dentry);
62 cfh->ino = ceph_ino(dentry->d_inode); 69 spin_lock(&dentry->d_lock);
70 parent = dentry->d_parent;
71 cfh->ino = ceph_ino(inode);
63 cfh->parent_ino = ceph_ino(parent->d_inode); 72 cfh->parent_ino = ceph_ino(parent->d_inode);
64 cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, 73 cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
65 dentry); 74 dentry);
66 *max_len = connected_handle_length; 75 *max_len = connected_handle_length;
67 type = 2; 76 type = 2;
77 spin_unlock(&dentry->d_lock);
68 } else if (*max_len >= handle_length) { 78 } else if (*max_len >= handle_length) {
69 if (connectable) { 79 if (parent_inode) {
80 /* nfsd wants connectable */
70 *max_len = connected_handle_length; 81 *max_len = connected_handle_length;
71 type = 255; 82 type = 255;
72 } else { 83 } else {
73 dout("encode_fh %p\n", dentry); 84 dout("encode_fh %p\n", dentry);
74 fh->ino = ceph_ino(dentry->d_inode); 85 fh->ino = ceph_ino(inode);
75 *max_len = handle_length; 86 *max_len = handle_length;
76 type = 1; 87 type = 1;
77 } 88 }
@@ -79,7 +90,6 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
79 *max_len = handle_length; 90 *max_len = handle_length;
80 type = 255; 91 type = 255;
81 } 92 }
82 spin_unlock(&dentry->d_lock);
83 return type; 93 return type;
84} 94}
85 95
diff --git a/fs/compat.c b/fs/compat.c
index 3adf3d4c2cd9..6161255fac45 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -871,12 +871,12 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
871{ 871{
872 int error; 872 int error;
873 struct file *file; 873 struct file *file;
874 int fput_needed;
874 struct compat_readdir_callback buf; 875 struct compat_readdir_callback buf;
875 876
876 error = -EBADF; 877 file = fget_light(fd, &fput_needed);
877 file = fget(fd);
878 if (!file) 878 if (!file)
879 goto out; 879 return -EBADF;
880 880
881 buf.result = 0; 881 buf.result = 0;
882 buf.dirent = dirent; 882 buf.dirent = dirent;
@@ -885,8 +885,7 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
885 if (buf.result) 885 if (buf.result)
886 error = buf.result; 886 error = buf.result;
887 887
888 fput(file); 888 fput_light(file, fput_needed);
889out:
890 return error; 889 return error;
891} 890}
892 891
@@ -953,16 +952,15 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
953 struct file * file; 952 struct file * file;
954 struct compat_linux_dirent __user * lastdirent; 953 struct compat_linux_dirent __user * lastdirent;
955 struct compat_getdents_callback buf; 954 struct compat_getdents_callback buf;
955 int fput_needed;
956 int error; 956 int error;
957 957
958 error = -EFAULT;
959 if (!access_ok(VERIFY_WRITE, dirent, count)) 958 if (!access_ok(VERIFY_WRITE, dirent, count))
960 goto out; 959 return -EFAULT;
961 960
962 error = -EBADF; 961 file = fget_light(fd, &fput_needed);
963 file = fget(fd);
964 if (!file) 962 if (!file)
965 goto out; 963 return -EBADF;
966 964
967 buf.current_dir = dirent; 965 buf.current_dir = dirent;
968 buf.previous = NULL; 966 buf.previous = NULL;
@@ -979,8 +977,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
979 else 977 else
980 error = count - buf.count; 978 error = count - buf.count;
981 } 979 }
982 fput(file); 980 fput_light(file, fput_needed);
983out:
984 return error; 981 return error;
985} 982}
986 983
@@ -1041,16 +1038,15 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
1041 struct file * file; 1038 struct file * file;
1042 struct linux_dirent64 __user * lastdirent; 1039 struct linux_dirent64 __user * lastdirent;
1043 struct compat_getdents_callback64 buf; 1040 struct compat_getdents_callback64 buf;
1041 int fput_needed;
1044 int error; 1042 int error;
1045 1043
1046 error = -EFAULT;
1047 if (!access_ok(VERIFY_WRITE, dirent, count)) 1044 if (!access_ok(VERIFY_WRITE, dirent, count))
1048 goto out; 1045 return -EFAULT;
1049 1046
1050 error = -EBADF; 1047 file = fget_light(fd, &fput_needed);
1051 file = fget(fd);
1052 if (!file) 1048 if (!file)
1053 goto out; 1049 return -EBADF;
1054 1050
1055 buf.current_dir = dirent; 1051 buf.current_dir = dirent;
1056 buf.previous = NULL; 1052 buf.previous = NULL;
@@ -1068,8 +1064,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
1068 else 1064 else
1069 error = count - buf.count; 1065 error = count - buf.count;
1070 } 1066 }
1071 fput(file); 1067 fput_light(file, fput_needed);
1072out:
1073 return error; 1068 return error;
1074} 1069}
1075#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ 1070#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
diff --git a/fs/dcache.c b/fs/dcache.c
index 4435d8b32904..85c9e2bff8e6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -683,8 +683,6 @@ EXPORT_SYMBOL(dget_parent);
683/** 683/**
684 * d_find_alias - grab a hashed alias of inode 684 * d_find_alias - grab a hashed alias of inode
685 * @inode: inode in question 685 * @inode: inode in question
686 * @want_discon: flag, used by d_splice_alias, to request
687 * that only a DISCONNECTED alias be returned.
688 * 686 *
689 * If inode has a hashed alias, or is a directory and has any alias, 687 * If inode has a hashed alias, or is a directory and has any alias,
690 * acquire the reference to alias and return it. Otherwise return NULL. 688 * acquire the reference to alias and return it. Otherwise return NULL.
@@ -693,10 +691,9 @@ EXPORT_SYMBOL(dget_parent);
693 * of a filesystem. 691 * of a filesystem.
694 * 692 *
695 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer 693 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
696 * any other hashed alias over that one unless @want_discon is set, 694 * any other hashed alias over that.
697 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
698 */ 695 */
699static struct dentry *__d_find_alias(struct inode *inode, int want_discon) 696static struct dentry *__d_find_alias(struct inode *inode)
700{ 697{
701 struct dentry *alias, *discon_alias; 698 struct dentry *alias, *discon_alias;
702 699
@@ -708,7 +705,7 @@ again:
708 if (IS_ROOT(alias) && 705 if (IS_ROOT(alias) &&
709 (alias->d_flags & DCACHE_DISCONNECTED)) { 706 (alias->d_flags & DCACHE_DISCONNECTED)) {
710 discon_alias = alias; 707 discon_alias = alias;
711 } else if (!want_discon) { 708 } else {
712 __dget_dlock(alias); 709 __dget_dlock(alias);
713 spin_unlock(&alias->d_lock); 710 spin_unlock(&alias->d_lock);
714 return alias; 711 return alias;
@@ -739,7 +736,7 @@ struct dentry *d_find_alias(struct inode *inode)
739 736
740 if (!list_empty(&inode->i_dentry)) { 737 if (!list_empty(&inode->i_dentry)) {
741 spin_lock(&inode->i_lock); 738 spin_lock(&inode->i_lock);
742 de = __d_find_alias(inode, 0); 739 de = __d_find_alias(inode);
743 spin_unlock(&inode->i_lock); 740 spin_unlock(&inode->i_lock);
744 } 741 }
745 return de; 742 return de;
@@ -1650,9 +1647,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1650 1647
1651 if (inode && S_ISDIR(inode->i_mode)) { 1648 if (inode && S_ISDIR(inode->i_mode)) {
1652 spin_lock(&inode->i_lock); 1649 spin_lock(&inode->i_lock);
1653 new = __d_find_alias(inode, 1); 1650 new = __d_find_any_alias(inode);
1654 if (new) { 1651 if (new) {
1655 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
1656 spin_unlock(&inode->i_lock); 1652 spin_unlock(&inode->i_lock);
1657 security_d_instantiate(new, inode); 1653 security_d_instantiate(new, inode);
1658 d_move(new, dentry); 1654 d_move(new, dentry);
@@ -2482,7 +2478,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2482 struct dentry *alias; 2478 struct dentry *alias;
2483 2479
2484 /* Does an aliased dentry already exist? */ 2480 /* Does an aliased dentry already exist? */
2485 alias = __d_find_alias(inode, 0); 2481 alias = __d_find_alias(inode);
2486 if (alias) { 2482 if (alias) {
2487 actual = alias; 2483 actual = alias;
2488 write_seqlock(&rename_lock); 2484 write_seqlock(&rename_lock);
@@ -2575,7 +2571,7 @@ static int prepend_path(const struct path *path,
2575 bool slash = false; 2571 bool slash = false;
2576 int error = 0; 2572 int error = 0;
2577 2573
2578 br_read_lock(vfsmount_lock); 2574 br_read_lock(&vfsmount_lock);
2579 while (dentry != root->dentry || vfsmnt != root->mnt) { 2575 while (dentry != root->dentry || vfsmnt != root->mnt) {
2580 struct dentry * parent; 2576 struct dentry * parent;
2581 2577
@@ -2606,7 +2602,7 @@ static int prepend_path(const struct path *path,
2606 error = prepend(buffer, buflen, "/", 1); 2602 error = prepend(buffer, buflen, "/", 1);
2607 2603
2608out: 2604out:
2609 br_read_unlock(vfsmount_lock); 2605 br_read_unlock(&vfsmount_lock);
2610 return error; 2606 return error;
2611 2607
2612global_root: 2608global_root:
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ab35b113003b..a07441a0a878 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -660,11 +660,10 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
660{ 660{
661 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 661 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
662 char *lower_buf; 662 char *lower_buf;
663 size_t lower_bufsiz = PATH_MAX;
664 mm_segment_t old_fs; 663 mm_segment_t old_fs;
665 int rc; 664 int rc;
666 665
667 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); 666 lower_buf = kmalloc(PATH_MAX, GFP_KERNEL);
668 if (!lower_buf) { 667 if (!lower_buf) {
669 rc = -ENOMEM; 668 rc = -ENOMEM;
670 goto out; 669 goto out;
@@ -673,58 +672,29 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
673 set_fs(get_ds()); 672 set_fs(get_ds());
674 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, 673 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
675 (char __user *)lower_buf, 674 (char __user *)lower_buf,
676 lower_bufsiz); 675 PATH_MAX);
677 set_fs(old_fs); 676 set_fs(old_fs);
678 if (rc < 0) 677 if (rc < 0)
679 goto out; 678 goto out;
680 lower_bufsiz = rc;
681 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry, 679 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
682 lower_buf, lower_bufsiz); 680 lower_buf, rc);
683out: 681out:
684 kfree(lower_buf); 682 kfree(lower_buf);
685 return rc; 683 return rc;
686} 684}
687 685
688static int 686static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
689ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
690{ 687{
691 char *kbuf; 688 char *buf;
692 size_t kbufsiz, copied; 689 size_t len = PATH_MAX;
693 int rc; 690 int rc;
694 691
695 rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz); 692 rc = ecryptfs_readlink_lower(dentry, &buf, &len);
696 if (rc) 693 if (rc)
697 goto out; 694 goto out;
698 copied = min_t(size_t, bufsiz, kbufsiz);
699 rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
700 kfree(kbuf);
701 fsstack_copy_attr_atime(dentry->d_inode, 695 fsstack_copy_attr_atime(dentry->d_inode,
702 ecryptfs_dentry_to_lower(dentry)->d_inode); 696 ecryptfs_dentry_to_lower(dentry)->d_inode);
703out: 697 buf[len] = '\0';
704 return rc;
705}
706
707static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
708{
709 char *buf;
710 int len = PAGE_SIZE, rc;
711 mm_segment_t old_fs;
712
713 /* Released in ecryptfs_put_link(); only release here on error */
714 buf = kmalloc(len, GFP_KERNEL);
715 if (!buf) {
716 buf = ERR_PTR(-ENOMEM);
717 goto out;
718 }
719 old_fs = get_fs();
720 set_fs(get_ds());
721 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
722 set_fs(old_fs);
723 if (rc < 0) {
724 kfree(buf);
725 buf = ERR_PTR(rc);
726 } else
727 buf[rc] = '\0';
728out: 698out:
729 nd_set_link(nd, buf); 699 nd_set_link(nd, buf);
730 return NULL; 700 return NULL;
@@ -1153,7 +1123,7 @@ out:
1153} 1123}
1154 1124
1155const struct inode_operations ecryptfs_symlink_iops = { 1125const struct inode_operations ecryptfs_symlink_iops = {
1156 .readlink = ecryptfs_readlink, 1126 .readlink = generic_readlink,
1157 .follow_link = ecryptfs_follow_link, 1127 .follow_link = ecryptfs_follow_link,
1158 .put_link = ecryptfs_put_link, 1128 .put_link = ecryptfs_put_link,
1159 .permission = ecryptfs_permission, 1129 .permission = ecryptfs_permission,
diff --git a/fs/exec.c b/fs/exec.c
index 52c9e2ff6e6b..a79786a8d2c8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -280,10 +280,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
280 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 280 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
281 INIT_LIST_HEAD(&vma->anon_vma_chain); 281 INIT_LIST_HEAD(&vma->anon_vma_chain);
282 282
283 err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
284 if (err)
285 goto err;
286
287 err = insert_vm_struct(mm, vma); 283 err = insert_vm_struct(mm, vma);
288 if (err) 284 if (err)
289 goto err; 285 goto err;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index b05acb796135..b0201ca6e9c6 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -304,24 +304,23 @@ out:
304 304
305/** 305/**
306 * export_encode_fh - default export_operations->encode_fh function 306 * export_encode_fh - default export_operations->encode_fh function
307 * @dentry: the dentry to encode 307 * @inode: the object to encode
308 * @fh: where to store the file handle fragment 308 * @fh: where to store the file handle fragment
309 * @max_len: maximum length to store there 309 * @max_len: maximum length to store there
310 * @connectable: whether to store parent information 310 * @parent: parent directory inode, if wanted
311 * 311 *
312 * This default encode_fh function assumes that the 32 inode number 312 * This default encode_fh function assumes that the 32 inode number
313 * is suitable for locating an inode, and that the generation number 313 * is suitable for locating an inode, and that the generation number
314 * can be used to check that it is still valid. It places them in the 314 * can be used to check that it is still valid. It places them in the
315 * filehandle fragment where export_decode_fh expects to find them. 315 * filehandle fragment where export_decode_fh expects to find them.
316 */ 316 */
317static int export_encode_fh(struct dentry *dentry, struct fid *fid, 317static int export_encode_fh(struct inode *inode, struct fid *fid,
318 int *max_len, int connectable) 318 int *max_len, struct inode *parent)
319{ 319{
320 struct inode * inode = dentry->d_inode;
321 int len = *max_len; 320 int len = *max_len;
322 int type = FILEID_INO32_GEN; 321 int type = FILEID_INO32_GEN;
323 322
324 if (connectable && (len < 4)) { 323 if (parent && (len < 4)) {
325 *max_len = 4; 324 *max_len = 4;
326 return 255; 325 return 255;
327 } else if (len < 2) { 326 } else if (len < 2) {
@@ -332,14 +331,9 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
332 len = 2; 331 len = 2;
333 fid->i32.ino = inode->i_ino; 332 fid->i32.ino = inode->i_ino;
334 fid->i32.gen = inode->i_generation; 333 fid->i32.gen = inode->i_generation;
335 if (connectable && !S_ISDIR(inode->i_mode)) { 334 if (parent) {
336 struct inode *parent;
337
338 spin_lock(&dentry->d_lock);
339 parent = dentry->d_parent->d_inode;
340 fid->i32.parent_ino = parent->i_ino; 335 fid->i32.parent_ino = parent->i_ino;
341 fid->i32.parent_gen = parent->i_generation; 336 fid->i32.parent_gen = parent->i_generation;
342 spin_unlock(&dentry->d_lock);
343 len = 4; 337 len = 4;
344 type = FILEID_INO32_GEN_PARENT; 338 type = FILEID_INO32_GEN_PARENT;
345 } 339 }
@@ -352,11 +346,22 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
352{ 346{
353 const struct export_operations *nop = dentry->d_sb->s_export_op; 347 const struct export_operations *nop = dentry->d_sb->s_export_op;
354 int error; 348 int error;
349 struct dentry *p = NULL;
350 struct inode *inode = dentry->d_inode, *parent = NULL;
355 351
352 if (connectable && !S_ISDIR(inode->i_mode)) {
353 p = dget_parent(dentry);
354 /*
355 * note that while p might've ceased to be our parent already,
356 * it's still pinned by and still positive.
357 */
358 parent = p->d_inode;
359 }
356 if (nop->encode_fh) 360 if (nop->encode_fh)
357 error = nop->encode_fh(dentry, fid->raw, max_len, connectable); 361 error = nop->encode_fh(inode, fid->raw, max_len, parent);
358 else 362 else
359 error = export_encode_fh(dentry, fid, max_len, connectable); 363 error = export_encode_fh(inode, fid, max_len, parent);
364 dput(p);
360 365
361 return error; 366 return error;
362} 367}
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9ed1bb1f319f..c22f17021b6e 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -2,6 +2,8 @@ config EXT4_FS
2 tristate "The Extended 4 (ext4) filesystem" 2 tristate "The Extended 4 (ext4) filesystem"
3 select JBD2 3 select JBD2
4 select CRC16 4 select CRC16
5 select CRYPTO
6 select CRYPTO_CRC32C
5 help 7 help
6 This is the next generation of the ext3 filesystem. 8 This is the next generation of the ext3 filesystem.
7 9
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c45c41129a35..99b6324290db 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -168,12 +168,14 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
168 168
169 /* If checksum is bad mark all blocks used to prevent allocation 169 /* If checksum is bad mark all blocks used to prevent allocation
170 * essentially implementing a per-group read-only flag. */ 170 * essentially implementing a per-group read-only flag. */
171 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 171 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
172 ext4_error(sb, "Checksum bad for group %u", block_group); 172 ext4_error(sb, "Checksum bad for group %u", block_group);
173 ext4_free_group_clusters_set(sb, gdp, 0); 173 ext4_free_group_clusters_set(sb, gdp, 0);
174 ext4_free_inodes_set(sb, gdp, 0); 174 ext4_free_inodes_set(sb, gdp, 0);
175 ext4_itable_unused_set(sb, gdp, 0); 175 ext4_itable_unused_set(sb, gdp, 0);
176 memset(bh->b_data, 0xff, sb->s_blocksize); 176 memset(bh->b_data, 0xff, sb->s_blocksize);
177 ext4_block_bitmap_csum_set(sb, block_group, gdp, bh,
178 EXT4_BLOCKS_PER_GROUP(sb) / 8);
177 return; 179 return;
178 } 180 }
179 memset(bh->b_data, 0, sb->s_blocksize); 181 memset(bh->b_data, 0, sb->s_blocksize);
@@ -210,6 +212,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
210 */ 212 */
211 ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), 213 ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
212 sb->s_blocksize * 8, bh->b_data); 214 sb->s_blocksize * 8, bh->b_data);
215 ext4_block_bitmap_csum_set(sb, block_group, gdp, bh,
216 EXT4_BLOCKS_PER_GROUP(sb) / 8);
217 ext4_group_desc_csum_set(sb, block_group, gdp);
213} 218}
214 219
215/* Return the number of free blocks in a block group. It is used when 220/* Return the number of free blocks in a block group. It is used when
@@ -276,9 +281,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
276} 281}
277 282
278static int ext4_valid_block_bitmap(struct super_block *sb, 283static int ext4_valid_block_bitmap(struct super_block *sb,
279 struct ext4_group_desc *desc, 284 struct ext4_group_desc *desc,
280 unsigned int block_group, 285 unsigned int block_group,
281 struct buffer_head *bh) 286 struct buffer_head *bh)
282{ 287{
283 ext4_grpblk_t offset; 288 ext4_grpblk_t offset;
284 ext4_grpblk_t next_zero_bit; 289 ext4_grpblk_t next_zero_bit;
@@ -325,6 +330,23 @@ err_out:
325 block_group, bitmap_blk); 330 block_group, bitmap_blk);
326 return 0; 331 return 0;
327} 332}
333
334void ext4_validate_block_bitmap(struct super_block *sb,
335 struct ext4_group_desc *desc,
336 unsigned int block_group,
337 struct buffer_head *bh)
338{
339 if (buffer_verified(bh))
340 return;
341
342 ext4_lock_group(sb, block_group);
343 if (ext4_valid_block_bitmap(sb, desc, block_group, bh) &&
344 ext4_block_bitmap_csum_verify(sb, block_group, desc, bh,
345 EXT4_BLOCKS_PER_GROUP(sb) / 8))
346 set_buffer_verified(bh);
347 ext4_unlock_group(sb, block_group);
348}
349
328/** 350/**
329 * ext4_read_block_bitmap() 351 * ext4_read_block_bitmap()
330 * @sb: super block 352 * @sb: super block
@@ -355,12 +377,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
355 } 377 }
356 378
357 if (bitmap_uptodate(bh)) 379 if (bitmap_uptodate(bh))
358 return bh; 380 goto verify;
359 381
360 lock_buffer(bh); 382 lock_buffer(bh);
361 if (bitmap_uptodate(bh)) { 383 if (bitmap_uptodate(bh)) {
362 unlock_buffer(bh); 384 unlock_buffer(bh);
363 return bh; 385 goto verify;
364 } 386 }
365 ext4_lock_group(sb, block_group); 387 ext4_lock_group(sb, block_group);
366 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 388 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -379,7 +401,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
379 */ 401 */
380 set_bitmap_uptodate(bh); 402 set_bitmap_uptodate(bh);
381 unlock_buffer(bh); 403 unlock_buffer(bh);
382 return bh; 404 goto verify;
383 } 405 }
384 /* 406 /*
385 * submit the buffer_head for reading 407 * submit the buffer_head for reading
@@ -390,6 +412,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
390 get_bh(bh); 412 get_bh(bh);
391 submit_bh(READ, bh); 413 submit_bh(READ, bh);
392 return bh; 414 return bh;
415verify:
416 ext4_validate_block_bitmap(sb, desc, block_group, bh);
417 return bh;
393} 418}
394 419
395/* Returns 0 on success, 1 on error */ 420/* Returns 0 on success, 1 on error */
@@ -412,7 +437,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
412 } 437 }
413 clear_buffer_new(bh); 438 clear_buffer_new(bh);
414 /* Panic or remount fs read-only if block bitmap is invalid */ 439 /* Panic or remount fs read-only if block bitmap is invalid */
415 ext4_valid_block_bitmap(sb, desc, block_group, bh); 440 ext4_validate_block_bitmap(sb, desc, block_group, bh);
416 return 0; 441 return 0;
417} 442}
418 443
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index fa3af81ac565..b319721da26a 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -29,3 +29,86 @@ unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
29 29
30#endif /* EXT4FS_DEBUG */ 30#endif /* EXT4FS_DEBUG */
31 31
32int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
33 struct ext4_group_desc *gdp,
34 struct buffer_head *bh, int sz)
35{
36 __u32 hi;
37 __u32 provided, calculated;
38 struct ext4_sb_info *sbi = EXT4_SB(sb);
39
40 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
41 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
42 return 1;
43
44 provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
45 calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
46 if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
47 hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
48 provided |= (hi << 16);
49 } else
50 calculated &= 0xFFFF;
51
52 return provided == calculated;
53}
54
55void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
56 struct ext4_group_desc *gdp,
57 struct buffer_head *bh, int sz)
58{
59 __u32 csum;
60 struct ext4_sb_info *sbi = EXT4_SB(sb);
61
62 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
63 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
64 return;
65
66 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
67 gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
68 if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
69 gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
70}
71
72int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
73 struct ext4_group_desc *gdp,
74 struct buffer_head *bh, int sz)
75{
76 __u32 hi;
77 __u32 provided, calculated;
78 struct ext4_sb_info *sbi = EXT4_SB(sb);
79
80 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
81 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
82 return 1;
83
84 provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
85 calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
86 if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
87 hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
88 provided |= (hi << 16);
89 } else
90 calculated &= 0xFFFF;
91
92 if (provided == calculated)
93 return 1;
94
95 ext4_error(sb, "Bad block bitmap checksum: block_group = %u", group);
96 return 0;
97}
98
99void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
100 struct ext4_group_desc *gdp,
101 struct buffer_head *bh, int sz)
102{
103 __u32 csum;
104 struct ext4_sb_info *sbi = EXT4_SB(sb);
105
106 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
107 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
108 return;
109
110 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
111 gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
112 if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
113 gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
114}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b86786202643..aa39e600d159 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -179,6 +179,18 @@ static int ext4_readdir(struct file *filp,
179 continue; 179 continue;
180 } 180 }
181 181
182 /* Check the checksum */
183 if (!buffer_verified(bh) &&
184 !ext4_dirent_csum_verify(inode,
185 (struct ext4_dir_entry *)bh->b_data)) {
186 EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
187 "at offset %llu",
188 (unsigned long long)filp->f_pos);
189 filp->f_pos += sb->s_blocksize - offset;
190 continue;
191 }
192 set_buffer_verified(bh);
193
182revalidate: 194revalidate:
183 /* If the dir block has changed since the last call to 195 /* If the dir block has changed since the last call to
184 * readdir(2), then we might be pointing to an invalid 196 * readdir(2), then we might be pointing to an invalid
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c21b1de51afb..cfc4e01b3c83 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,7 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h> 30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#include <crypto/hash.h>
32#ifdef __KERNEL__ 33#ifdef __KERNEL__
33#include <linux/compat.h> 34#include <linux/compat.h>
34#endif 35#endif
@@ -298,7 +299,9 @@ struct ext4_group_desc
298 __le16 bg_free_inodes_count_lo;/* Free inodes count */ 299 __le16 bg_free_inodes_count_lo;/* Free inodes count */
299 __le16 bg_used_dirs_count_lo; /* Directories count */ 300 __le16 bg_used_dirs_count_lo; /* Directories count */
300 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ 301 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
301 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ 302 __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */
303 __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
304 __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
302 __le16 bg_itable_unused_lo; /* Unused inodes count */ 305 __le16 bg_itable_unused_lo; /* Unused inodes count */
303 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ 306 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
304 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ 307 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
@@ -308,9 +311,19 @@ struct ext4_group_desc
308 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ 311 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
309 __le16 bg_used_dirs_count_hi; /* Directories count MSB */ 312 __le16 bg_used_dirs_count_hi; /* Directories count MSB */
310 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ 313 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
311 __u32 bg_reserved2[3]; 314 __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */
315 __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
316 __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
317 __u32 bg_reserved;
312}; 318};
313 319
320#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \
321 (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
322 sizeof(__le16))
323#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \
324 (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
325 sizeof(__le16))
326
314/* 327/*
315 * Structure of a flex block group info 328 * Structure of a flex block group info
316 */ 329 */
@@ -650,7 +663,8 @@ struct ext4_inode {
650 __le16 l_i_file_acl_high; 663 __le16 l_i_file_acl_high;
651 __le16 l_i_uid_high; /* these 2 fields */ 664 __le16 l_i_uid_high; /* these 2 fields */
652 __le16 l_i_gid_high; /* were reserved2[0] */ 665 __le16 l_i_gid_high; /* were reserved2[0] */
653 __u32 l_i_reserved2; 666 __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
667 __le16 l_i_reserved;
654 } linux2; 668 } linux2;
655 struct { 669 struct {
656 __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ 670 __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */
@@ -666,7 +680,7 @@ struct ext4_inode {
666 } masix2; 680 } masix2;
667 } osd2; /* OS dependent 2 */ 681 } osd2; /* OS dependent 2 */
668 __le16 i_extra_isize; 682 __le16 i_extra_isize;
669 __le16 i_pad1; 683 __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */
670 __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ 684 __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */
671 __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ 685 __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */
672 __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ 686 __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */
@@ -768,7 +782,7 @@ do { \
768#define i_gid_low i_gid 782#define i_gid_low i_gid
769#define i_uid_high osd2.linux2.l_i_uid_high 783#define i_uid_high osd2.linux2.l_i_uid_high
770#define i_gid_high osd2.linux2.l_i_gid_high 784#define i_gid_high osd2.linux2.l_i_gid_high
771#define i_reserved2 osd2.linux2.l_i_reserved2 785#define i_checksum_lo osd2.linux2.l_i_checksum_lo
772 786
773#elif defined(__GNU__) 787#elif defined(__GNU__)
774 788
@@ -908,6 +922,9 @@ struct ext4_inode_info {
908 */ 922 */
909 tid_t i_sync_tid; 923 tid_t i_sync_tid;
910 tid_t i_datasync_tid; 924 tid_t i_datasync_tid;
925
926 /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
927 __u32 i_csum_seed;
911}; 928};
912 929
913/* 930/*
@@ -1001,6 +1018,9 @@ extern void ext4_set_bits(void *bm, int cur, int len);
1001#define EXT4_ERRORS_PANIC 3 /* Panic */ 1018#define EXT4_ERRORS_PANIC 3 /* Panic */
1002#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE 1019#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE
1003 1020
1021/* Metadata checksum algorithm codes */
1022#define EXT4_CRC32C_CHKSUM 1
1023
1004/* 1024/*
1005 * Structure of the super block 1025 * Structure of the super block
1006 */ 1026 */
@@ -1087,7 +1107,7 @@ struct ext4_super_block {
1087 __le64 s_mmp_block; /* Block for multi-mount protection */ 1107 __le64 s_mmp_block; /* Block for multi-mount protection */
1088 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 1108 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
1089 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ 1109 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
1090 __u8 s_reserved_char_pad; 1110 __u8 s_checksum_type; /* metadata checksum algorithm used */
1091 __le16 s_reserved_pad; 1111 __le16 s_reserved_pad;
1092 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ 1112 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
1093 __le32 s_snapshot_inum; /* Inode number of active snapshot */ 1113 __le32 s_snapshot_inum; /* Inode number of active snapshot */
@@ -1113,7 +1133,8 @@ struct ext4_super_block {
1113 __le32 s_usr_quota_inum; /* inode for tracking user quota */ 1133 __le32 s_usr_quota_inum; /* inode for tracking user quota */
1114 __le32 s_grp_quota_inum; /* inode for tracking group quota */ 1134 __le32 s_grp_quota_inum; /* inode for tracking group quota */
1115 __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ 1135 __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
1116 __le32 s_reserved[109]; /* Padding to the end of the block */ 1136 __le32 s_reserved[108]; /* Padding to the end of the block */
1137 __le32 s_checksum; /* crc32c(superblock) */
1117}; 1138};
1118 1139
1119#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) 1140#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
@@ -1176,6 +1197,7 @@ struct ext4_sb_info {
1176 struct proc_dir_entry *s_proc; 1197 struct proc_dir_entry *s_proc;
1177 struct kobject s_kobj; 1198 struct kobject s_kobj;
1178 struct completion s_kobj_unregister; 1199 struct completion s_kobj_unregister;
1200 struct super_block *s_sb;
1179 1201
1180 /* Journaling */ 1202 /* Journaling */
1181 struct journal_s *s_journal; 1203 struct journal_s *s_journal;
@@ -1266,6 +1288,12 @@ struct ext4_sb_info {
1266 1288
1267 /* record the last minlen when FITRIM is called. */ 1289 /* record the last minlen when FITRIM is called. */
1268 atomic_t s_last_trim_minblks; 1290 atomic_t s_last_trim_minblks;
1291
1292 /* Reference to checksum algorithm driver via cryptoapi */
1293 struct crypto_shash *s_chksum_driver;
1294
1295 /* Precomputed FS UUID checksum for seeding other checksums */
1296 __u32 s_csum_seed;
1269}; 1297};
1270 1298
1271static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1299static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1414,6 +1442,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1414#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 1442#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
1415#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 1443#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
1416#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 1444#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
1445/*
1446 * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When
1447 * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
1448 * all other data structures' checksums. However, the METADATA_CSUM and
1449 * GDT_CSUM bits are mutually exclusive.
1450 */
1417#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 1451#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
1418 1452
1419#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 1453#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
@@ -1461,7 +1495,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1461 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ 1495 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
1462 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ 1496 EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
1463 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ 1497 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
1464 EXT4_FEATURE_RO_COMPAT_BIGALLOC) 1498 EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
1499 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)
1465 1500
1466/* 1501/*
1467 * Default values for user and/or group using reserved blocks 1502 * Default values for user and/or group using reserved blocks
@@ -1527,6 +1562,18 @@ struct ext4_dir_entry_2 {
1527}; 1562};
1528 1563
1529/* 1564/*
1565 * This is a bogus directory entry at the end of each leaf block that
1566 * records checksums.
1567 */
1568struct ext4_dir_entry_tail {
1569 __le32 det_reserved_zero1; /* Pretend to be unused */
1570 __le16 det_rec_len; /* 12 */
1571 __u8 det_reserved_zero2; /* Zero name length */
1572 __u8 det_reserved_ft; /* 0xDE, fake file type */
1573 __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */
1574};
1575
1576/*
1530 * Ext4 directory file types. Only the low 3 bits are used. The 1577 * Ext4 directory file types. Only the low 3 bits are used. The
1531 * other bits are reserved for now. 1578 * other bits are reserved for now.
1532 */ 1579 */
@@ -1541,6 +1588,8 @@ struct ext4_dir_entry_2 {
1541 1588
1542#define EXT4_FT_MAX 8 1589#define EXT4_FT_MAX 8
1543 1590
1591#define EXT4_FT_DIR_CSUM 0xDE
1592
1544/* 1593/*
1545 * EXT4_DIR_PAD defines the directory entries boundaries 1594 * EXT4_DIR_PAD defines the directory entries boundaries
1546 * 1595 *
@@ -1609,6 +1658,25 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
1609#define DX_HASH_HALF_MD4_UNSIGNED 4 1658#define DX_HASH_HALF_MD4_UNSIGNED 4
1610#define DX_HASH_TEA_UNSIGNED 5 1659#define DX_HASH_TEA_UNSIGNED 5
1611 1660
1661static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
1662 const void *address, unsigned int length)
1663{
1664 struct {
1665 struct shash_desc shash;
1666 char ctx[crypto_shash_descsize(sbi->s_chksum_driver)];
1667 } desc;
1668 int err;
1669
1670 desc.shash.tfm = sbi->s_chksum_driver;
1671 desc.shash.flags = 0;
1672 *(u32 *)desc.ctx = crc;
1673
1674 err = crypto_shash_update(&desc.shash, address, length);
1675 BUG_ON(err);
1676
1677 return *(u32 *)desc.ctx;
1678}
1679
1612#ifdef __KERNEL__ 1680#ifdef __KERNEL__
1613 1681
1614/* hash info structure used by the directory hash */ 1682/* hash info structure used by the directory hash */
@@ -1741,7 +1809,8 @@ struct mmp_struct {
1741 __le16 mmp_check_interval; 1809 __le16 mmp_check_interval;
1742 1810
1743 __le16 mmp_pad1; 1811 __le16 mmp_pad1;
1744 __le32 mmp_pad2[227]; 1812 __le32 mmp_pad2[226];
1813 __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */
1745}; 1814};
1746 1815
1747/* arguments passed to the mmp thread */ 1816/* arguments passed to the mmp thread */
@@ -1784,8 +1853,24 @@ struct mmpd_data {
1784 1853
1785/* bitmap.c */ 1854/* bitmap.c */
1786extern unsigned int ext4_count_free(struct buffer_head *, unsigned); 1855extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
1856void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
1857 struct ext4_group_desc *gdp,
1858 struct buffer_head *bh, int sz);
1859int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
1860 struct ext4_group_desc *gdp,
1861 struct buffer_head *bh, int sz);
1862void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
1863 struct ext4_group_desc *gdp,
1864 struct buffer_head *bh, int sz);
1865int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
1866 struct ext4_group_desc *gdp,
1867 struct buffer_head *bh, int sz);
1787 1868
1788/* balloc.c */ 1869/* balloc.c */
1870extern void ext4_validate_block_bitmap(struct super_block *sb,
1871 struct ext4_group_desc *desc,
1872 unsigned int block_group,
1873 struct buffer_head *bh);
1789extern unsigned int ext4_block_group(struct super_block *sb, 1874extern unsigned int ext4_block_group(struct super_block *sb,
1790 ext4_fsblk_t blocknr); 1875 ext4_fsblk_t blocknr);
1791extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, 1876extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
@@ -1864,7 +1949,7 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
1864/* mballoc.c */ 1949/* mballoc.c */
1865extern long ext4_mb_stats; 1950extern long ext4_mb_stats;
1866extern long ext4_mb_max_to_scan; 1951extern long ext4_mb_max_to_scan;
1867extern int ext4_mb_init(struct super_block *, int); 1952extern int ext4_mb_init(struct super_block *);
1868extern int ext4_mb_release(struct super_block *); 1953extern int ext4_mb_release(struct super_block *);
1869extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, 1954extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
1870 struct ext4_allocation_request *, int *); 1955 struct ext4_allocation_request *, int *);
@@ -1936,6 +2021,8 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1936extern int ext4_ext_migrate(struct inode *); 2021extern int ext4_ext_migrate(struct inode *);
1937 2022
1938/* namei.c */ 2023/* namei.c */
2024extern int ext4_dirent_csum_verify(struct inode *inode,
2025 struct ext4_dir_entry *dirent);
1939extern int ext4_orphan_add(handle_t *, struct inode *); 2026extern int ext4_orphan_add(handle_t *, struct inode *);
1940extern int ext4_orphan_del(handle_t *, struct inode *); 2027extern int ext4_orphan_del(handle_t *, struct inode *);
1941extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, 2028extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -1950,6 +2037,10 @@ extern int ext4_group_extend(struct super_block *sb,
1950extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); 2037extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
1951 2038
1952/* super.c */ 2039/* super.c */
2040extern int ext4_superblock_csum_verify(struct super_block *sb,
2041 struct ext4_super_block *es);
2042extern void ext4_superblock_csum_set(struct super_block *sb,
2043 struct ext4_super_block *es);
1953extern void *ext4_kvmalloc(size_t size, gfp_t flags); 2044extern void *ext4_kvmalloc(size_t size, gfp_t flags);
1954extern void *ext4_kvzalloc(size_t size, gfp_t flags); 2045extern void *ext4_kvzalloc(size_t size, gfp_t flags);
1955extern void ext4_kvfree(void *ptr); 2046extern void ext4_kvfree(void *ptr);
@@ -2025,10 +2116,17 @@ extern void ext4_used_dirs_set(struct super_block *sb,
2025 struct ext4_group_desc *bg, __u32 count); 2116 struct ext4_group_desc *bg, __u32 count);
2026extern void ext4_itable_unused_set(struct super_block *sb, 2117extern void ext4_itable_unused_set(struct super_block *sb,
2027 struct ext4_group_desc *bg, __u32 count); 2118 struct ext4_group_desc *bg, __u32 count);
2028extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, 2119extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
2029 struct ext4_group_desc *gdp);
2030extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
2031 struct ext4_group_desc *gdp); 2120 struct ext4_group_desc *gdp);
2121extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
2122 struct ext4_group_desc *gdp);
2123
2124static inline int ext4_has_group_desc_csum(struct super_block *sb)
2125{
2126 return EXT4_HAS_RO_COMPAT_FEATURE(sb,
2127 EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
2128 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
2129}
2032 2130
2033static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 2131static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
2034{ 2132{
@@ -2225,6 +2323,9 @@ static inline void ext4_unlock_group(struct super_block *sb,
2225 2323
2226static inline void ext4_mark_super_dirty(struct super_block *sb) 2324static inline void ext4_mark_super_dirty(struct super_block *sb)
2227{ 2325{
2326 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
2327
2328 ext4_superblock_csum_set(sb, es);
2228 if (EXT4_SB(sb)->s_journal == NULL) 2329 if (EXT4_SB(sb)->s_journal == NULL)
2229 sb->s_dirt =1; 2330 sb->s_dirt =1;
2230} 2331}
@@ -2314,6 +2415,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
2314 2415
2315/* mmp.c */ 2416/* mmp.c */
2316extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); 2417extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
2418extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
2419extern int ext4_mmp_csum_verify(struct super_block *sb,
2420 struct mmp_struct *mmp);
2317 2421
2318/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ 2422/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
2319enum ext4_state_bits { 2423enum ext4_state_bits {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 0f58b86e3a02..cb1b2c919963 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -63,9 +63,22 @@
63 * ext4_inode has i_block array (60 bytes total). 63 * ext4_inode has i_block array (60 bytes total).
64 * The first 12 bytes store ext4_extent_header; 64 * The first 12 bytes store ext4_extent_header;
65 * the remainder stores an array of ext4_extent. 65 * the remainder stores an array of ext4_extent.
66 * For non-inode extent blocks, ext4_extent_tail
67 * follows the array.
66 */ 68 */
67 69
68/* 70/*
71 * This is the extent tail on-disk structure.
72 * All other extent structures are 12 bytes long. It turns out that
73 * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
74 * covers all valid ext4 block sizes. Therefore, this tail structure can be
75 * crammed into the end of the block without having to rebalance the tree.
76 */
77struct ext4_extent_tail {
78 __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */
79};
80
81/*
69 * This is the extent on-disk structure. 82 * This is the extent on-disk structure.
70 * It's used at the bottom of the tree. 83 * It's used at the bottom of the tree.
71 */ 84 */
@@ -101,6 +114,17 @@ struct ext4_extent_header {
101 114
102#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) 115#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a)
103 116
117#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
118 (sizeof(struct ext4_extent_header) + \
119 (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))
120
121static inline struct ext4_extent_tail *
122find_ext4_extent_tail(struct ext4_extent_header *eh)
123{
124 return (struct ext4_extent_tail *)(((void *)eh) +
125 EXT4_EXTENT_TAIL_OFFSET(eh));
126}
127
104/* 128/*
105 * Array of ext4_ext_path contains path to some extent. 129 * Array of ext4_ext_path contains path to some extent.
106 * Creation/lookup routines use it for traversal/splitting/etc. 130 * Creation/lookup routines use it for traversal/splitting/etc.
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index aca179017582..90f7c2e84db1 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -138,16 +138,23 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
138} 138}
139 139
140int __ext4_handle_dirty_super(const char *where, unsigned int line, 140int __ext4_handle_dirty_super(const char *where, unsigned int line,
141 handle_t *handle, struct super_block *sb) 141 handle_t *handle, struct super_block *sb,
142 int now)
142{ 143{
143 struct buffer_head *bh = EXT4_SB(sb)->s_sbh; 144 struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
144 int err = 0; 145 int err = 0;
145 146
146 if (ext4_handle_valid(handle)) { 147 if (ext4_handle_valid(handle)) {
148 ext4_superblock_csum_set(sb,
149 (struct ext4_super_block *)bh->b_data);
147 err = jbd2_journal_dirty_metadata(handle, bh); 150 err = jbd2_journal_dirty_metadata(handle, bh);
148 if (err) 151 if (err)
149 ext4_journal_abort_handle(where, line, __func__, 152 ext4_journal_abort_handle(where, line, __func__,
150 bh, handle, err); 153 bh, handle, err);
154 } else if (now) {
155 ext4_superblock_csum_set(sb,
156 (struct ext4_super_block *)bh->b_data);
157 mark_buffer_dirty(bh);
151 } else 158 } else
152 sb->s_dirt = 1; 159 sb->s_dirt = 1;
153 return err; 160 return err;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 83b20fcf9400..f440e8f1841f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -213,7 +213,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
213 struct buffer_head *bh); 213 struct buffer_head *bh);
214 214
215int __ext4_handle_dirty_super(const char *where, unsigned int line, 215int __ext4_handle_dirty_super(const char *where, unsigned int line,
216 handle_t *handle, struct super_block *sb); 216 handle_t *handle, struct super_block *sb,
217 int now);
217 218
218#define ext4_journal_get_write_access(handle, bh) \ 219#define ext4_journal_get_write_access(handle, bh) \
219 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) 220 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
@@ -225,8 +226,10 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
225#define ext4_handle_dirty_metadata(handle, inode, bh) \ 226#define ext4_handle_dirty_metadata(handle, inode, bh) \
226 __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ 227 __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
227 (bh)) 228 (bh))
229#define ext4_handle_dirty_super_now(handle, sb) \
230 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 1)
228#define ext4_handle_dirty_super(handle, sb) \ 231#define ext4_handle_dirty_super(handle, sb) \
229 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) 232 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 0)
230 233
231handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 234handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
232int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); 235int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index abcdeab67f52..91341ec6e06a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -52,6 +52,46 @@
52#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ 52#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
53#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ 53#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
54 54
55static __le32 ext4_extent_block_csum(struct inode *inode,
56 struct ext4_extent_header *eh)
57{
58 struct ext4_inode_info *ei = EXT4_I(inode);
59 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
60 __u32 csum;
61
62 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
63 EXT4_EXTENT_TAIL_OFFSET(eh));
64 return cpu_to_le32(csum);
65}
66
67static int ext4_extent_block_csum_verify(struct inode *inode,
68 struct ext4_extent_header *eh)
69{
70 struct ext4_extent_tail *et;
71
72 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
73 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
74 return 1;
75
76 et = find_ext4_extent_tail(eh);
77 if (et->et_checksum != ext4_extent_block_csum(inode, eh))
78 return 0;
79 return 1;
80}
81
82static void ext4_extent_block_csum_set(struct inode *inode,
83 struct ext4_extent_header *eh)
84{
85 struct ext4_extent_tail *et;
86
87 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
88 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
89 return;
90
91 et = find_ext4_extent_tail(eh);
92 et->et_checksum = ext4_extent_block_csum(inode, eh);
93}
94
55static int ext4_split_extent(handle_t *handle, 95static int ext4_split_extent(handle_t *handle,
56 struct inode *inode, 96 struct inode *inode,
57 struct ext4_ext_path *path, 97 struct ext4_ext_path *path,
@@ -117,6 +157,7 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
117{ 157{
118 int err; 158 int err;
119 if (path->p_bh) { 159 if (path->p_bh) {
160 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
120 /* path points to block */ 161 /* path points to block */
121 err = __ext4_handle_dirty_metadata(where, line, handle, 162 err = __ext4_handle_dirty_metadata(where, line, handle,
122 inode, path->p_bh); 163 inode, path->p_bh);
@@ -391,6 +432,12 @@ static int __ext4_ext_check(const char *function, unsigned int line,
391 error_msg = "invalid extent entries"; 432 error_msg = "invalid extent entries";
392 goto corrupted; 433 goto corrupted;
393 } 434 }
435 /* Verify checksum on non-root extent tree nodes */
436 if (ext_depth(inode) != depth &&
437 !ext4_extent_block_csum_verify(inode, eh)) {
438 error_msg = "extent tree corrupted";
439 goto corrupted;
440 }
394 return 0; 441 return 0;
395 442
396corrupted: 443corrupted:
@@ -412,6 +459,26 @@ int ext4_ext_check_inode(struct inode *inode)
412 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); 459 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
413} 460}
414 461
462static int __ext4_ext_check_block(const char *function, unsigned int line,
463 struct inode *inode,
464 struct ext4_extent_header *eh,
465 int depth,
466 struct buffer_head *bh)
467{
468 int ret;
469
470 if (buffer_verified(bh))
471 return 0;
472 ret = ext4_ext_check(inode, eh, depth);
473 if (ret)
474 return ret;
475 set_buffer_verified(bh);
476 return ret;
477}
478
479#define ext4_ext_check_block(inode, eh, depth, bh) \
480 __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
481
415#ifdef EXT_DEBUG 482#ifdef EXT_DEBUG
416static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 483static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
417{ 484{
@@ -536,7 +603,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
536 } 603 }
537 604
538 path->p_idx = l - 1; 605 path->p_idx = l - 1;
539 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), 606 ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
540 ext4_idx_pblock(path->p_idx)); 607 ext4_idx_pblock(path->p_idx));
541 608
542#ifdef CHECK_BINSEARCH 609#ifdef CHECK_BINSEARCH
@@ -668,8 +735,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
668 i = depth; 735 i = depth;
669 /* walk through the tree */ 736 /* walk through the tree */
670 while (i) { 737 while (i) {
671 int need_to_validate = 0;
672
673 ext_debug("depth %d: num %d, max %d\n", 738 ext_debug("depth %d: num %d, max %d\n",
674 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 739 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
675 740
@@ -688,8 +753,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
688 put_bh(bh); 753 put_bh(bh);
689 goto err; 754 goto err;
690 } 755 }
691 /* validate the extent entries */
692 need_to_validate = 1;
693 } 756 }
694 eh = ext_block_hdr(bh); 757 eh = ext_block_hdr(bh);
695 ppos++; 758 ppos++;
@@ -703,7 +766,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
703 path[ppos].p_hdr = eh; 766 path[ppos].p_hdr = eh;
704 i--; 767 i--;
705 768
706 if (need_to_validate && ext4_ext_check(inode, eh, i)) 769 if (ext4_ext_check_block(inode, eh, i, bh))
707 goto err; 770 goto err;
708 } 771 }
709 772
@@ -914,6 +977,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
914 le16_add_cpu(&neh->eh_entries, m); 977 le16_add_cpu(&neh->eh_entries, m);
915 } 978 }
916 979
980 ext4_extent_block_csum_set(inode, neh);
917 set_buffer_uptodate(bh); 981 set_buffer_uptodate(bh);
918 unlock_buffer(bh); 982 unlock_buffer(bh);
919 983
@@ -992,6 +1056,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
992 sizeof(struct ext4_extent_idx) * m); 1056 sizeof(struct ext4_extent_idx) * m);
993 le16_add_cpu(&neh->eh_entries, m); 1057 le16_add_cpu(&neh->eh_entries, m);
994 } 1058 }
1059 ext4_extent_block_csum_set(inode, neh);
995 set_buffer_uptodate(bh); 1060 set_buffer_uptodate(bh);
996 unlock_buffer(bh); 1061 unlock_buffer(bh);
997 1062
@@ -1089,6 +1154,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1089 else 1154 else
1090 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 1155 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1091 neh->eh_magic = EXT4_EXT_MAGIC; 1156 neh->eh_magic = EXT4_EXT_MAGIC;
1157 ext4_extent_block_csum_set(inode, neh);
1092 set_buffer_uptodate(bh); 1158 set_buffer_uptodate(bh);
1093 unlock_buffer(bh); 1159 unlock_buffer(bh);
1094 1160
@@ -1344,7 +1410,8 @@ got_index:
1344 return -EIO; 1410 return -EIO;
1345 eh = ext_block_hdr(bh); 1411 eh = ext_block_hdr(bh);
1346 /* subtract from p_depth to get proper eh_depth */ 1412 /* subtract from p_depth to get proper eh_depth */
1347 if (ext4_ext_check(inode, eh, path->p_depth - depth)) { 1413 if (ext4_ext_check_block(inode, eh,
1414 path->p_depth - depth, bh)) {
1348 put_bh(bh); 1415 put_bh(bh);
1349 return -EIO; 1416 return -EIO;
1350 } 1417 }
@@ -1357,7 +1424,7 @@ got_index:
1357 if (bh == NULL) 1424 if (bh == NULL)
1358 return -EIO; 1425 return -EIO;
1359 eh = ext_block_hdr(bh); 1426 eh = ext_block_hdr(bh);
1360 if (ext4_ext_check(inode, eh, path->p_depth - depth)) { 1427 if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
1361 put_bh(bh); 1428 put_bh(bh);
1362 return -EIO; 1429 return -EIO;
1363 } 1430 }
@@ -2644,8 +2711,8 @@ cont:
2644 err = -EIO; 2711 err = -EIO;
2645 break; 2712 break;
2646 } 2713 }
2647 if (ext4_ext_check(inode, ext_block_hdr(bh), 2714 if (ext4_ext_check_block(inode, ext_block_hdr(bh),
2648 depth - i - 1)) { 2715 depth - i - 1, bh)) {
2649 err = -EIO; 2716 err = -EIO;
2650 break; 2717 break;
2651 } 2718 }
@@ -4722,8 +4789,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4722 4789
4723 /* Now release the pages */ 4790 /* Now release the pages */
4724 if (last_page_offset > first_page_offset) { 4791 if (last_page_offset > first_page_offset) {
4725 truncate_inode_pages_range(mapping, first_page_offset, 4792 truncate_pagecache_range(inode, first_page_offset,
4726 last_page_offset-1); 4793 last_page_offset - 1);
4727 } 4794 }
4728 4795
4729 /* finish any pending end_io work */ 4796 /* finish any pending end_io work */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cb70f1812a70..8c7642a00054 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
95{ 95{
96 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 96 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
97 int unaligned_aio = 0; 97 int unaligned_aio = 0;
98 int ret; 98 ssize_t ret;
99 99
100 /* 100 /*
101 * If we have encountered a bitmap-format file, the size limit 101 * If we have encountered a bitmap-format file, the size limit
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9f9acac6c43f..d48e8b14928c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -70,24 +70,27 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
70 ext4_group_t block_group, 70 ext4_group_t block_group,
71 struct ext4_group_desc *gdp) 71 struct ext4_group_desc *gdp)
72{ 72{
73 struct ext4_sb_info *sbi = EXT4_SB(sb);
74
75 J_ASSERT_BH(bh, buffer_locked(bh)); 73 J_ASSERT_BH(bh, buffer_locked(bh));
76 74
77 /* If checksum is bad mark all blocks and inodes use to prevent 75 /* If checksum is bad mark all blocks and inodes use to prevent
78 * allocation, essentially implementing a per-group read-only flag. */ 76 * allocation, essentially implementing a per-group read-only flag. */
79 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 77 if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
80 ext4_error(sb, "Checksum bad for group %u", block_group); 78 ext4_error(sb, "Checksum bad for group %u", block_group);
81 ext4_free_group_clusters_set(sb, gdp, 0); 79 ext4_free_group_clusters_set(sb, gdp, 0);
82 ext4_free_inodes_set(sb, gdp, 0); 80 ext4_free_inodes_set(sb, gdp, 0);
83 ext4_itable_unused_set(sb, gdp, 0); 81 ext4_itable_unused_set(sb, gdp, 0);
84 memset(bh->b_data, 0xff, sb->s_blocksize); 82 memset(bh->b_data, 0xff, sb->s_blocksize);
83 ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
84 EXT4_INODES_PER_GROUP(sb) / 8);
85 return 0; 85 return 0;
86 } 86 }
87 87
88 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 88 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
89 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 89 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
90 bh->b_data); 90 bh->b_data);
91 ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
92 EXT4_INODES_PER_GROUP(sb) / 8);
93 ext4_group_desc_csum_set(sb, block_group, gdp);
91 94
92 return EXT4_INODES_PER_GROUP(sb); 95 return EXT4_INODES_PER_GROUP(sb);
93} 96}
@@ -128,12 +131,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
128 return NULL; 131 return NULL;
129 } 132 }
130 if (bitmap_uptodate(bh)) 133 if (bitmap_uptodate(bh))
131 return bh; 134 goto verify;
132 135
133 lock_buffer(bh); 136 lock_buffer(bh);
134 if (bitmap_uptodate(bh)) { 137 if (bitmap_uptodate(bh)) {
135 unlock_buffer(bh); 138 unlock_buffer(bh);
136 return bh; 139 goto verify;
137 } 140 }
138 141
139 ext4_lock_group(sb, block_group); 142 ext4_lock_group(sb, block_group);
@@ -141,6 +144,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
141 ext4_init_inode_bitmap(sb, bh, block_group, desc); 144 ext4_init_inode_bitmap(sb, bh, block_group, desc);
142 set_bitmap_uptodate(bh); 145 set_bitmap_uptodate(bh);
143 set_buffer_uptodate(bh); 146 set_buffer_uptodate(bh);
147 set_buffer_verified(bh);
144 ext4_unlock_group(sb, block_group); 148 ext4_unlock_group(sb, block_group);
145 unlock_buffer(bh); 149 unlock_buffer(bh);
146 return bh; 150 return bh;
@@ -154,7 +158,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
154 */ 158 */
155 set_bitmap_uptodate(bh); 159 set_bitmap_uptodate(bh);
156 unlock_buffer(bh); 160 unlock_buffer(bh);
157 return bh; 161 goto verify;
158 } 162 }
159 /* 163 /*
160 * submit the buffer_head for reading 164 * submit the buffer_head for reading
@@ -171,6 +175,20 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
171 block_group, bitmap_blk); 175 block_group, bitmap_blk);
172 return NULL; 176 return NULL;
173 } 177 }
178
179verify:
180 ext4_lock_group(sb, block_group);
181 if (!buffer_verified(bh) &&
182 !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
183 EXT4_INODES_PER_GROUP(sb) / 8)) {
184 ext4_unlock_group(sb, block_group);
185 put_bh(bh);
186 ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
187 "inode_bitmap = %llu", block_group, bitmap_blk);
188 return NULL;
189 }
190 ext4_unlock_group(sb, block_group);
191 set_buffer_verified(bh);
174 return bh; 192 return bh;
175} 193}
176 194
@@ -276,7 +294,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
276 ext4_used_dirs_set(sb, gdp, count); 294 ext4_used_dirs_set(sb, gdp, count);
277 percpu_counter_dec(&sbi->s_dirs_counter); 295 percpu_counter_dec(&sbi->s_dirs_counter);
278 } 296 }
279 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 297 ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
298 EXT4_INODES_PER_GROUP(sb) / 8);
299 ext4_group_desc_csum_set(sb, block_group, gdp);
280 ext4_unlock_group(sb, block_group); 300 ext4_unlock_group(sb, block_group);
281 301
282 percpu_counter_inc(&sbi->s_freeinodes_counter); 302 percpu_counter_inc(&sbi->s_freeinodes_counter);
@@ -488,10 +508,12 @@ fallback_retry:
488 for (i = 0; i < ngroups; i++) { 508 for (i = 0; i < ngroups; i++) {
489 grp = (parent_group + i) % ngroups; 509 grp = (parent_group + i) % ngroups;
490 desc = ext4_get_group_desc(sb, grp, NULL); 510 desc = ext4_get_group_desc(sb, grp, NULL);
491 grp_free = ext4_free_inodes_count(sb, desc); 511 if (desc) {
492 if (desc && grp_free && grp_free >= avefreei) { 512 grp_free = ext4_free_inodes_count(sb, desc);
493 *group = grp; 513 if (grp_free && grp_free >= avefreei) {
494 return 0; 514 *group = grp;
515 return 0;
516 }
495 } 517 }
496 } 518 }
497 519
@@ -709,7 +731,7 @@ repeat_in_this_group:
709 731
710got: 732got:
711 /* We may have to initialize the block bitmap if it isn't already */ 733 /* We may have to initialize the block bitmap if it isn't already */
712 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 734 if (ext4_has_group_desc_csum(sb) &&
713 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 735 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
714 struct buffer_head *block_bitmap_bh; 736 struct buffer_head *block_bitmap_bh;
715 737
@@ -731,8 +753,11 @@ got:
731 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 753 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
732 ext4_free_group_clusters_set(sb, gdp, 754 ext4_free_group_clusters_set(sb, gdp,
733 ext4_free_clusters_after_init(sb, group, gdp)); 755 ext4_free_clusters_after_init(sb, group, gdp));
734 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 756 ext4_block_bitmap_csum_set(sb, group, gdp,
735 gdp); 757 block_bitmap_bh,
758 EXT4_BLOCKS_PER_GROUP(sb) /
759 8);
760 ext4_group_desc_csum_set(sb, group, gdp);
736 } 761 }
737 ext4_unlock_group(sb, group); 762 ext4_unlock_group(sb, group);
738 763
@@ -751,7 +776,7 @@ got:
751 goto fail; 776 goto fail;
752 777
753 /* Update the relevant bg descriptor fields */ 778 /* Update the relevant bg descriptor fields */
754 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { 779 if (ext4_has_group_desc_csum(sb)) {
755 int free; 780 int free;
756 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 781 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
757 782
@@ -772,7 +797,10 @@ got:
772 ext4_itable_unused_set(sb, gdp, 797 ext4_itable_unused_set(sb, gdp,
773 (EXT4_INODES_PER_GROUP(sb) - ino)); 798 (EXT4_INODES_PER_GROUP(sb) - ino));
774 up_read(&grp->alloc_sem); 799 up_read(&grp->alloc_sem);
800 } else {
801 ext4_lock_group(sb, group);
775 } 802 }
803
776 ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); 804 ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
777 if (S_ISDIR(mode)) { 805 if (S_ISDIR(mode)) {
778 ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); 806 ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
@@ -782,10 +810,12 @@ got:
782 atomic_inc(&sbi->s_flex_groups[f].used_dirs); 810 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
783 } 811 }
784 } 812 }
785 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { 813 if (ext4_has_group_desc_csum(sb)) {
786 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 814 ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
787 ext4_unlock_group(sb, group); 815 EXT4_INODES_PER_GROUP(sb) / 8);
816 ext4_group_desc_csum_set(sb, group, gdp);
788 } 817 }
818 ext4_unlock_group(sb, group);
789 819
790 BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); 820 BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
791 err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); 821 err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
@@ -850,6 +880,19 @@ got:
850 inode->i_generation = sbi->s_next_generation++; 880 inode->i_generation = sbi->s_next_generation++;
851 spin_unlock(&sbi->s_next_gen_lock); 881 spin_unlock(&sbi->s_next_gen_lock);
852 882
883 /* Precompute checksum seed for inode metadata */
884 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
885 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
886 __u32 csum;
887 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
888 __le32 inum = cpu_to_le32(inode->i_ino);
889 __le32 gen = cpu_to_le32(inode->i_generation);
890 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
891 sizeof(inum));
892 ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
893 sizeof(gen));
894 }
895
853 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 896 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
854 ext4_set_inode_state(inode, EXT4_STATE_NEW); 897 ext4_set_inode_state(inode, EXT4_STATE_NEW);
855 898
@@ -1140,7 +1183,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1140skip_zeroout: 1183skip_zeroout:
1141 ext4_lock_group(sb, group); 1184 ext4_lock_group(sb, group);
1142 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); 1185 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
1143 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 1186 ext4_group_desc_csum_set(sb, group, gdp);
1144 ext4_unlock_group(sb, group); 1187 ext4_unlock_group(sb, group);
1145 1188
1146 BUFFER_TRACE(group_desc_bh, 1189 BUFFER_TRACE(group_desc_bh,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 07eaf565fdcb..02bc8cbe7281 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,6 +47,73 @@
47 47
48#define MPAGE_DA_EXTENT_TAIL 0x01 48#define MPAGE_DA_EXTENT_TAIL 0x01
49 49
50static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
51 struct ext4_inode_info *ei)
52{
53 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
54 __u16 csum_lo;
55 __u16 csum_hi = 0;
56 __u32 csum;
57
58 csum_lo = raw->i_checksum_lo;
59 raw->i_checksum_lo = 0;
60 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
61 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
62 csum_hi = raw->i_checksum_hi;
63 raw->i_checksum_hi = 0;
64 }
65
66 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
67 EXT4_INODE_SIZE(inode->i_sb));
68
69 raw->i_checksum_lo = csum_lo;
70 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
71 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
72 raw->i_checksum_hi = csum_hi;
73
74 return csum;
75}
76
77static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
78 struct ext4_inode_info *ei)
79{
80 __u32 provided, calculated;
81
82 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
83 cpu_to_le32(EXT4_OS_LINUX) ||
84 !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
85 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
86 return 1;
87
88 provided = le16_to_cpu(raw->i_checksum_lo);
89 calculated = ext4_inode_csum(inode, raw, ei);
90 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
91 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
92 provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
93 else
94 calculated &= 0xFFFF;
95
96 return provided == calculated;
97}
98
99static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
100 struct ext4_inode_info *ei)
101{
102 __u32 csum;
103
104 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
105 cpu_to_le32(EXT4_OS_LINUX) ||
106 !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
107 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
108 return;
109
110 csum = ext4_inode_csum(inode, raw, ei);
111 raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
112 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
113 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
114 raw->i_checksum_hi = cpu_to_le16(csum >> 16);
115}
116
50static inline int ext4_begin_ordered_truncate(struct inode *inode, 117static inline int ext4_begin_ordered_truncate(struct inode *inode,
51 loff_t new_size) 118 loff_t new_size)
52{ 119{
@@ -3517,8 +3584,7 @@ make_io:
3517 b = table; 3584 b = table;
3518 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 3585 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
3519 num = EXT4_INODES_PER_GROUP(sb); 3586 num = EXT4_INODES_PER_GROUP(sb);
3520 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3587 if (ext4_has_group_desc_csum(sb))
3521 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3522 num -= ext4_itable_unused_count(sb, gdp); 3588 num -= ext4_itable_unused_count(sb, gdp);
3523 table += num / inodes_per_block; 3589 table += num / inodes_per_block;
3524 if (end > table) 3590 if (end > table)
@@ -3646,6 +3712,39 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3646 if (ret < 0) 3712 if (ret < 0)
3647 goto bad_inode; 3713 goto bad_inode;
3648 raw_inode = ext4_raw_inode(&iloc); 3714 raw_inode = ext4_raw_inode(&iloc);
3715
3716 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3717 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3718 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3719 EXT4_INODE_SIZE(inode->i_sb)) {
3720 EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
3721 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
3722 EXT4_INODE_SIZE(inode->i_sb));
3723 ret = -EIO;
3724 goto bad_inode;
3725 }
3726 } else
3727 ei->i_extra_isize = 0;
3728
3729 /* Precompute checksum seed for inode metadata */
3730 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3731 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3732 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3733 __u32 csum;
3734 __le32 inum = cpu_to_le32(inode->i_ino);
3735 __le32 gen = raw_inode->i_generation;
3736 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
3737 sizeof(inum));
3738 ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
3739 sizeof(gen));
3740 }
3741
3742 if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
3743 EXT4_ERROR_INODE(inode, "checksum invalid");
3744 ret = -EIO;
3745 goto bad_inode;
3746 }
3747
3649 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 3748 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
3650 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 3749 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
3651 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 3750 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
@@ -3725,12 +3824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3725 } 3824 }
3726 3825
3727 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 3826 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3728 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3729 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3730 EXT4_INODE_SIZE(inode->i_sb)) {
3731 ret = -EIO;
3732 goto bad_inode;
3733 }
3734 if (ei->i_extra_isize == 0) { 3827 if (ei->i_extra_isize == 0) {
3735 /* The extra space is currently unused. Use it. */ 3828 /* The extra space is currently unused. Use it. */
3736 ei->i_extra_isize = sizeof(struct ext4_inode) - 3829 ei->i_extra_isize = sizeof(struct ext4_inode) -
@@ -3742,8 +3835,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3742 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 3835 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
3743 ext4_set_inode_state(inode, EXT4_STATE_XATTR); 3836 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
3744 } 3837 }
3745 } else 3838 }
3746 ei->i_extra_isize = 0;
3747 3839
3748 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 3840 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
3749 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 3841 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
@@ -3942,7 +4034,7 @@ static int ext4_do_update_inode(handle_t *handle,
3942 EXT4_SET_RO_COMPAT_FEATURE(sb, 4034 EXT4_SET_RO_COMPAT_FEATURE(sb,
3943 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4035 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
3944 ext4_handle_sync(handle); 4036 ext4_handle_sync(handle);
3945 err = ext4_handle_dirty_super(handle, sb); 4037 err = ext4_handle_dirty_super_now(handle, sb);
3946 } 4038 }
3947 } 4039 }
3948 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 4040 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -3969,6 +4061,8 @@ static int ext4_do_update_inode(handle_t *handle,
3969 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4061 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
3970 } 4062 }
3971 4063
4064 ext4_inode_csum_set(inode, raw_inode, ei);
4065
3972 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4066 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
3973 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 4067 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
3974 if (!err) 4068 if (!err)
@@ -4213,7 +4307,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4213 * will return the blocks that include the delayed allocation 4307 * will return the blocks that include the delayed allocation
4214 * blocks for this file. 4308 * blocks for this file.
4215 */ 4309 */
4216 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 4310 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
4311 EXT4_I(inode)->i_reserved_data_blocks);
4217 4312
4218 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4313 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4219 return 0; 4314 return 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 6eee25591b81..8ad112ae0ade 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 handle_t *handle = NULL; 38 handle_t *handle = NULL;
39 int err, migrate = 0; 39 int err, migrate = 0;
40 struct ext4_iloc iloc; 40 struct ext4_iloc iloc;
41 unsigned int oldflags; 41 unsigned int oldflags, mask, i;
42 unsigned int jflag; 42 unsigned int jflag;
43 43
44 if (!inode_owner_or_capable(inode)) 44 if (!inode_owner_or_capable(inode))
@@ -115,8 +115,14 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
115 if (err) 115 if (err)
116 goto flags_err; 116 goto flags_err;
117 117
118 flags = flags & EXT4_FL_USER_MODIFIABLE; 118 for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
119 flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE; 119 if (!(mask & EXT4_FL_USER_MODIFIABLE))
120 continue;
121 if (mask & flags)
122 ext4_set_inode_flag(inode, i);
123 else
124 ext4_clear_inode_flag(inode, i);
125 }
120 ei->i_flags = flags; 126 ei->i_flags = flags;
121 127
122 ext4_set_inode_flags(inode); 128 ext4_set_inode_flags(inode);
@@ -152,6 +158,13 @@ flags_out:
152 if (!inode_owner_or_capable(inode)) 158 if (!inode_owner_or_capable(inode))
153 return -EPERM; 159 return -EPERM;
154 160
161 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
162 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
163 ext4_warning(sb, "Setting inode version is not "
164 "supported with metadata_csum enabled.");
165 return -ENOTTY;
166 }
167
155 err = mnt_want_write_file(filp); 168 err = mnt_want_write_file(filp);
156 if (err) 169 if (err)
157 return err; 170 return err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 99ab428bcfa0..1cd6994fc446 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -788,7 +788,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
788 int first_block; 788 int first_block;
789 struct super_block *sb; 789 struct super_block *sb;
790 struct buffer_head *bhs; 790 struct buffer_head *bhs;
791 struct buffer_head **bh; 791 struct buffer_head **bh = NULL;
792 struct inode *inode; 792 struct inode *inode;
793 char *data; 793 char *data;
794 char *bitmap; 794 char *bitmap;
@@ -2375,7 +2375,7 @@ static int ext4_groupinfo_create_slab(size_t size)
2375 return 0; 2375 return 0;
2376} 2376}
2377 2377
2378int ext4_mb_init(struct super_block *sb, int needs_recovery) 2378int ext4_mb_init(struct super_block *sb)
2379{ 2379{
2380 struct ext4_sb_info *sbi = EXT4_SB(sb); 2380 struct ext4_sb_info *sbi = EXT4_SB(sb);
2381 unsigned i, j; 2381 unsigned i, j;
@@ -2517,6 +2517,9 @@ int ext4_mb_release(struct super_block *sb)
2517 struct ext4_sb_info *sbi = EXT4_SB(sb); 2517 struct ext4_sb_info *sbi = EXT4_SB(sb);
2518 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2518 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2519 2519
2520 if (sbi->s_proc)
2521 remove_proc_entry("mb_groups", sbi->s_proc);
2522
2520 if (sbi->s_group_info) { 2523 if (sbi->s_group_info) {
2521 for (i = 0; i < ngroups; i++) { 2524 for (i = 0; i < ngroups; i++) {
2522 grinfo = ext4_get_group_info(sb, i); 2525 grinfo = ext4_get_group_info(sb, i);
@@ -2564,8 +2567,6 @@ int ext4_mb_release(struct super_block *sb)
2564 } 2567 }
2565 2568
2566 free_percpu(sbi->s_locality_groups); 2569 free_percpu(sbi->s_locality_groups);
2567 if (sbi->s_proc)
2568 remove_proc_entry("mb_groups", sbi->s_proc);
2569 2570
2570 return 0; 2571 return 0;
2571} 2572}
@@ -2797,7 +2798,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2797 } 2798 }
2798 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; 2799 len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
2799 ext4_free_group_clusters_set(sb, gdp, len); 2800 ext4_free_group_clusters_set(sb, gdp, len);
2800 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2801 ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh,
2802 EXT4_BLOCKS_PER_GROUP(sb) / 8);
2803 ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
2801 2804
2802 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2805 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2803 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); 2806 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
@@ -3071,13 +3074,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3071static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 3074static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3072{ 3075{
3073 struct ext4_prealloc_space *pa = ac->ac_pa; 3076 struct ext4_prealloc_space *pa = ac->ac_pa;
3074 int len;
3075
3076 if (pa && pa->pa_type == MB_INODE_PA) {
3077 len = ac->ac_b_ex.fe_len;
3078 pa->pa_free += len;
3079 }
3080 3077
3078 if (pa && pa->pa_type == MB_INODE_PA)
3079 pa->pa_free += ac->ac_b_ex.fe_len;
3081} 3080}
3082 3081
3083/* 3082/*
@@ -4636,6 +4635,7 @@ do_more:
4636 */ 4635 */
4637 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); 4636 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4638 if (!new_entry) { 4637 if (!new_entry) {
4638 ext4_mb_unload_buddy(&e4b);
4639 err = -ENOMEM; 4639 err = -ENOMEM;
4640 goto error_return; 4640 goto error_return;
4641 } 4641 }
@@ -4659,7 +4659,9 @@ do_more:
4659 4659
4660 ret = ext4_free_group_clusters(sb, gdp) + count_clusters; 4660 ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4661 ext4_free_group_clusters_set(sb, gdp, ret); 4661 ext4_free_group_clusters_set(sb, gdp, ret);
4662 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4662 ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
4663 EXT4_BLOCKS_PER_GROUP(sb) / 8);
4664 ext4_group_desc_csum_set(sb, block_group, gdp);
4663 ext4_unlock_group(sb, block_group); 4665 ext4_unlock_group(sb, block_group);
4664 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); 4666 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4665 4667
@@ -4803,7 +4805,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4803 mb_free_blocks(NULL, &e4b, bit, count); 4805 mb_free_blocks(NULL, &e4b, bit, count);
4804 blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); 4806 blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
4805 ext4_free_group_clusters_set(sb, desc, blk_free_count); 4807 ext4_free_group_clusters_set(sb, desc, blk_free_count);
4806 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 4808 ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh,
4809 EXT4_BLOCKS_PER_GROUP(sb) / 8);
4810 ext4_group_desc_csum_set(sb, block_group, desc);
4807 ext4_unlock_group(sb, block_group); 4811 ext4_unlock_group(sb, block_group);
4808 percpu_counter_add(&sbi->s_freeclusters_counter, 4812 percpu_counter_add(&sbi->s_freeclusters_counter,
4809 EXT4_B2C(sbi, blocks_freed)); 4813 EXT4_B2C(sbi, blocks_freed));
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index ed6548d89165..f99a1311e847 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -6,12 +6,45 @@
6 6
7#include "ext4.h" 7#include "ext4.h"
8 8
9/* Checksumming functions */
10static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
11{
12 struct ext4_sb_info *sbi = EXT4_SB(sb);
13 int offset = offsetof(struct mmp_struct, mmp_checksum);
14 __u32 csum;
15
16 csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
17
18 return cpu_to_le32(csum);
19}
20
21int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
22{
23 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
24 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
25 return 1;
26
27 return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
28}
29
30void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
31{
32 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
33 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
34 return;
35
36 mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
37}
38
9/* 39/*
10 * Write the MMP block using WRITE_SYNC to try to get the block on-disk 40 * Write the MMP block using WRITE_SYNC to try to get the block on-disk
11 * faster. 41 * faster.
12 */ 42 */
13static int write_mmp_block(struct buffer_head *bh) 43static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
14{ 44{
45 struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
46
47 ext4_mmp_csum_set(sb, mmp);
15 mark_buffer_dirty(bh); 48 mark_buffer_dirty(bh);
16 lock_buffer(bh); 49 lock_buffer(bh);
17 bh->b_end_io = end_buffer_write_sync; 50 bh->b_end_io = end_buffer_write_sync;
@@ -59,7 +92,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
59 } 92 }
60 93
61 mmp = (struct mmp_struct *)((*bh)->b_data); 94 mmp = (struct mmp_struct *)((*bh)->b_data);
62 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) 95 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC ||
96 !ext4_mmp_csum_verify(sb, mmp))
63 return -EINVAL; 97 return -EINVAL;
64 98
65 return 0; 99 return 0;
@@ -120,7 +154,7 @@ static int kmmpd(void *data)
120 mmp->mmp_time = cpu_to_le64(get_seconds()); 154 mmp->mmp_time = cpu_to_le64(get_seconds());
121 last_update_time = jiffies; 155 last_update_time = jiffies;
122 156
123 retval = write_mmp_block(bh); 157 retval = write_mmp_block(sb, bh);
124 /* 158 /*
125 * Don't spew too many error messages. Print one every 159 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds. 160 * (s_mmp_update_interval * 60) seconds.
@@ -200,7 +234,7 @@ static int kmmpd(void *data)
200 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); 234 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
201 mmp->mmp_time = cpu_to_le64(get_seconds()); 235 mmp->mmp_time = cpu_to_le64(get_seconds());
202 236
203 retval = write_mmp_block(bh); 237 retval = write_mmp_block(sb, bh);
204 238
205failed: 239failed:
206 kfree(data); 240 kfree(data);
@@ -299,7 +333,7 @@ skip:
299 seq = mmp_new_seq(); 333 seq = mmp_new_seq();
300 mmp->mmp_seq = cpu_to_le32(seq); 334 mmp->mmp_seq = cpu_to_le32(seq);
301 335
302 retval = write_mmp_block(bh); 336 retval = write_mmp_block(sb, bh);
303 if (retval) 337 if (retval)
304 goto failed; 338 goto failed;
305 339
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e2a3f4b0ff78..5845cd97bf8b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -145,6 +145,14 @@ struct dx_map_entry
145 u16 size; 145 u16 size;
146}; 146};
147 147
148/*
149 * This goes at the end of each htree block.
150 */
151struct dx_tail {
152 u32 dt_reserved;
153 __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
154};
155
148static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); 156static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
149static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); 157static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
150static inline unsigned dx_get_hash(struct dx_entry *entry); 158static inline unsigned dx_get_hash(struct dx_entry *entry);
@@ -180,6 +188,230 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
180static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 188static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
181 struct inode *inode); 189 struct inode *inode);
182 190
191/* checksumming functions */
192#define EXT4_DIRENT_TAIL(block, blocksize) \
193 ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
194 ((blocksize) - \
195 sizeof(struct ext4_dir_entry_tail))))
196
197static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
198 unsigned int blocksize)
199{
200 memset(t, 0, sizeof(struct ext4_dir_entry_tail));
201 t->det_rec_len = ext4_rec_len_to_disk(
202 sizeof(struct ext4_dir_entry_tail), blocksize);
203 t->det_reserved_ft = EXT4_FT_DIR_CSUM;
204}
205
206/* Walk through a dirent block to find a checksum "dirent" at the tail */
207static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
208 struct ext4_dir_entry *de)
209{
210 struct ext4_dir_entry_tail *t;
211
212#ifdef PARANOID
213 struct ext4_dir_entry *d, *top;
214
215 d = de;
216 top = (struct ext4_dir_entry *)(((void *)de) +
217 (EXT4_BLOCK_SIZE(inode->i_sb) -
218 sizeof(struct ext4_dir_entry_tail)));
219 while (d < top && d->rec_len)
220 d = (struct ext4_dir_entry *)(((void *)d) +
221 le16_to_cpu(d->rec_len));
222
223 if (d != top)
224 return NULL;
225
226 t = (struct ext4_dir_entry_tail *)d;
227#else
228 t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
229#endif
230
231 if (t->det_reserved_zero1 ||
232 le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
233 t->det_reserved_zero2 ||
234 t->det_reserved_ft != EXT4_FT_DIR_CSUM)
235 return NULL;
236
237 return t;
238}
239
240static __le32 ext4_dirent_csum(struct inode *inode,
241 struct ext4_dir_entry *dirent, int size)
242{
243 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
244 struct ext4_inode_info *ei = EXT4_I(inode);
245 __u32 csum;
246
247 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
248 return cpu_to_le32(csum);
249}
250
251int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
252{
253 struct ext4_dir_entry_tail *t;
254
255 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
256 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
257 return 1;
258
259 t = get_dirent_tail(inode, dirent);
260 if (!t) {
261 EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
262 "leaf for checksum. Please run e2fsck -D.");
263 return 0;
264 }
265
266 if (t->det_checksum != ext4_dirent_csum(inode, dirent,
267 (void *)t - (void *)dirent))
268 return 0;
269
270 return 1;
271}
272
273static void ext4_dirent_csum_set(struct inode *inode,
274 struct ext4_dir_entry *dirent)
275{
276 struct ext4_dir_entry_tail *t;
277
278 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
279 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
280 return;
281
282 t = get_dirent_tail(inode, dirent);
283 if (!t) {
284 EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
285 "leaf for checksum. Please run e2fsck -D.");
286 return;
287 }
288
289 t->det_checksum = ext4_dirent_csum(inode, dirent,
290 (void *)t - (void *)dirent);
291}
292
293static inline int ext4_handle_dirty_dirent_node(handle_t *handle,
294 struct inode *inode,
295 struct buffer_head *bh)
296{
297 ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
298 return ext4_handle_dirty_metadata(handle, inode, bh);
299}
300
301static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
302 struct ext4_dir_entry *dirent,
303 int *offset)
304{
305 struct ext4_dir_entry *dp;
306 struct dx_root_info *root;
307 int count_offset;
308
309 if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
310 count_offset = 8;
311 else if (le16_to_cpu(dirent->rec_len) == 12) {
312 dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
313 if (le16_to_cpu(dp->rec_len) !=
314 EXT4_BLOCK_SIZE(inode->i_sb) - 12)
315 return NULL;
316 root = (struct dx_root_info *)(((void *)dp + 12));
317 if (root->reserved_zero ||
318 root->info_length != sizeof(struct dx_root_info))
319 return NULL;
320 count_offset = 32;
321 } else
322 return NULL;
323
324 if (offset)
325 *offset = count_offset;
326 return (struct dx_countlimit *)(((void *)dirent) + count_offset);
327}
328
329static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
330 int count_offset, int count, struct dx_tail *t)
331{
332 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
333 struct ext4_inode_info *ei = EXT4_I(inode);
334 __u32 csum, old_csum;
335 int size;
336
337 size = count_offset + (count * sizeof(struct dx_entry));
338 old_csum = t->dt_checksum;
339 t->dt_checksum = 0;
340 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
341 csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
342 t->dt_checksum = old_csum;
343
344 return cpu_to_le32(csum);
345}
346
347static int ext4_dx_csum_verify(struct inode *inode,
348 struct ext4_dir_entry *dirent)
349{
350 struct dx_countlimit *c;
351 struct dx_tail *t;
352 int count_offset, limit, count;
353
354 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
355 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
356 return 1;
357
358 c = get_dx_countlimit(inode, dirent, &count_offset);
359 if (!c) {
360 EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
361 return 1;
362 }
363 limit = le16_to_cpu(c->limit);
364 count = le16_to_cpu(c->count);
365 if (count_offset + (limit * sizeof(struct dx_entry)) >
366 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
367 EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
368 "tree checksum found. Run e2fsck -D.");
369 return 1;
370 }
371 t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
372
373 if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
374 count, t))
375 return 0;
376 return 1;
377}
378
379static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
380{
381 struct dx_countlimit *c;
382 struct dx_tail *t;
383 int count_offset, limit, count;
384
385 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
386 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
387 return;
388
389 c = get_dx_countlimit(inode, dirent, &count_offset);
390 if (!c) {
391 EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
392 return;
393 }
394 limit = le16_to_cpu(c->limit);
395 count = le16_to_cpu(c->count);
396 if (count_offset + (limit * sizeof(struct dx_entry)) >
397 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
398 EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
399 "tree checksum. Run e2fsck -D.");
400 return;
401 }
402 t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
403
404 t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
405}
406
407static inline int ext4_handle_dirty_dx_node(handle_t *handle,
408 struct inode *inode,
409 struct buffer_head *bh)
410{
411 ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
412 return ext4_handle_dirty_metadata(handle, inode, bh);
413}
414
183/* 415/*
184 * p is at least 6 bytes before the end of page 416 * p is at least 6 bytes before the end of page
185 */ 417 */
@@ -239,12 +471,20 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
239{ 471{
240 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 472 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
241 EXT4_DIR_REC_LEN(2) - infosize; 473 EXT4_DIR_REC_LEN(2) - infosize;
474
475 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
476 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
477 entry_space -= sizeof(struct dx_tail);
242 return entry_space / sizeof(struct dx_entry); 478 return entry_space / sizeof(struct dx_entry);
243} 479}
244 480
245static inline unsigned dx_node_limit(struct inode *dir) 481static inline unsigned dx_node_limit(struct inode *dir)
246{ 482{
247 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 483 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
484
485 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
486 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
487 entry_space -= sizeof(struct dx_tail);
248 return entry_space / sizeof(struct dx_entry); 488 return entry_space / sizeof(struct dx_entry);
249} 489}
250 490
@@ -390,6 +630,15 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
390 goto fail; 630 goto fail;
391 } 631 }
392 632
633 if (!buffer_verified(bh) &&
634 !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
635 ext4_warning(dir->i_sb, "Root failed checksum");
636 brelse(bh);
637 *err = ERR_BAD_DX_DIR;
638 goto fail;
639 }
640 set_buffer_verified(bh);
641
393 entries = (struct dx_entry *) (((char *)&root->info) + 642 entries = (struct dx_entry *) (((char *)&root->info) +
394 root->info.info_length); 643 root->info.info_length);
395 644
@@ -450,6 +699,17 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
450 if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) 699 if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
451 goto fail2; 700 goto fail2;
452 at = entries = ((struct dx_node *) bh->b_data)->entries; 701 at = entries = ((struct dx_node *) bh->b_data)->entries;
702
703 if (!buffer_verified(bh) &&
704 !ext4_dx_csum_verify(dir,
705 (struct ext4_dir_entry *)bh->b_data)) {
706 ext4_warning(dir->i_sb, "Node failed checksum");
707 brelse(bh);
708 *err = ERR_BAD_DX_DIR;
709 goto fail;
710 }
711 set_buffer_verified(bh);
712
453 if (dx_get_limit(entries) != dx_node_limit (dir)) { 713 if (dx_get_limit(entries) != dx_node_limit (dir)) {
454 ext4_warning(dir->i_sb, 714 ext4_warning(dir->i_sb,
455 "dx entry: limit != node limit"); 715 "dx entry: limit != node limit");
@@ -549,6 +809,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
549 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), 809 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
550 0, &err))) 810 0, &err)))
551 return err; /* Failure */ 811 return err; /* Failure */
812
813 if (!buffer_verified(bh) &&
814 !ext4_dx_csum_verify(dir,
815 (struct ext4_dir_entry *)bh->b_data)) {
816 ext4_warning(dir->i_sb, "Node failed checksum");
817 return -EIO;
818 }
819 set_buffer_verified(bh);
820
552 p++; 821 p++;
553 brelse(p->bh); 822 brelse(p->bh);
554 p->bh = bh; 823 p->bh = bh;
@@ -577,6 +846,11 @@ static int htree_dirblock_to_tree(struct file *dir_file,
577 if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) 846 if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
578 return err; 847 return err;
579 848
849 if (!buffer_verified(bh) &&
850 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
851 return -EIO;
852 set_buffer_verified(bh);
853
580 de = (struct ext4_dir_entry_2 *) bh->b_data; 854 de = (struct ext4_dir_entry_2 *) bh->b_data;
581 top = (struct ext4_dir_entry_2 *) ((char *) de + 855 top = (struct ext4_dir_entry_2 *) ((char *) de +
582 dir->i_sb->s_blocksize - 856 dir->i_sb->s_blocksize -
@@ -936,6 +1210,15 @@ restart:
936 brelse(bh); 1210 brelse(bh);
937 goto next; 1211 goto next;
938 } 1212 }
1213 if (!buffer_verified(bh) &&
1214 !ext4_dirent_csum_verify(dir,
1215 (struct ext4_dir_entry *)bh->b_data)) {
1216 EXT4_ERROR_INODE(dir, "checksumming directory "
1217 "block %lu", (unsigned long)block);
1218 brelse(bh);
1219 goto next;
1220 }
1221 set_buffer_verified(bh);
939 i = search_dirblock(bh, dir, d_name, 1222 i = search_dirblock(bh, dir, d_name,
940 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); 1223 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
941 if (i == 1) { 1224 if (i == 1) {
@@ -987,6 +1270,16 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
987 if (!(bh = ext4_bread(NULL, dir, block, 0, err))) 1270 if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
988 goto errout; 1271 goto errout;
989 1272
1273 if (!buffer_verified(bh) &&
1274 !ext4_dirent_csum_verify(dir,
1275 (struct ext4_dir_entry *)bh->b_data)) {
1276 EXT4_ERROR_INODE(dir, "checksumming directory "
1277 "block %lu", (unsigned long)block);
1278 brelse(bh);
1279 *err = -EIO;
1280 goto errout;
1281 }
1282 set_buffer_verified(bh);
990 retval = search_dirblock(bh, dir, d_name, 1283 retval = search_dirblock(bh, dir, d_name,
991 block << EXT4_BLOCK_SIZE_BITS(sb), 1284 block << EXT4_BLOCK_SIZE_BITS(sb),
992 res_dir); 1285 res_dir);
@@ -1037,6 +1330,12 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1037 EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); 1330 EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
1038 return ERR_PTR(-EIO); 1331 return ERR_PTR(-EIO);
1039 } 1332 }
1333 if (unlikely(ino == dir->i_ino)) {
1334 EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
1335 dentry->d_name.len,
1336 dentry->d_name.name);
1337 return ERR_PTR(-EIO);
1338 }
1040 inode = ext4_iget(dir->i_sb, ino); 1339 inode = ext4_iget(dir->i_sb, ino);
1041 if (inode == ERR_PTR(-ESTALE)) { 1340 if (inode == ERR_PTR(-ESTALE)) {
1042 EXT4_ERROR_INODE(dir, 1341 EXT4_ERROR_INODE(dir,
@@ -1156,8 +1455,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1156 char *data1 = (*bh)->b_data, *data2; 1455 char *data1 = (*bh)->b_data, *data2;
1157 unsigned split, move, size; 1456 unsigned split, move, size;
1158 struct ext4_dir_entry_2 *de = NULL, *de2; 1457 struct ext4_dir_entry_2 *de = NULL, *de2;
1458 struct ext4_dir_entry_tail *t;
1459 int csum_size = 0;
1159 int err = 0, i; 1460 int err = 0, i;
1160 1461
1462 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
1463 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1464 csum_size = sizeof(struct ext4_dir_entry_tail);
1465
1161 bh2 = ext4_append (handle, dir, &newblock, &err); 1466 bh2 = ext4_append (handle, dir, &newblock, &err);
1162 if (!(bh2)) { 1467 if (!(bh2)) {
1163 brelse(*bh); 1468 brelse(*bh);
@@ -1204,10 +1509,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1204 /* Fancy dance to stay within two buffers */ 1509 /* Fancy dance to stay within two buffers */
1205 de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); 1510 de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
1206 de = dx_pack_dirents(data1, blocksize); 1511 de = dx_pack_dirents(data1, blocksize);
1207 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, 1512 de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
1513 (char *) de,
1208 blocksize); 1514 blocksize);
1209 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, 1515 de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
1516 (char *) de2,
1210 blocksize); 1517 blocksize);
1518 if (csum_size) {
1519 t = EXT4_DIRENT_TAIL(data2, blocksize);
1520 initialize_dirent_tail(t, blocksize);
1521
1522 t = EXT4_DIRENT_TAIL(data1, blocksize);
1523 initialize_dirent_tail(t, blocksize);
1524 }
1525
1211 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1526 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1212 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1527 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1213 1528
@@ -1218,10 +1533,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1218 de = de2; 1533 de = de2;
1219 } 1534 }
1220 dx_insert_block(frame, hash2 + continued, newblock); 1535 dx_insert_block(frame, hash2 + continued, newblock);
1221 err = ext4_handle_dirty_metadata(handle, dir, bh2); 1536 err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
1222 if (err) 1537 if (err)
1223 goto journal_error; 1538 goto journal_error;
1224 err = ext4_handle_dirty_metadata(handle, dir, frame->bh); 1539 err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
1225 if (err) 1540 if (err)
1226 goto journal_error; 1541 goto journal_error;
1227 brelse(bh2); 1542 brelse(bh2);
@@ -1258,11 +1573,16 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1258 unsigned short reclen; 1573 unsigned short reclen;
1259 int nlen, rlen, err; 1574 int nlen, rlen, err;
1260 char *top; 1575 char *top;
1576 int csum_size = 0;
1577
1578 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1579 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1580 csum_size = sizeof(struct ext4_dir_entry_tail);
1261 1581
1262 reclen = EXT4_DIR_REC_LEN(namelen); 1582 reclen = EXT4_DIR_REC_LEN(namelen);
1263 if (!de) { 1583 if (!de) {
1264 de = (struct ext4_dir_entry_2 *)bh->b_data; 1584 de = (struct ext4_dir_entry_2 *)bh->b_data;
1265 top = bh->b_data + blocksize - reclen; 1585 top = bh->b_data + (blocksize - csum_size) - reclen;
1266 while ((char *) de <= top) { 1586 while ((char *) de <= top) {
1267 if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) 1587 if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
1268 return -EIO; 1588 return -EIO;
@@ -1295,11 +1615,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1295 de = de1; 1615 de = de1;
1296 } 1616 }
1297 de->file_type = EXT4_FT_UNKNOWN; 1617 de->file_type = EXT4_FT_UNKNOWN;
1298 if (inode) { 1618 de->inode = cpu_to_le32(inode->i_ino);
1299 de->inode = cpu_to_le32(inode->i_ino); 1619 ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1300 ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1301 } else
1302 de->inode = 0;
1303 de->name_len = namelen; 1620 de->name_len = namelen;
1304 memcpy(de->name, name, namelen); 1621 memcpy(de->name, name, namelen);
1305 /* 1622 /*
@@ -1318,7 +1635,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1318 dir->i_version++; 1635 dir->i_version++;
1319 ext4_mark_inode_dirty(handle, dir); 1636 ext4_mark_inode_dirty(handle, dir);
1320 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1637 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1321 err = ext4_handle_dirty_metadata(handle, dir, bh); 1638 err = ext4_handle_dirty_dirent_node(handle, dir, bh);
1322 if (err) 1639 if (err)
1323 ext4_std_error(dir->i_sb, err); 1640 ext4_std_error(dir->i_sb, err);
1324 return 0; 1641 return 0;
@@ -1339,6 +1656,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1339 struct dx_frame frames[2], *frame; 1656 struct dx_frame frames[2], *frame;
1340 struct dx_entry *entries; 1657 struct dx_entry *entries;
1341 struct ext4_dir_entry_2 *de, *de2; 1658 struct ext4_dir_entry_2 *de, *de2;
1659 struct ext4_dir_entry_tail *t;
1342 char *data1, *top; 1660 char *data1, *top;
1343 unsigned len; 1661 unsigned len;
1344 int retval; 1662 int retval;
@@ -1346,6 +1664,11 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1346 struct dx_hash_info hinfo; 1664 struct dx_hash_info hinfo;
1347 ext4_lblk_t block; 1665 ext4_lblk_t block;
1348 struct fake_dirent *fde; 1666 struct fake_dirent *fde;
1667 int csum_size = 0;
1668
1669 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1670 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1671 csum_size = sizeof(struct ext4_dir_entry_tail);
1349 1672
1350 blocksize = dir->i_sb->s_blocksize; 1673 blocksize = dir->i_sb->s_blocksize;
1351 dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); 1674 dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
@@ -1366,7 +1689,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1366 brelse(bh); 1689 brelse(bh);
1367 return -EIO; 1690 return -EIO;
1368 } 1691 }
1369 len = ((char *) root) + blocksize - (char *) de; 1692 len = ((char *) root) + (blocksize - csum_size) - (char *) de;
1370 1693
1371 /* Allocate new block for the 0th block's dirents */ 1694 /* Allocate new block for the 0th block's dirents */
1372 bh2 = ext4_append(handle, dir, &block, &retval); 1695 bh2 = ext4_append(handle, dir, &block, &retval);
@@ -1382,8 +1705,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1382 top = data1 + len; 1705 top = data1 + len;
1383 while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) 1706 while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
1384 de = de2; 1707 de = de2;
1385 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, 1708 de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
1709 (char *) de,
1386 blocksize); 1710 blocksize);
1711
1712 if (csum_size) {
1713 t = EXT4_DIRENT_TAIL(data1, blocksize);
1714 initialize_dirent_tail(t, blocksize);
1715 }
1716
1387 /* Initialize the root; the dot dirents already exist */ 1717 /* Initialize the root; the dot dirents already exist */
1388 de = (struct ext4_dir_entry_2 *) (&root->dotdot); 1718 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1389 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), 1719 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
@@ -1408,8 +1738,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1408 frame->bh = bh; 1738 frame->bh = bh;
1409 bh = bh2; 1739 bh = bh2;
1410 1740
1411 ext4_handle_dirty_metadata(handle, dir, frame->bh); 1741 ext4_handle_dirty_dx_node(handle, dir, frame->bh);
1412 ext4_handle_dirty_metadata(handle, dir, bh); 1742 ext4_handle_dirty_dirent_node(handle, dir, bh);
1413 1743
1414 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1744 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1415 if (!de) { 1745 if (!de) {
@@ -1445,11 +1775,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1445 struct inode *dir = dentry->d_parent->d_inode; 1775 struct inode *dir = dentry->d_parent->d_inode;
1446 struct buffer_head *bh; 1776 struct buffer_head *bh;
1447 struct ext4_dir_entry_2 *de; 1777 struct ext4_dir_entry_2 *de;
1778 struct ext4_dir_entry_tail *t;
1448 struct super_block *sb; 1779 struct super_block *sb;
1449 int retval; 1780 int retval;
1450 int dx_fallback=0; 1781 int dx_fallback=0;
1451 unsigned blocksize; 1782 unsigned blocksize;
1452 ext4_lblk_t block, blocks; 1783 ext4_lblk_t block, blocks;
1784 int csum_size = 0;
1785
1786 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1787 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1788 csum_size = sizeof(struct ext4_dir_entry_tail);
1453 1789
1454 sb = dir->i_sb; 1790 sb = dir->i_sb;
1455 blocksize = sb->s_blocksize; 1791 blocksize = sb->s_blocksize;
@@ -1468,6 +1804,11 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1468 bh = ext4_bread(handle, dir, block, 0, &retval); 1804 bh = ext4_bread(handle, dir, block, 0, &retval);
1469 if(!bh) 1805 if(!bh)
1470 return retval; 1806 return retval;
1807 if (!buffer_verified(bh) &&
1808 !ext4_dirent_csum_verify(dir,
1809 (struct ext4_dir_entry *)bh->b_data))
1810 return -EIO;
1811 set_buffer_verified(bh);
1471 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1812 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1472 if (retval != -ENOSPC) { 1813 if (retval != -ENOSPC) {
1473 brelse(bh); 1814 brelse(bh);
@@ -1484,7 +1825,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1484 return retval; 1825 return retval;
1485 de = (struct ext4_dir_entry_2 *) bh->b_data; 1826 de = (struct ext4_dir_entry_2 *) bh->b_data;
1486 de->inode = 0; 1827 de->inode = 0;
1487 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1828 de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
1829
1830 if (csum_size) {
1831 t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
1832 initialize_dirent_tail(t, blocksize);
1833 }
1834
1488 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1835 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1489 brelse(bh); 1836 brelse(bh);
1490 if (retval == 0) 1837 if (retval == 0)
@@ -1516,6 +1863,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1516 if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) 1863 if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
1517 goto cleanup; 1864 goto cleanup;
1518 1865
1866 if (!buffer_verified(bh) &&
1867 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
1868 goto journal_error;
1869 set_buffer_verified(bh);
1870
1519 BUFFER_TRACE(bh, "get_write_access"); 1871 BUFFER_TRACE(bh, "get_write_access");
1520 err = ext4_journal_get_write_access(handle, bh); 1872 err = ext4_journal_get_write_access(handle, bh);
1521 if (err) 1873 if (err)
@@ -1583,7 +1935,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1583 dxtrace(dx_show_index("node", frames[1].entries)); 1935 dxtrace(dx_show_index("node", frames[1].entries));
1584 dxtrace(dx_show_index("node", 1936 dxtrace(dx_show_index("node",
1585 ((struct dx_node *) bh2->b_data)->entries)); 1937 ((struct dx_node *) bh2->b_data)->entries));
1586 err = ext4_handle_dirty_metadata(handle, dir, bh2); 1938 err = ext4_handle_dirty_dx_node(handle, dir, bh2);
1587 if (err) 1939 if (err)
1588 goto journal_error; 1940 goto journal_error;
1589 brelse (bh2); 1941 brelse (bh2);
@@ -1609,7 +1961,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1609 if (err) 1961 if (err)
1610 goto journal_error; 1962 goto journal_error;
1611 } 1963 }
1612 err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); 1964 err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
1613 if (err) { 1965 if (err) {
1614 ext4_std_error(inode->i_sb, err); 1966 ext4_std_error(inode->i_sb, err);
1615 goto cleanup; 1967 goto cleanup;
@@ -1641,12 +1993,17 @@ static int ext4_delete_entry(handle_t *handle,
1641{ 1993{
1642 struct ext4_dir_entry_2 *de, *pde; 1994 struct ext4_dir_entry_2 *de, *pde;
1643 unsigned int blocksize = dir->i_sb->s_blocksize; 1995 unsigned int blocksize = dir->i_sb->s_blocksize;
1996 int csum_size = 0;
1644 int i, err; 1997 int i, err;
1645 1998
1999 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2000 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2001 csum_size = sizeof(struct ext4_dir_entry_tail);
2002
1646 i = 0; 2003 i = 0;
1647 pde = NULL; 2004 pde = NULL;
1648 de = (struct ext4_dir_entry_2 *) bh->b_data; 2005 de = (struct ext4_dir_entry_2 *) bh->b_data;
1649 while (i < bh->b_size) { 2006 while (i < bh->b_size - csum_size) {
1650 if (ext4_check_dir_entry(dir, NULL, de, bh, i)) 2007 if (ext4_check_dir_entry(dir, NULL, de, bh, i))
1651 return -EIO; 2008 return -EIO;
1652 if (de == de_del) { 2009 if (de == de_del) {
@@ -1667,7 +2024,7 @@ static int ext4_delete_entry(handle_t *handle,
1667 de->inode = 0; 2024 de->inode = 0;
1668 dir->i_version++; 2025 dir->i_version++;
1669 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 2026 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1670 err = ext4_handle_dirty_metadata(handle, dir, bh); 2027 err = ext4_handle_dirty_dirent_node(handle, dir, bh);
1671 if (unlikely(err)) { 2028 if (unlikely(err)) {
1672 ext4_std_error(dir->i_sb, err); 2029 ext4_std_error(dir->i_sb, err);
1673 return err; 2030 return err;
@@ -1809,9 +2166,15 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1809 struct inode *inode; 2166 struct inode *inode;
1810 struct buffer_head *dir_block = NULL; 2167 struct buffer_head *dir_block = NULL;
1811 struct ext4_dir_entry_2 *de; 2168 struct ext4_dir_entry_2 *de;
2169 struct ext4_dir_entry_tail *t;
1812 unsigned int blocksize = dir->i_sb->s_blocksize; 2170 unsigned int blocksize = dir->i_sb->s_blocksize;
2171 int csum_size = 0;
1813 int err, retries = 0; 2172 int err, retries = 0;
1814 2173
2174 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2175 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2176 csum_size = sizeof(struct ext4_dir_entry_tail);
2177
1815 if (EXT4_DIR_LINK_MAX(dir)) 2178 if (EXT4_DIR_LINK_MAX(dir))
1816 return -EMLINK; 2179 return -EMLINK;
1817 2180
@@ -1852,16 +2215,24 @@ retry:
1852 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 2215 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1853 de = ext4_next_entry(de, blocksize); 2216 de = ext4_next_entry(de, blocksize);
1854 de->inode = cpu_to_le32(dir->i_ino); 2217 de->inode = cpu_to_le32(dir->i_ino);
1855 de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), 2218 de->rec_len = ext4_rec_len_to_disk(blocksize -
2219 (csum_size + EXT4_DIR_REC_LEN(1)),
1856 blocksize); 2220 blocksize);
1857 de->name_len = 2; 2221 de->name_len = 2;
1858 strcpy(de->name, ".."); 2222 strcpy(de->name, "..");
1859 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 2223 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1860 set_nlink(inode, 2); 2224 set_nlink(inode, 2);
2225
2226 if (csum_size) {
2227 t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
2228 initialize_dirent_tail(t, blocksize);
2229 }
2230
1861 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 2231 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1862 err = ext4_handle_dirty_metadata(handle, inode, dir_block); 2232 err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
1863 if (err) 2233 if (err)
1864 goto out_clear_inode; 2234 goto out_clear_inode;
2235 set_buffer_verified(dir_block);
1865 err = ext4_mark_inode_dirty(handle, inode); 2236 err = ext4_mark_inode_dirty(handle, inode);
1866 if (!err) 2237 if (!err)
1867 err = ext4_add_entry(handle, dentry, inode); 2238 err = ext4_add_entry(handle, dentry, inode);
@@ -1911,6 +2282,14 @@ static int empty_dir(struct inode *inode)
1911 inode->i_ino); 2282 inode->i_ino);
1912 return 1; 2283 return 1;
1913 } 2284 }
2285 if (!buffer_verified(bh) &&
2286 !ext4_dirent_csum_verify(inode,
2287 (struct ext4_dir_entry *)bh->b_data)) {
2288 EXT4_ERROR_INODE(inode, "checksum error reading directory "
2289 "lblock 0");
2290 return -EIO;
2291 }
2292 set_buffer_verified(bh);
1914 de = (struct ext4_dir_entry_2 *) bh->b_data; 2293 de = (struct ext4_dir_entry_2 *) bh->b_data;
1915 de1 = ext4_next_entry(de, sb->s_blocksize); 2294 de1 = ext4_next_entry(de, sb->s_blocksize);
1916 if (le32_to_cpu(de->inode) != inode->i_ino || 2295 if (le32_to_cpu(de->inode) != inode->i_ino ||
@@ -1942,6 +2321,14 @@ static int empty_dir(struct inode *inode)
1942 offset += sb->s_blocksize; 2321 offset += sb->s_blocksize;
1943 continue; 2322 continue;
1944 } 2323 }
2324 if (!buffer_verified(bh) &&
2325 !ext4_dirent_csum_verify(inode,
2326 (struct ext4_dir_entry *)bh->b_data)) {
2327 EXT4_ERROR_INODE(inode, "checksum error "
2328 "reading directory lblock 0");
2329 return -EIO;
2330 }
2331 set_buffer_verified(bh);
1945 de = (struct ext4_dir_entry_2 *) bh->b_data; 2332 de = (struct ext4_dir_entry_2 *) bh->b_data;
1946 } 2333 }
1947 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { 2334 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
@@ -2010,7 +2397,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2010 /* Insert this inode at the head of the on-disk orphan list... */ 2397 /* Insert this inode at the head of the on-disk orphan list... */
2011 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 2398 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
2012 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 2399 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2013 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 2400 err = ext4_handle_dirty_super_now(handle, sb);
2014 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 2401 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
2015 if (!err) 2402 if (!err)
2016 err = rc; 2403 err = rc;
@@ -2083,7 +2470,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2083 if (err) 2470 if (err)
2084 goto out_brelse; 2471 goto out_brelse;
2085 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2472 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2086 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); 2473 err = ext4_handle_dirty_super_now(handle, inode->i_sb);
2087 } else { 2474 } else {
2088 struct ext4_iloc iloc2; 2475 struct ext4_iloc iloc2;
2089 struct inode *i_prev = 2476 struct inode *i_prev =
@@ -2442,6 +2829,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2442 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); 2829 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
2443 if (!dir_bh) 2830 if (!dir_bh)
2444 goto end_rename; 2831 goto end_rename;
2832 if (!buffer_verified(dir_bh) &&
2833 !ext4_dirent_csum_verify(old_inode,
2834 (struct ext4_dir_entry *)dir_bh->b_data))
2835 goto end_rename;
2836 set_buffer_verified(dir_bh);
2445 if (le32_to_cpu(PARENT_INO(dir_bh->b_data, 2837 if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
2446 old_dir->i_sb->s_blocksize)) != old_dir->i_ino) 2838 old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2447 goto end_rename; 2839 goto end_rename;
@@ -2472,7 +2864,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2472 ext4_current_time(new_dir); 2864 ext4_current_time(new_dir);
2473 ext4_mark_inode_dirty(handle, new_dir); 2865 ext4_mark_inode_dirty(handle, new_dir);
2474 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 2866 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2475 retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); 2867 retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh);
2476 if (unlikely(retval)) { 2868 if (unlikely(retval)) {
2477 ext4_std_error(new_dir->i_sb, retval); 2869 ext4_std_error(new_dir->i_sb, retval);
2478 goto end_rename; 2870 goto end_rename;
@@ -2526,7 +2918,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2526 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2918 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2527 cpu_to_le32(new_dir->i_ino); 2919 cpu_to_le32(new_dir->i_ino);
2528 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2920 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2529 retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); 2921 retval = ext4_handle_dirty_dirent_node(handle, old_inode,
2922 dir_bh);
2530 if (retval) { 2923 if (retval) {
2531 ext4_std_error(old_dir->i_sb, retval); 2924 ext4_std_error(old_dir->i_sb, retval);
2532 goto end_rename; 2925 goto end_rename;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 59fa0be27251..7ea6cbb44121 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -161,6 +161,8 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
161 if (flex_gd == NULL) 161 if (flex_gd == NULL)
162 goto out3; 162 goto out3;
163 163
164 if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
165 goto out2;
164 flex_gd->count = flexbg_size; 166 flex_gd->count = flexbg_size;
165 167
166 flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * 168 flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
@@ -796,7 +798,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
796 ext4_kvfree(o_group_desc); 798 ext4_kvfree(o_group_desc);
797 799
798 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 800 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
799 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 801 err = ext4_handle_dirty_super_now(handle, sb);
800 if (err) 802 if (err)
801 ext4_std_error(sb, err); 803 ext4_std_error(sb, err);
802 804
@@ -968,6 +970,8 @@ static void update_backups(struct super_block *sb,
968 goto exit_err; 970 goto exit_err;
969 } 971 }
970 972
973 ext4_superblock_csum_set(sb, (struct ext4_super_block *)data);
974
971 while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { 975 while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
972 struct buffer_head *bh; 976 struct buffer_head *bh;
973 977
@@ -1067,6 +1071,54 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
1067 return err; 1071 return err;
1068} 1072}
1069 1073
1074static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
1075{
1076 struct buffer_head *bh = sb_getblk(sb, block);
1077 if (!bh)
1078 return NULL;
1079
1080 if (bitmap_uptodate(bh))
1081 return bh;
1082
1083 lock_buffer(bh);
1084 if (bh_submit_read(bh) < 0) {
1085 unlock_buffer(bh);
1086 brelse(bh);
1087 return NULL;
1088 }
1089 unlock_buffer(bh);
1090
1091 return bh;
1092}
1093
1094static int ext4_set_bitmap_checksums(struct super_block *sb,
1095 ext4_group_t group,
1096 struct ext4_group_desc *gdp,
1097 struct ext4_new_group_data *group_data)
1098{
1099 struct buffer_head *bh;
1100
1101 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
1102 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1103 return 0;
1104
1105 bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
1106 if (!bh)
1107 return -EIO;
1108 ext4_inode_bitmap_csum_set(sb, group, gdp, bh,
1109 EXT4_INODES_PER_GROUP(sb) / 8);
1110 brelse(bh);
1111
1112 bh = ext4_get_bitmap(sb, group_data->block_bitmap);
1113 if (!bh)
1114 return -EIO;
1115 ext4_block_bitmap_csum_set(sb, group, gdp, bh,
1116 EXT4_BLOCKS_PER_GROUP(sb) / 8);
1117 brelse(bh);
1118
1119 return 0;
1120}
1121
1070/* 1122/*
1071 * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg 1123 * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg
1072 */ 1124 */
@@ -1093,18 +1145,24 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
1093 */ 1145 */
1094 gdb_bh = sbi->s_group_desc[gdb_num]; 1146 gdb_bh = sbi->s_group_desc[gdb_num];
1095 /* Update group descriptor block for new group */ 1147 /* Update group descriptor block for new group */
1096 gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data + 1148 gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
1097 gdb_off * EXT4_DESC_SIZE(sb)); 1149 gdb_off * EXT4_DESC_SIZE(sb));
1098 1150
1099 memset(gdp, 0, EXT4_DESC_SIZE(sb)); 1151 memset(gdp, 0, EXT4_DESC_SIZE(sb));
1100 ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); 1152 ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
1101 ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); 1153 ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
1154 err = ext4_set_bitmap_checksums(sb, group, gdp, group_data);
1155 if (err) {
1156 ext4_std_error(sb, err);
1157 break;
1158 }
1159
1102 ext4_inode_table_set(sb, gdp, group_data->inode_table); 1160 ext4_inode_table_set(sb, gdp, group_data->inode_table);
1103 ext4_free_group_clusters_set(sb, gdp, 1161 ext4_free_group_clusters_set(sb, gdp,
1104 EXT4_B2C(sbi, group_data->free_blocks_count)); 1162 EXT4_B2C(sbi, group_data->free_blocks_count));
1105 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); 1163 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
1106 gdp->bg_flags = cpu_to_le16(*bg_flags); 1164 gdp->bg_flags = cpu_to_le16(*bg_flags);
1107 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 1165 ext4_group_desc_csum_set(sb, group, gdp);
1108 1166
1109 err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); 1167 err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
1110 if (unlikely(err)) { 1168 if (unlikely(err)) {
@@ -1343,17 +1401,14 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
1343 (1 + ext4_bg_num_gdb(sb, group + i) + 1401 (1 + ext4_bg_num_gdb(sb, group + i) +
1344 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; 1402 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
1345 group_data[i].free_blocks_count = blocks_per_group - overhead; 1403 group_data[i].free_blocks_count = blocks_per_group - overhead;
1346 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 1404 if (ext4_has_group_desc_csum(sb))
1347 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
1348 flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | 1405 flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
1349 EXT4_BG_INODE_UNINIT; 1406 EXT4_BG_INODE_UNINIT;
1350 else 1407 else
1351 flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; 1408 flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
1352 } 1409 }
1353 1410
1354 if (last_group == n_group && 1411 if (last_group == n_group && ext4_has_group_desc_csum(sb))
1355 EXT4_HAS_RO_COMPAT_FEATURE(sb,
1356 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
1357 /* We need to initialize block bitmap of last group. */ 1412 /* We need to initialize block bitmap of last group. */
1358 flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; 1413 flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
1359 1414
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 35b5954489ee..eb7aa3e4ef05 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -112,6 +112,48 @@ static struct file_system_type ext3_fs_type = {
112#define IS_EXT3_SB(sb) (0) 112#define IS_EXT3_SB(sb) (0)
113#endif 113#endif
114 114
115static int ext4_verify_csum_type(struct super_block *sb,
116 struct ext4_super_block *es)
117{
118 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
119 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
120 return 1;
121
122 return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
123}
124
125static __le32 ext4_superblock_csum(struct super_block *sb,
126 struct ext4_super_block *es)
127{
128 struct ext4_sb_info *sbi = EXT4_SB(sb);
129 int offset = offsetof(struct ext4_super_block, s_checksum);
130 __u32 csum;
131
132 csum = ext4_chksum(sbi, ~0, (char *)es, offset);
133
134 return cpu_to_le32(csum);
135}
136
137int ext4_superblock_csum_verify(struct super_block *sb,
138 struct ext4_super_block *es)
139{
140 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
141 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
142 return 1;
143
144 return es->s_checksum == ext4_superblock_csum(sb, es);
145}
146
147void ext4_superblock_csum_set(struct super_block *sb,
148 struct ext4_super_block *es)
149{
150 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
151 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
152 return;
153
154 es->s_checksum = ext4_superblock_csum(sb, es);
155}
156
115void *ext4_kvmalloc(size_t size, gfp_t flags) 157void *ext4_kvmalloc(size_t size, gfp_t flags)
116{ 158{
117 void *ret; 159 void *ret;
@@ -497,6 +539,7 @@ void __ext4_error(struct super_block *sb, const char *function,
497 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", 539 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
498 sb->s_id, function, line, current->comm, &vaf); 540 sb->s_id, function, line, current->comm, &vaf);
499 va_end(args); 541 va_end(args);
542 save_error_info(sb, function, line);
500 543
501 ext4_handle_error(sb); 544 ext4_handle_error(sb);
502} 545}
@@ -905,6 +948,8 @@ static void ext4_put_super(struct super_block *sb)
905 unlock_super(sb); 948 unlock_super(sb);
906 kobject_put(&sbi->s_kobj); 949 kobject_put(&sbi->s_kobj);
907 wait_for_completion(&sbi->s_kobj_unregister); 950 wait_for_completion(&sbi->s_kobj_unregister);
951 if (sbi->s_chksum_driver)
952 crypto_free_shash(sbi->s_chksum_driver);
908 kfree(sbi->s_blockgroup_lock); 953 kfree(sbi->s_blockgroup_lock);
909 kfree(sbi); 954 kfree(sbi);
910} 955}
@@ -1922,43 +1967,69 @@ failed:
1922 return 0; 1967 return 0;
1923} 1968}
1924 1969
1925__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, 1970static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
1926 struct ext4_group_desc *gdp) 1971 struct ext4_group_desc *gdp)
1927{ 1972{
1973 int offset;
1928 __u16 crc = 0; 1974 __u16 crc = 0;
1975 __le32 le_group = cpu_to_le32(block_group);
1929 1976
1930 if (sbi->s_es->s_feature_ro_compat & 1977 if ((sbi->s_es->s_feature_ro_compat &
1931 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { 1978 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
1932 int offset = offsetof(struct ext4_group_desc, bg_checksum); 1979 /* Use new metadata_csum algorithm */
1933 __le32 le_group = cpu_to_le32(block_group); 1980 __u16 old_csum;
1934 1981 __u32 csum32;
1935 crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); 1982
1936 crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); 1983 old_csum = gdp->bg_checksum;
1937 crc = crc16(crc, (__u8 *)gdp, offset); 1984 gdp->bg_checksum = 0;
1938 offset += sizeof(gdp->bg_checksum); /* skip checksum */ 1985 csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
1939 /* for checksum of struct ext4_group_desc do the rest...*/ 1986 sizeof(le_group));
1940 if ((sbi->s_es->s_feature_incompat & 1987 csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
1941 cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && 1988 sbi->s_desc_size);
1942 offset < le16_to_cpu(sbi->s_es->s_desc_size)) 1989 gdp->bg_checksum = old_csum;
1943 crc = crc16(crc, (__u8 *)gdp + offset, 1990
1944 le16_to_cpu(sbi->s_es->s_desc_size) - 1991 crc = csum32 & 0xFFFF;
1945 offset); 1992 goto out;
1946 } 1993 }
1947 1994
1995 /* old crc16 code */
1996 offset = offsetof(struct ext4_group_desc, bg_checksum);
1997
1998 crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
1999 crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
2000 crc = crc16(crc, (__u8 *)gdp, offset);
2001 offset += sizeof(gdp->bg_checksum); /* skip checksum */
2002 /* for checksum of struct ext4_group_desc do the rest...*/
2003 if ((sbi->s_es->s_feature_incompat &
2004 cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
2005 offset < le16_to_cpu(sbi->s_es->s_desc_size))
2006 crc = crc16(crc, (__u8 *)gdp + offset,
2007 le16_to_cpu(sbi->s_es->s_desc_size) -
2008 offset);
2009
2010out:
1948 return cpu_to_le16(crc); 2011 return cpu_to_le16(crc);
1949} 2012}
1950 2013
1951int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, 2014int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
1952 struct ext4_group_desc *gdp) 2015 struct ext4_group_desc *gdp)
1953{ 2016{
1954 if ((sbi->s_es->s_feature_ro_compat & 2017 if (ext4_has_group_desc_csum(sb) &&
1955 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) && 2018 (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
1956 (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp))) 2019 block_group, gdp)))
1957 return 0; 2020 return 0;
1958 2021
1959 return 1; 2022 return 1;
1960} 2023}
1961 2024
2025void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
2026 struct ext4_group_desc *gdp)
2027{
2028 if (!ext4_has_group_desc_csum(sb))
2029 return;
2030 gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
2031}
2032
1962/* Called at mount-time, super-block is locked */ 2033/* Called at mount-time, super-block is locked */
1963static int ext4_check_descriptors(struct super_block *sb, 2034static int ext4_check_descriptors(struct super_block *sb,
1964 ext4_group_t *first_not_zeroed) 2035 ext4_group_t *first_not_zeroed)
@@ -2013,7 +2084,7 @@ static int ext4_check_descriptors(struct super_block *sb,
2013 return 0; 2084 return 0;
2014 } 2085 }
2015 ext4_lock_group(sb, i); 2086 ext4_lock_group(sb, i);
2016 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { 2087 if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
2017 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " 2088 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2018 "Checksum for group %u failed (%u!=%u)", 2089 "Checksum for group %u failed (%u!=%u)",
2019 i, le16_to_cpu(ext4_group_desc_csum(sbi, i, 2090 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
@@ -2417,6 +2488,23 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
2417 return count; 2488 return count;
2418} 2489}
2419 2490
2491static ssize_t trigger_test_error(struct ext4_attr *a,
2492 struct ext4_sb_info *sbi,
2493 const char *buf, size_t count)
2494{
2495 int len = count;
2496
2497 if (!capable(CAP_SYS_ADMIN))
2498 return -EPERM;
2499
2500 if (len && buf[len-1] == '\n')
2501 len--;
2502
2503 if (len)
2504 ext4_error(sbi->s_sb, "%.*s", len, buf);
2505 return count;
2506}
2507
2420#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ 2508#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2421static struct ext4_attr ext4_attr_##_name = { \ 2509static struct ext4_attr ext4_attr_##_name = { \
2422 .attr = {.name = __stringify(_name), .mode = _mode }, \ 2510 .attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -2447,6 +2535,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2447EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2535EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2448EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2536EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2449EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); 2537EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2538EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
2450 2539
2451static struct attribute *ext4_attrs[] = { 2540static struct attribute *ext4_attrs[] = {
2452 ATTR_LIST(delayed_allocation_blocks), 2541 ATTR_LIST(delayed_allocation_blocks),
@@ -2461,6 +2550,7 @@ static struct attribute *ext4_attrs[] = {
2461 ATTR_LIST(mb_stream_req), 2550 ATTR_LIST(mb_stream_req),
2462 ATTR_LIST(mb_group_prealloc), 2551 ATTR_LIST(mb_group_prealloc),
2463 ATTR_LIST(max_writeback_mb_bump), 2552 ATTR_LIST(max_writeback_mb_bump),
2553 ATTR_LIST(trigger_fs_error),
2464 NULL, 2554 NULL,
2465}; 2555};
2466 2556
@@ -2957,6 +3047,44 @@ static void ext4_destroy_lazyinit_thread(void)
2957 kthread_stop(ext4_lazyinit_task); 3047 kthread_stop(ext4_lazyinit_task);
2958} 3048}
2959 3049
3050static int set_journal_csum_feature_set(struct super_block *sb)
3051{
3052 int ret = 1;
3053 int compat, incompat;
3054 struct ext4_sb_info *sbi = EXT4_SB(sb);
3055
3056 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3057 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3058 /* journal checksum v2 */
3059 compat = 0;
3060 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2;
3061 } else {
3062 /* journal checksum v1 */
3063 compat = JBD2_FEATURE_COMPAT_CHECKSUM;
3064 incompat = 0;
3065 }
3066
3067 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
3068 ret = jbd2_journal_set_features(sbi->s_journal,
3069 compat, 0,
3070 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3071 incompat);
3072 } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
3073 ret = jbd2_journal_set_features(sbi->s_journal,
3074 compat, 0,
3075 incompat);
3076 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3077 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3078 } else {
3079 jbd2_journal_clear_features(sbi->s_journal,
3080 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3081 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3082 JBD2_FEATURE_INCOMPAT_CSUM_V2);
3083 }
3084
3085 return ret;
3086}
3087
2960static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3088static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2961{ 3089{
2962 char *orig_data = kstrdup(data, GFP_KERNEL); 3090 char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -2993,6 +3121,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2993 goto out_free_orig; 3121 goto out_free_orig;
2994 } 3122 }
2995 sb->s_fs_info = sbi; 3123 sb->s_fs_info = sbi;
3124 sbi->s_sb = sb;
2996 sbi->s_mount_opt = 0; 3125 sbi->s_mount_opt = 0;
2997 sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID); 3126 sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
2998 sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID); 3127 sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
@@ -3032,13 +3161,54 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3032 * Note: s_es must be initialized as soon as possible because 3161 * Note: s_es must be initialized as soon as possible because
3033 * some ext4 macro-instructions depend on its value 3162 * some ext4 macro-instructions depend on its value
3034 */ 3163 */
3035 es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); 3164 es = (struct ext4_super_block *) (bh->b_data + offset);
3036 sbi->s_es = es; 3165 sbi->s_es = es;
3037 sb->s_magic = le16_to_cpu(es->s_magic); 3166 sb->s_magic = le16_to_cpu(es->s_magic);
3038 if (sb->s_magic != EXT4_SUPER_MAGIC) 3167 if (sb->s_magic != EXT4_SUPER_MAGIC)
3039 goto cantfind_ext4; 3168 goto cantfind_ext4;
3040 sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); 3169 sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
3041 3170
3171 /* Warn if metadata_csum and gdt_csum are both set. */
3172 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3173 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
3174 EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3175 ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are "
3176 "redundant flags; please run fsck.");
3177
3178 /* Check for a known checksum algorithm */
3179 if (!ext4_verify_csum_type(sb, es)) {
3180 ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3181 "unknown checksum algorithm.");
3182 silent = 1;
3183 goto cantfind_ext4;
3184 }
3185
3186 /* Load the checksum driver */
3187 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3188 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3189 sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
3190 if (IS_ERR(sbi->s_chksum_driver)) {
3191 ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
3192 ret = PTR_ERR(sbi->s_chksum_driver);
3193 sbi->s_chksum_driver = NULL;
3194 goto failed_mount;
3195 }
3196 }
3197
3198 /* Check superblock checksum */
3199 if (!ext4_superblock_csum_verify(sb, es)) {
3200 ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3201 "invalid superblock checksum. Run e2fsck?");
3202 silent = 1;
3203 goto cantfind_ext4;
3204 }
3205
3206 /* Precompute checksum seed for all metadata */
3207 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3208 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
3209 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
3210 sizeof(es->s_uuid));
3211
3042 /* Set defaults before we parse the mount options */ 3212 /* Set defaults before we parse the mount options */
3043 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 3213 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3044 set_opt(sb, INIT_INODE_TABLE); 3214 set_opt(sb, INIT_INODE_TABLE);
@@ -3200,7 +3370,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3200 "Can't read superblock on 2nd try"); 3370 "Can't read superblock on 2nd try");
3201 goto failed_mount; 3371 goto failed_mount;
3202 } 3372 }
3203 es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); 3373 es = (struct ext4_super_block *)(bh->b_data + offset);
3204 sbi->s_es = es; 3374 sbi->s_es = es;
3205 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { 3375 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
3206 ext4_msg(sb, KERN_ERR, 3376 ext4_msg(sb, KERN_ERR,
@@ -3392,6 +3562,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3392 GFP_KERNEL); 3562 GFP_KERNEL);
3393 if (sbi->s_group_desc == NULL) { 3563 if (sbi->s_group_desc == NULL) {
3394 ext4_msg(sb, KERN_ERR, "not enough memory"); 3564 ext4_msg(sb, KERN_ERR, "not enough memory");
3565 ret = -ENOMEM;
3395 goto failed_mount; 3566 goto failed_mount;
3396 } 3567 }
3397 3568
@@ -3449,6 +3620,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3449 } 3620 }
3450 if (err) { 3621 if (err) {
3451 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3622 ext4_msg(sb, KERN_ERR, "insufficient memory");
3623 ret = err;
3452 goto failed_mount3; 3624 goto failed_mount3;
3453 } 3625 }
3454 3626
@@ -3506,26 +3678,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3506 goto no_journal; 3678 goto no_journal;
3507 } 3679 }
3508 3680
3509 if (ext4_blocks_count(es) > 0xffffffffULL && 3681 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
3510 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 3682 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
3511 JBD2_FEATURE_INCOMPAT_64BIT)) { 3683 JBD2_FEATURE_INCOMPAT_64BIT)) {
3512 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); 3684 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
3513 goto failed_mount_wq; 3685 goto failed_mount_wq;
3514 } 3686 }
3515 3687
3516 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 3688 if (!set_journal_csum_feature_set(sb)) {
3517 jbd2_journal_set_features(sbi->s_journal, 3689 ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
3518 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 3690 "feature set");
3519 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 3691 goto failed_mount_wq;
3520 } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
3521 jbd2_journal_set_features(sbi->s_journal,
3522 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
3523 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3524 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3525 } else {
3526 jbd2_journal_clear_features(sbi->s_journal,
3527 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3528 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3529 } 3692 }
3530 3693
3531 /* We have now updated the journal if required, so we can 3694 /* We have now updated the journal if required, so we can
@@ -3606,7 +3769,8 @@ no_journal:
3606 goto failed_mount4; 3769 goto failed_mount4;
3607 } 3770 }
3608 3771
3609 ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); 3772 if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
3773 sb->s_flags |= MS_RDONLY;
3610 3774
3611 /* determine the minimum size of new large inodes, if present */ 3775 /* determine the minimum size of new large inodes, if present */
3612 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { 3776 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
@@ -3641,7 +3805,7 @@ no_journal:
3641 } 3805 }
3642 3806
3643 ext4_ext_init(sb); 3807 ext4_ext_init(sb);
3644 err = ext4_mb_init(sb, needs_recovery); 3808 err = ext4_mb_init(sb);
3645 if (err) { 3809 if (err) {
3646 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", 3810 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
3647 err); 3811 err);
@@ -3724,6 +3888,8 @@ failed_mount2:
3724 brelse(sbi->s_group_desc[i]); 3888 brelse(sbi->s_group_desc[i]);
3725 ext4_kvfree(sbi->s_group_desc); 3889 ext4_kvfree(sbi->s_group_desc);
3726failed_mount: 3890failed_mount:
3891 if (sbi->s_chksum_driver)
3892 crypto_free_shash(sbi->s_chksum_driver);
3727 if (sbi->s_proc) { 3893 if (sbi->s_proc) {
3728 remove_proc_entry("options", sbi->s_proc); 3894 remove_proc_entry("options", sbi->s_proc);
3729 remove_proc_entry(sb->s_id, ext4_proc_root); 3895 remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -3847,7 +4013,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
3847 goto out_bdev; 4013 goto out_bdev;
3848 } 4014 }
3849 4015
3850 es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); 4016 es = (struct ext4_super_block *) (bh->b_data + offset);
3851 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || 4017 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
3852 !(le32_to_cpu(es->s_feature_incompat) & 4018 !(le32_to_cpu(es->s_feature_incompat) &
3853 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { 4019 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -4039,6 +4205,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4039 &EXT4_SB(sb)->s_freeinodes_counter)); 4205 &EXT4_SB(sb)->s_freeinodes_counter));
4040 sb->s_dirt = 0; 4206 sb->s_dirt = 0;
4041 BUFFER_TRACE(sbh, "marking dirty"); 4207 BUFFER_TRACE(sbh, "marking dirty");
4208 ext4_superblock_csum_set(sb, es);
4042 mark_buffer_dirty(sbh); 4209 mark_buffer_dirty(sbh);
4043 if (sync) { 4210 if (sync) {
4044 error = sync_dirty_buffer(sbh); 4211 error = sync_dirty_buffer(sbh);
@@ -4333,7 +4500,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4333 struct ext4_group_desc *gdp = 4500 struct ext4_group_desc *gdp =
4334 ext4_get_group_desc(sb, g, NULL); 4501 ext4_get_group_desc(sb, g, NULL);
4335 4502
4336 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { 4503 if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
4337 ext4_msg(sb, KERN_ERR, 4504 ext4_msg(sb, KERN_ERR,
4338 "ext4_remount: Checksum for group %u failed (%u!=%u)", 4505 "ext4_remount: Checksum for group %u failed (%u!=%u)",
4339 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), 4506 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e88748e55c0f..e56c9ed7d6e3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -122,6 +122,58 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
122 NULL 122 NULL
123}; 123};
124 124
125static __le32 ext4_xattr_block_csum(struct inode *inode,
126 sector_t block_nr,
127 struct ext4_xattr_header *hdr)
128{
129 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
130 struct ext4_inode_info *ei = EXT4_I(inode);
131 __u32 csum, old;
132
133 old = hdr->h_checksum;
134 hdr->h_checksum = 0;
135 if (le32_to_cpu(hdr->h_refcount) != 1) {
136 block_nr = cpu_to_le64(block_nr);
137 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr,
138 sizeof(block_nr));
139 } else
140 csum = ei->i_csum_seed;
141 csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
142 EXT4_BLOCK_SIZE(inode->i_sb));
143 hdr->h_checksum = old;
144 return cpu_to_le32(csum);
145}
146
147static int ext4_xattr_block_csum_verify(struct inode *inode,
148 sector_t block_nr,
149 struct ext4_xattr_header *hdr)
150{
151 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
152 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
153 (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr)))
154 return 0;
155 return 1;
156}
157
158static void ext4_xattr_block_csum_set(struct inode *inode,
159 sector_t block_nr,
160 struct ext4_xattr_header *hdr)
161{
162 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
163 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
164 return;
165
166 hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr);
167}
168
169static inline int ext4_handle_dirty_xattr_block(handle_t *handle,
170 struct inode *inode,
171 struct buffer_head *bh)
172{
173 ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh));
174 return ext4_handle_dirty_metadata(handle, inode, bh);
175}
176
125static inline const struct xattr_handler * 177static inline const struct xattr_handler *
126ext4_xattr_handler(int name_index) 178ext4_xattr_handler(int name_index)
127{ 179{
@@ -156,12 +208,22 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
156} 208}
157 209
158static inline int 210static inline int
159ext4_xattr_check_block(struct buffer_head *bh) 211ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
160{ 212{
213 int error;
214
215 if (buffer_verified(bh))
216 return 0;
217
161 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 218 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
162 BHDR(bh)->h_blocks != cpu_to_le32(1)) 219 BHDR(bh)->h_blocks != cpu_to_le32(1))
163 return -EIO; 220 return -EIO;
164 return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); 221 if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
222 return -EIO;
223 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
224 if (!error)
225 set_buffer_verified(bh);
226 return error;
165} 227}
166 228
167static inline int 229static inline int
@@ -224,7 +286,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
224 goto cleanup; 286 goto cleanup;
225 ea_bdebug(bh, "b_count=%d, refcount=%d", 287 ea_bdebug(bh, "b_count=%d, refcount=%d",
226 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 288 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
227 if (ext4_xattr_check_block(bh)) { 289 if (ext4_xattr_check_block(inode, bh)) {
228bad_block: 290bad_block:
229 EXT4_ERROR_INODE(inode, "bad block %llu", 291 EXT4_ERROR_INODE(inode, "bad block %llu",
230 EXT4_I(inode)->i_file_acl); 292 EXT4_I(inode)->i_file_acl);
@@ -369,7 +431,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
369 goto cleanup; 431 goto cleanup;
370 ea_bdebug(bh, "b_count=%d, refcount=%d", 432 ea_bdebug(bh, "b_count=%d, refcount=%d",
371 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 433 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
372 if (ext4_xattr_check_block(bh)) { 434 if (ext4_xattr_check_block(inode, bh)) {
373 EXT4_ERROR_INODE(inode, "bad block %llu", 435 EXT4_ERROR_INODE(inode, "bad block %llu",
374 EXT4_I(inode)->i_file_acl); 436 EXT4_I(inode)->i_file_acl);
375 error = -EIO; 437 error = -EIO;
@@ -492,7 +554,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
492 if (ce) 554 if (ce)
493 mb_cache_entry_release(ce); 555 mb_cache_entry_release(ce);
494 unlock_buffer(bh); 556 unlock_buffer(bh);
495 error = ext4_handle_dirty_metadata(handle, inode, bh); 557 error = ext4_handle_dirty_xattr_block(handle, inode, bh);
496 if (IS_SYNC(inode)) 558 if (IS_SYNC(inode))
497 ext4_handle_sync(handle); 559 ext4_handle_sync(handle);
498 dquot_free_block(inode, 1); 560 dquot_free_block(inode, 1);
@@ -662,7 +724,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
662 ea_bdebug(bs->bh, "b_count=%d, refcount=%d", 724 ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
663 atomic_read(&(bs->bh->b_count)), 725 atomic_read(&(bs->bh->b_count)),
664 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 726 le32_to_cpu(BHDR(bs->bh)->h_refcount));
665 if (ext4_xattr_check_block(bs->bh)) { 727 if (ext4_xattr_check_block(inode, bs->bh)) {
666 EXT4_ERROR_INODE(inode, "bad block %llu", 728 EXT4_ERROR_INODE(inode, "bad block %llu",
667 EXT4_I(inode)->i_file_acl); 729 EXT4_I(inode)->i_file_acl);
668 error = -EIO; 730 error = -EIO;
@@ -725,9 +787,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
725 if (error == -EIO) 787 if (error == -EIO)
726 goto bad_block; 788 goto bad_block;
727 if (!error) 789 if (!error)
728 error = ext4_handle_dirty_metadata(handle, 790 error = ext4_handle_dirty_xattr_block(handle,
729 inode, 791 inode,
730 bs->bh); 792 bs->bh);
731 if (error) 793 if (error)
732 goto cleanup; 794 goto cleanup;
733 goto inserted; 795 goto inserted;
@@ -796,9 +858,9 @@ inserted:
796 ea_bdebug(new_bh, "reusing; refcount now=%d", 858 ea_bdebug(new_bh, "reusing; refcount now=%d",
797 le32_to_cpu(BHDR(new_bh)->h_refcount)); 859 le32_to_cpu(BHDR(new_bh)->h_refcount));
798 unlock_buffer(new_bh); 860 unlock_buffer(new_bh);
799 error = ext4_handle_dirty_metadata(handle, 861 error = ext4_handle_dirty_xattr_block(handle,
800 inode, 862 inode,
801 new_bh); 863 new_bh);
802 if (error) 864 if (error)
803 goto cleanup_dquot; 865 goto cleanup_dquot;
804 } 866 }
@@ -855,8 +917,8 @@ getblk_failed:
855 set_buffer_uptodate(new_bh); 917 set_buffer_uptodate(new_bh);
856 unlock_buffer(new_bh); 918 unlock_buffer(new_bh);
857 ext4_xattr_cache_insert(new_bh); 919 ext4_xattr_cache_insert(new_bh);
858 error = ext4_handle_dirty_metadata(handle, 920 error = ext4_handle_dirty_xattr_block(handle,
859 inode, new_bh); 921 inode, new_bh);
860 if (error) 922 if (error)
861 goto cleanup; 923 goto cleanup;
862 } 924 }
@@ -1193,7 +1255,7 @@ retry:
1193 error = -EIO; 1255 error = -EIO;
1194 if (!bh) 1256 if (!bh)
1195 goto cleanup; 1257 goto cleanup;
1196 if (ext4_xattr_check_block(bh)) { 1258 if (ext4_xattr_check_block(inode, bh)) {
1197 EXT4_ERROR_INODE(inode, "bad block %llu", 1259 EXT4_ERROR_INODE(inode, "bad block %llu",
1198 EXT4_I(inode)->i_file_acl); 1260 EXT4_I(inode)->i_file_acl);
1199 error = -EIO; 1261 error = -EIO;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 25b7387ff183..91f31ca7d9af 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -27,7 +27,9 @@ struct ext4_xattr_header {
27 __le32 h_refcount; /* reference count */ 27 __le32 h_refcount; /* reference count */
28 __le32 h_blocks; /* number of disk blocks used */ 28 __le32 h_blocks; /* number of disk blocks used */
29 __le32 h_hash; /* hash value of all attributes */ 29 __le32 h_hash; /* hash value of all attributes */
30 __u32 h_reserved[4]; /* zero right now */ 30 __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */
31 /* id = inum if refcount=1, blknum otherwise */
32 __u32 h_reserved[3]; /* zero right now */
31}; 33};
32 34
33struct ext4_xattr_ibody_header { 35struct ext4_xattr_ibody_header {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c2973ea5df9a..a3d81ebf6d86 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -735,10 +735,9 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
735} 735}
736 736
737static int 737static int
738fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) 738fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent)
739{ 739{
740 int len = *lenp; 740 int len = *lenp;
741 struct inode *inode = de->d_inode;
742 u32 ipos_h, ipos_m, ipos_l; 741 u32 ipos_h, ipos_m, ipos_l;
743 742
744 if (len < 5) { 743 if (len < 5) {
@@ -754,9 +753,9 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
754 fh[1] = inode->i_generation; 753 fh[1] = inode->i_generation;
755 fh[2] = ipos_h; 754 fh[2] = ipos_h;
756 fh[3] = ipos_m | MSDOS_I(inode)->i_logstart; 755 fh[3] = ipos_m | MSDOS_I(inode)->i_logstart;
757 spin_lock(&de->d_lock); 756 fh[4] = ipos_l;
758 fh[4] = ipos_l | MSDOS_I(de->d_parent->d_inode)->i_logstart; 757 if (parent)
759 spin_unlock(&de->d_lock); 758 fh[4] |= MSDOS_I(parent)->i_logstart;
760 return 3; 759 return 3;
761} 760}
762 761
diff --git a/fs/fcntl.c b/fs/fcntl.c
index d078b75572a7..81b70e665bf0 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -442,28 +442,24 @@ static int check_fcntl_cmd(unsigned cmd)
442SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 442SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
443{ 443{
444 struct file *filp; 444 struct file *filp;
445 int fput_needed;
445 long err = -EBADF; 446 long err = -EBADF;
446 447
447 filp = fget_raw(fd); 448 filp = fget_raw_light(fd, &fput_needed);
448 if (!filp) 449 if (!filp)
449 goto out; 450 goto out;
450 451
451 if (unlikely(filp->f_mode & FMODE_PATH)) { 452 if (unlikely(filp->f_mode & FMODE_PATH)) {
452 if (!check_fcntl_cmd(cmd)) { 453 if (!check_fcntl_cmd(cmd))
453 fput(filp); 454 goto out1;
454 goto out;
455 }
456 } 455 }
457 456
458 err = security_file_fcntl(filp, cmd, arg); 457 err = security_file_fcntl(filp, cmd, arg);
459 if (err) { 458 if (!err)
460 fput(filp); 459 err = do_fcntl(fd, cmd, arg, filp);
461 return err;
462 }
463 460
464 err = do_fcntl(fd, cmd, arg, filp); 461out1:
465 462 fput_light(filp, fput_needed);
466 fput(filp);
467out: 463out:
468 return err; 464 return err;
469} 465}
@@ -473,26 +469,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
473 unsigned long, arg) 469 unsigned long, arg)
474{ 470{
475 struct file * filp; 471 struct file * filp;
476 long err; 472 long err = -EBADF;
473 int fput_needed;
477 474
478 err = -EBADF; 475 filp = fget_raw_light(fd, &fput_needed);
479 filp = fget_raw(fd);
480 if (!filp) 476 if (!filp)
481 goto out; 477 goto out;
482 478
483 if (unlikely(filp->f_mode & FMODE_PATH)) { 479 if (unlikely(filp->f_mode & FMODE_PATH)) {
484 if (!check_fcntl_cmd(cmd)) { 480 if (!check_fcntl_cmd(cmd))
485 fput(filp); 481 goto out1;
486 goto out;
487 }
488 } 482 }
489 483
490 err = security_file_fcntl(filp, cmd, arg); 484 err = security_file_fcntl(filp, cmd, arg);
491 if (err) { 485 if (err)
492 fput(filp); 486 goto out1;
493 return err;
494 }
495 err = -EBADF;
496 487
497 switch (cmd) { 488 switch (cmd) {
498 case F_GETLK64: 489 case F_GETLK64:
@@ -507,7 +498,8 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
507 err = do_fcntl(fd, cmd, arg, filp); 498 err = do_fcntl(fd, cmd, arg, filp);
508 break; 499 break;
509 } 500 }
510 fput(filp); 501out1:
502 fput_light(filp, fput_needed);
511out: 503out:
512 return err; 504 return err;
513} 505}
diff --git a/fs/file_table.c b/fs/file_table.c
index 70f2a0fd6aec..a305d9e2d1b2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -34,7 +34,6 @@ struct files_stat_struct files_stat = {
34 .max_files = NR_FILE 34 .max_files = NR_FILE
35}; 35};
36 36
37DECLARE_LGLOCK(files_lglock);
38DEFINE_LGLOCK(files_lglock); 37DEFINE_LGLOCK(files_lglock);
39 38
40/* SLAB cache for file structures */ 39/* SLAB cache for file structures */
@@ -421,9 +420,9 @@ static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
421 */ 420 */
422void file_sb_list_add(struct file *file, struct super_block *sb) 421void file_sb_list_add(struct file *file, struct super_block *sb)
423{ 422{
424 lg_local_lock(files_lglock); 423 lg_local_lock(&files_lglock);
425 __file_sb_list_add(file, sb); 424 __file_sb_list_add(file, sb);
426 lg_local_unlock(files_lglock); 425 lg_local_unlock(&files_lglock);
427} 426}
428 427
429/** 428/**
@@ -436,9 +435,9 @@ void file_sb_list_add(struct file *file, struct super_block *sb)
436void file_sb_list_del(struct file *file) 435void file_sb_list_del(struct file *file)
437{ 436{
438 if (!list_empty(&file->f_u.fu_list)) { 437 if (!list_empty(&file->f_u.fu_list)) {
439 lg_local_lock_cpu(files_lglock, file_list_cpu(file)); 438 lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
440 list_del_init(&file->f_u.fu_list); 439 list_del_init(&file->f_u.fu_list);
441 lg_local_unlock_cpu(files_lglock, file_list_cpu(file)); 440 lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
442 } 441 }
443} 442}
444 443
@@ -485,7 +484,7 @@ void mark_files_ro(struct super_block *sb)
485 struct file *f; 484 struct file *f;
486 485
487retry: 486retry:
488 lg_global_lock(files_lglock); 487 lg_global_lock(&files_lglock);
489 do_file_list_for_each_entry(sb, f) { 488 do_file_list_for_each_entry(sb, f) {
490 struct vfsmount *mnt; 489 struct vfsmount *mnt;
491 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) 490 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
@@ -502,12 +501,12 @@ retry:
502 file_release_write(f); 501 file_release_write(f);
503 mnt = mntget(f->f_path.mnt); 502 mnt = mntget(f->f_path.mnt);
504 /* This can sleep, so we can't hold the spinlock. */ 503 /* This can sleep, so we can't hold the spinlock. */
505 lg_global_unlock(files_lglock); 504 lg_global_unlock(&files_lglock);
506 mnt_drop_write(mnt); 505 mnt_drop_write(mnt);
507 mntput(mnt); 506 mntput(mnt);
508 goto retry; 507 goto retry;
509 } while_file_list_for_each_entry; 508 } while_file_list_for_each_entry;
510 lg_global_unlock(files_lglock); 509 lg_global_unlock(&files_lglock);
511} 510}
512 511
513void __init files_init(unsigned long mempages) 512void __init files_init(unsigned long mempages)
@@ -525,6 +524,6 @@ void __init files_init(unsigned long mempages)
525 n = (mempages * (PAGE_SIZE / 1024)) / 10; 524 n = (mempages * (PAGE_SIZE / 1024)) / 10;
526 files_stat.max_files = max_t(unsigned long, n, NR_FILE); 525 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
527 files_defer_init(); 526 files_defer_init();
528 lg_lock_init(files_lglock); 527 lg_lock_init(&files_lglock, "files_lglock");
529 percpu_counter_init(&nr_files, 0); 528 percpu_counter_init(&nr_files, 0);
530} 529}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 504e61b7fd75..9562109d3a87 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -962,7 +962,9 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
962 if (err) 962 if (err)
963 goto out; 963 goto out;
964 964
965 file_update_time(file); 965 err = file_update_time(file);
966 if (err)
967 goto out;
966 968
967 if (file->f_flags & O_DIRECT) { 969 if (file->f_flags & O_DIRECT) {
968 written = generic_file_direct_write(iocb, iov, &nr_segs, 970 written = generic_file_direct_write(iocb, iov, &nr_segs,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 56f6dcf30768..42678a33b7bb 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -627,12 +627,10 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
627 return ERR_PTR(err); 627 return ERR_PTR(err);
628} 628}
629 629
630static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, 630static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
631 int connectable) 631 struct inode *parent)
632{ 632{
633 struct inode *inode = dentry->d_inode; 633 int len = parent ? 6 : 3;
634 bool encode_parent = connectable && !S_ISDIR(inode->i_mode);
635 int len = encode_parent ? 6 : 3;
636 u64 nodeid; 634 u64 nodeid;
637 u32 generation; 635 u32 generation;
638 636
@@ -648,14 +646,9 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
648 fh[1] = (u32)(nodeid & 0xffffffff); 646 fh[1] = (u32)(nodeid & 0xffffffff);
649 fh[2] = generation; 647 fh[2] = generation;
650 648
651 if (encode_parent) { 649 if (parent) {
652 struct inode *parent;
653
654 spin_lock(&dentry->d_lock);
655 parent = dentry->d_parent->d_inode;
656 nodeid = get_fuse_inode(parent)->nodeid; 650 nodeid = get_fuse_inode(parent)->nodeid;
657 generation = parent->i_generation; 651 generation = parent->i_generation;
658 spin_unlock(&dentry->d_lock);
659 652
660 fh[3] = (u32)(nodeid >> 32); 653 fh[3] = (u32)(nodeid >> 32);
661 fh[4] = (u32)(nodeid & 0xffffffff); 654 fh[4] = (u32)(nodeid & 0xffffffff);
@@ -663,7 +656,7 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
663 } 656 }
664 657
665 *max_len = len; 658 *max_len = len;
666 return encode_parent ? 0x82 : 0x81; 659 return parent ? 0x82 : 0x81;
667} 660}
668 661
669static struct dentry *fuse_fh_to_dentry(struct super_block *sb, 662static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 70ba891654f8..e8ed6d4a6181 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -28,15 +28,14 @@
28#define GFS2_LARGE_FH_SIZE 8 28#define GFS2_LARGE_FH_SIZE 8
29#define GFS2_OLD_FH_SIZE 10 29#define GFS2_OLD_FH_SIZE 10
30 30
31static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, 31static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
32 int connectable) 32 struct inode *parent)
33{ 33{
34 __be32 *fh = (__force __be32 *)p; 34 __be32 *fh = (__force __be32 *)p;
35 struct inode *inode = dentry->d_inode;
36 struct super_block *sb = inode->i_sb; 35 struct super_block *sb = inode->i_sb;
37 struct gfs2_inode *ip = GFS2_I(inode); 36 struct gfs2_inode *ip = GFS2_I(inode);
38 37
39 if (connectable && (*len < GFS2_LARGE_FH_SIZE)) { 38 if (parent && (*len < GFS2_LARGE_FH_SIZE)) {
40 *len = GFS2_LARGE_FH_SIZE; 39 *len = GFS2_LARGE_FH_SIZE;
41 return 255; 40 return 255;
42 } else if (*len < GFS2_SMALL_FH_SIZE) { 41 } else if (*len < GFS2_SMALL_FH_SIZE) {
@@ -50,14 +49,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
50 fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); 49 fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
51 *len = GFS2_SMALL_FH_SIZE; 50 *len = GFS2_SMALL_FH_SIZE;
52 51
53 if (!connectable || inode == sb->s_root->d_inode) 52 if (!parent || inode == sb->s_root->d_inode)
54 return *len; 53 return *len;
55 54
56 spin_lock(&dentry->d_lock); 55 ip = GFS2_I(parent);
57 inode = dentry->d_parent->d_inode;
58 ip = GFS2_I(inode);
59 igrab(inode);
60 spin_unlock(&dentry->d_lock);
61 56
62 fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); 57 fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32);
63 fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); 58 fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
@@ -65,8 +60,6 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
65 fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); 60 fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
66 *len = GFS2_LARGE_FH_SIZE; 61 *len = GFS2_LARGE_FH_SIZE;
67 62
68 iput(inode);
69
70 return *len; 63 return *len;
71} 64}
72 65
diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c
index 7a5eb2c718c8..cdb84a838068 100644
--- a/fs/hpfs/alloc.c
+++ b/fs/hpfs/alloc.c
@@ -16,9 +16,9 @@
16static int chk_if_allocated(struct super_block *s, secno sec, char *msg) 16static int chk_if_allocated(struct super_block *s, secno sec, char *msg)
17{ 17{
18 struct quad_buffer_head qbh; 18 struct quad_buffer_head qbh;
19 u32 *bmp; 19 __le32 *bmp;
20 if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail; 20 if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail;
21 if ((cpu_to_le32(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) { 21 if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) {
22 hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec); 22 hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec);
23 goto fail1; 23 goto fail1;
24 } 24 }
@@ -62,7 +62,7 @@ int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg)
62static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward) 62static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward)
63{ 63{
64 struct quad_buffer_head qbh; 64 struct quad_buffer_head qbh;
65 unsigned *bmp; 65 __le32 *bmp;
66 unsigned bs = near & ~0x3fff; 66 unsigned bs = near & ~0x3fff;
67 unsigned nr = (near & 0x3fff) & ~(n - 1); 67 unsigned nr = (near & 0x3fff) & ~(n - 1);
68 /*unsigned mnr;*/ 68 /*unsigned mnr;*/
@@ -236,7 +236,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near)
236int hpfs_alloc_if_possible(struct super_block *s, secno sec) 236int hpfs_alloc_if_possible(struct super_block *s, secno sec)
237{ 237{
238 struct quad_buffer_head qbh; 238 struct quad_buffer_head qbh;
239 u32 *bmp; 239 __le32 *bmp;
240 if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end; 240 if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end;
241 if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) { 241 if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) {
242 bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); 242 bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f)));
@@ -254,7 +254,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec)
254void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) 254void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n)
255{ 255{
256 struct quad_buffer_head qbh; 256 struct quad_buffer_head qbh;
257 u32 *bmp; 257 __le32 *bmp;
258 struct hpfs_sb_info *sbi = hpfs_sb(s); 258 struct hpfs_sb_info *sbi = hpfs_sb(s);
259 /*printk("2 - ");*/ 259 /*printk("2 - ");*/
260 if (!n) return; 260 if (!n) return;
@@ -299,7 +299,7 @@ int hpfs_check_free_dnodes(struct super_block *s, int n)
299 int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14; 299 int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14;
300 int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff; 300 int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff;
301 int i, j; 301 int i, j;
302 u32 *bmp; 302 __le32 *bmp;
303 struct quad_buffer_head qbh; 303 struct quad_buffer_head qbh;
304 if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) { 304 if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
305 for (j = 0; j < 512; j++) { 305 for (j = 0; j < 512; j++) {
@@ -351,7 +351,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno)
351 hpfs_free_sectors(s, dno, 4); 351 hpfs_free_sectors(s, dno, 4);
352 } else { 352 } else {
353 struct quad_buffer_head qbh; 353 struct quad_buffer_head qbh;
354 u32 *bmp; 354 __le32 *bmp;
355 unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4; 355 unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4;
356 if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { 356 if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
357 return; 357 return;
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 08b503e8ed29..4bae4a4a60b1 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -20,7 +20,7 @@ secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode,
20 int c1, c2 = 0; 20 int c1, c2 = 0;
21 go_down: 21 go_down:
22 if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1; 22 if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1;
23 if (btree->internal) { 23 if (bp_internal(btree)) {
24 for (i = 0; i < btree->n_used_nodes; i++) 24 for (i = 0; i < btree->n_used_nodes; i++)
25 if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) { 25 if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) {
26 a = le32_to_cpu(btree->u.internal[i].down); 26 a = le32_to_cpu(btree->u.internal[i].down);
@@ -82,7 +82,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
82 brelse(bh); 82 brelse(bh);
83 return -1; 83 return -1;
84 } 84 }
85 if (btree->internal) { 85 if (bp_internal(btree)) {
86 a = le32_to_cpu(btree->u.internal[n].down); 86 a = le32_to_cpu(btree->u.internal[n].down);
87 btree->u.internal[n].file_secno = cpu_to_le32(-1); 87 btree->u.internal[n].file_secno = cpu_to_le32(-1);
88 mark_buffer_dirty(bh); 88 mark_buffer_dirty(bh);
@@ -129,12 +129,12 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
129 } 129 }
130 if (a == node && fnod) { 130 if (a == node && fnod) {
131 anode->up = cpu_to_le32(node); 131 anode->up = cpu_to_le32(node);
132 anode->btree.fnode_parent = 1; 132 anode->btree.flags |= BP_fnode_parent;
133 anode->btree.n_used_nodes = btree->n_used_nodes; 133 anode->btree.n_used_nodes = btree->n_used_nodes;
134 anode->btree.first_free = btree->first_free; 134 anode->btree.first_free = btree->first_free;
135 anode->btree.n_free_nodes = 40 - anode->btree.n_used_nodes; 135 anode->btree.n_free_nodes = 40 - anode->btree.n_used_nodes;
136 memcpy(&anode->u, &btree->u, btree->n_used_nodes * 12); 136 memcpy(&anode->u, &btree->u, btree->n_used_nodes * 12);
137 btree->internal = 1; 137 btree->flags |= BP_internal;
138 btree->n_free_nodes = 11; 138 btree->n_free_nodes = 11;
139 btree->n_used_nodes = 1; 139 btree->n_used_nodes = 1;
140 btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree); 140 btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree);
@@ -184,7 +184,10 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
184 hpfs_free_sectors(s, ra, 1); 184 hpfs_free_sectors(s, ra, 1);
185 if ((anode = hpfs_map_anode(s, na, &bh))) { 185 if ((anode = hpfs_map_anode(s, na, &bh))) {
186 anode->up = cpu_to_le32(up); 186 anode->up = cpu_to_le32(up);
187 anode->btree.fnode_parent = up == node && fnod; 187 if (up == node && fnod)
188 anode->btree.flags |= BP_fnode_parent;
189 else
190 anode->btree.flags &= ~BP_fnode_parent;
188 mark_buffer_dirty(bh); 191 mark_buffer_dirty(bh);
189 brelse(bh); 192 brelse(bh);
190 } 193 }
@@ -198,7 +201,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
198 if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) { 201 if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) {
199 anode = new_anode; 202 anode = new_anode;
200 /*anode->up = cpu_to_le32(up != -1 ? up : ra);*/ 203 /*anode->up = cpu_to_le32(up != -1 ? up : ra);*/
201 anode->btree.internal = 1; 204 anode->btree.flags |= BP_internal;
202 anode->btree.n_used_nodes = 1; 205 anode->btree.n_used_nodes = 1;
203 anode->btree.n_free_nodes = 59; 206 anode->btree.n_free_nodes = 59;
204 anode->btree.first_free = cpu_to_le16(16); 207 anode->btree.first_free = cpu_to_le16(16);
@@ -215,7 +218,8 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
215 } 218 }
216 if ((anode = hpfs_map_anode(s, na, &bh))) { 219 if ((anode = hpfs_map_anode(s, na, &bh))) {
217 anode->up = cpu_to_le32(node); 220 anode->up = cpu_to_le32(node);
218 if (fnod) anode->btree.fnode_parent = 1; 221 if (fnod)
222 anode->btree.flags |= BP_fnode_parent;
219 mark_buffer_dirty(bh); 223 mark_buffer_dirty(bh);
220 brelse(bh); 224 brelse(bh);
221 } 225 }
@@ -234,18 +238,19 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
234 } 238 }
235 ranode->up = cpu_to_le32(node); 239 ranode->up = cpu_to_le32(node);
236 memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free)); 240 memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free));
237 if (fnod) ranode->btree.fnode_parent = 1; 241 if (fnod)
238 ranode->btree.n_free_nodes = (ranode->btree.internal ? 60 : 40) - ranode->btree.n_used_nodes; 242 ranode->btree.flags |= BP_fnode_parent;
239 if (ranode->btree.internal) for (n = 0; n < ranode->btree.n_used_nodes; n++) { 243 ranode->btree.n_free_nodes = (bp_internal(&ranode->btree) ? 60 : 40) - ranode->btree.n_used_nodes;
244 if (bp_internal(&ranode->btree)) for (n = 0; n < ranode->btree.n_used_nodes; n++) {
240 struct anode *unode; 245 struct anode *unode;
241 if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) { 246 if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) {
242 unode->up = cpu_to_le32(ra); 247 unode->up = cpu_to_le32(ra);
243 unode->btree.fnode_parent = 0; 248 unode->btree.flags &= ~BP_fnode_parent;
244 mark_buffer_dirty(bh1); 249 mark_buffer_dirty(bh1);
245 brelse(bh1); 250 brelse(bh1);
246 } 251 }
247 } 252 }
248 btree->internal = 1; 253 btree->flags |= BP_internal;
249 btree->n_free_nodes = fnod ? 10 : 58; 254 btree->n_free_nodes = fnod ? 10 : 58;
250 btree->n_used_nodes = 2; 255 btree->n_used_nodes = 2;
251 btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree); 256 btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree);
@@ -278,7 +283,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree)
278 int d1, d2; 283 int d1, d2;
279 go_down: 284 go_down:
280 d2 = 0; 285 d2 = 0;
281 while (btree1->internal) { 286 while (bp_internal(btree1)) {
282 ano = le32_to_cpu(btree1->u.internal[pos].down); 287 ano = le32_to_cpu(btree1->u.internal[pos].down);
283 if (level) brelse(bh); 288 if (level) brelse(bh);
284 if (hpfs_sb(s)->sb_chk) 289 if (hpfs_sb(s)->sb_chk)
@@ -412,13 +417,13 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs)
412 btree->n_free_nodes = 8; 417 btree->n_free_nodes = 8;
413 btree->n_used_nodes = 0; 418 btree->n_used_nodes = 0;
414 btree->first_free = cpu_to_le16(8); 419 btree->first_free = cpu_to_le16(8);
415 btree->internal = 0; 420 btree->flags &= ~BP_internal;
416 mark_buffer_dirty(bh); 421 mark_buffer_dirty(bh);
417 } else hpfs_free_sectors(s, f, 1); 422 } else hpfs_free_sectors(s, f, 1);
418 brelse(bh); 423 brelse(bh);
419 return; 424 return;
420 } 425 }
421 while (btree->internal) { 426 while (bp_internal(btree)) {
422 nodes = btree->n_used_nodes + btree->n_free_nodes; 427 nodes = btree->n_used_nodes + btree->n_free_nodes;
423 for (i = 0; i < btree->n_used_nodes; i++) 428 for (i = 0; i < btree->n_used_nodes; i++)
424 if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f; 429 if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f;
@@ -479,13 +484,13 @@ void hpfs_remove_fnode(struct super_block *s, fnode_secno fno)
479 struct extended_attribute *ea; 484 struct extended_attribute *ea;
480 struct extended_attribute *ea_end; 485 struct extended_attribute *ea_end;
481 if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return; 486 if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return;
482 if (!fnode->dirflag) hpfs_remove_btree(s, &fnode->btree); 487 if (!fnode_is_dir(fnode)) hpfs_remove_btree(s, &fnode->btree);
483 else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno)); 488 else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno));
484 ea_end = fnode_end_ea(fnode); 489 ea_end = fnode_end_ea(fnode);
485 for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) 490 for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
486 if (ea->indirect) 491 if (ea_indirect(ea))
487 hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); 492 hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea));
488 hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l)); 493 hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l));
489 brelse(bh); 494 brelse(bh);
490 hpfs_free_sectors(s, fno, 1); 495 hpfs_free_sectors(s, fno, 1);
491} 496}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2fa0089a02a8..b8472f803f4e 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -87,7 +87,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
87 ret = -EIOERROR; 87 ret = -EIOERROR;
88 goto out; 88 goto out;
89 } 89 }
90 if (!fno->dirflag) { 90 if (!fnode_is_dir(fno)) {
91 e = 1; 91 e = 1;
92 hpfs_error(inode->i_sb, "not a directory, fnode %08lx", 92 hpfs_error(inode->i_sb, "not a directory, fnode %08lx",
93 (unsigned long)inode->i_ino); 93 (unsigned long)inode->i_ino);
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index 1e0e2ac30fd3..3228c524ebe5 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -153,7 +153,7 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
153 } 153 }
154 de->length = cpu_to_le16(36); 154 de->length = cpu_to_le16(36);
155 de->down = 1; 155 de->down = 1;
156 *(dnode_secno *)((char *)de + 32) = cpu_to_le32(ptr); 156 *(__le32 *)((char *)de + 32) = cpu_to_le32(ptr);
157 } 157 }
158} 158}
159 159
@@ -177,7 +177,7 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
177 memmove((char *)de + d_size, de, (char *)de_end - (char *)de); 177 memmove((char *)de + d_size, de, (char *)de_end - (char *)de);
178 memset(de, 0, d_size); 178 memset(de, 0, d_size);
179 if (down_ptr) { 179 if (down_ptr) {
180 *(dnode_secno *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr); 180 *(__le32 *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr);
181 de->down = 1; 181 de->down = 1;
182 } 182 }
183 de->length = cpu_to_le16(d_size); 183 de->length = cpu_to_le16(d_size);
@@ -656,7 +656,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
656 del->down = 0; 656 del->down = 0;
657 d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4); 657 d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4);
658 } else if (down) 658 } else if (down)
659 *(dnode_secno *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); 659 *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
660 } else goto endm; 660 } else goto endm;
661 if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) { 661 if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) {
662 printk("HPFS: out of memory for dtree balancing\n"); 662 printk("HPFS: out of memory for dtree balancing\n");
@@ -672,7 +672,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
672 de_prev->down = 1; 672 de_prev->down = 1;
673 dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4); 673 dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4);
674 } 674 }
675 *(dnode_secno *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown); 675 *(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown);
676 hpfs_mark_4buffers_dirty(&qbh); 676 hpfs_mark_4buffers_dirty(&qbh);
677 hpfs_brelse4(&qbh); 677 hpfs_brelse4(&qbh);
678 for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4); 678 for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4);
@@ -1015,7 +1015,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
1015 kfree(name2); 1015 kfree(name2);
1016 return NULL; 1016 return NULL;
1017 } 1017 }
1018 if (!upf->dirflag) { 1018 if (!fnode_is_dir(upf)) {
1019 brelse(bh); 1019 brelse(bh);
1020 hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up)); 1020 hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up));
1021 kfree(name2); 1021 kfree(name2);
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index d8b84d113c89..bcaafcd2666a 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -23,15 +23,15 @@ void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len)
23 return; 23 return;
24 } 24 }
25 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; 25 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return;
26 if (ea->indirect) { 26 if (ea_indirect(ea)) {
27 if (ea_valuelen(ea) != 8) { 27 if (ea_valuelen(ea) != 8) {
28 hpfs_error(s, "ea->indirect set while ea->valuelen!=8, %s %08x, pos %08x", 28 hpfs_error(s, "ea_indirect(ea) set while ea->valuelen!=8, %s %08x, pos %08x",
29 ano ? "anode" : "sectors", a, pos); 29 ano ? "anode" : "sectors", a, pos);
30 return; 30 return;
31 } 31 }
32 if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 9, ex+4)) 32 if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 9, ex+4))
33 return; 33 return;
34 hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); 34 hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea));
35 } 35 }
36 pos += ea->namelen + ea_valuelen(ea) + 5; 36 pos += ea->namelen + ea_valuelen(ea) + 5;
37 } 37 }
@@ -81,7 +81,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
81 struct extended_attribute *ea_end = fnode_end_ea(fnode); 81 struct extended_attribute *ea_end = fnode_end_ea(fnode);
82 for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) 82 for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
83 if (!strcmp(ea->name, key)) { 83 if (!strcmp(ea->name, key)) {
84 if (ea->indirect) 84 if (ea_indirect(ea))
85 goto indirect; 85 goto indirect;
86 if (ea_valuelen(ea) >= size) 86 if (ea_valuelen(ea) >= size)
87 return -EINVAL; 87 return -EINVAL;
@@ -91,7 +91,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
91 } 91 }
92 a = le32_to_cpu(fnode->ea_secno); 92 a = le32_to_cpu(fnode->ea_secno);
93 len = le32_to_cpu(fnode->ea_size_l); 93 len = le32_to_cpu(fnode->ea_size_l);
94 ano = fnode->ea_anode; 94 ano = fnode_in_anode(fnode);
95 pos = 0; 95 pos = 0;
96 while (pos < len) { 96 while (pos < len) {
97 ea = (struct extended_attribute *)ex; 97 ea = (struct extended_attribute *)ex;
@@ -101,10 +101,10 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
101 return -EIO; 101 return -EIO;
102 } 102 }
103 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return -EIO; 103 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return -EIO;
104 if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4)) 104 if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
105 return -EIO; 105 return -EIO;
106 if (!strcmp(ea->name, key)) { 106 if (!strcmp(ea->name, key)) {
107 if (ea->indirect) 107 if (ea_indirect(ea))
108 goto indirect; 108 goto indirect;
109 if (ea_valuelen(ea) >= size) 109 if (ea_valuelen(ea) >= size)
110 return -EINVAL; 110 return -EINVAL;
@@ -119,7 +119,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
119indirect: 119indirect:
120 if (ea_len(ea) >= size) 120 if (ea_len(ea) >= size)
121 return -EINVAL; 121 return -EINVAL;
122 if (hpfs_ea_read(s, ea_sec(ea), ea->anode, 0, ea_len(ea), buf)) 122 if (hpfs_ea_read(s, ea_sec(ea), ea_in_anode(ea), 0, ea_len(ea), buf))
123 return -EIO; 123 return -EIO;
124 buf[ea_len(ea)] = 0; 124 buf[ea_len(ea)] = 0;
125 return 0; 125 return 0;
@@ -136,8 +136,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
136 struct extended_attribute *ea_end = fnode_end_ea(fnode); 136 struct extended_attribute *ea_end = fnode_end_ea(fnode);
137 for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) 137 for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
138 if (!strcmp(ea->name, key)) { 138 if (!strcmp(ea->name, key)) {
139 if (ea->indirect) 139 if (ea_indirect(ea))
140 return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); 140 return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
141 if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { 141 if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
142 printk("HPFS: out of memory for EA\n"); 142 printk("HPFS: out of memory for EA\n");
143 return NULL; 143 return NULL;
@@ -148,7 +148,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
148 } 148 }
149 a = le32_to_cpu(fnode->ea_secno); 149 a = le32_to_cpu(fnode->ea_secno);
150 len = le32_to_cpu(fnode->ea_size_l); 150 len = le32_to_cpu(fnode->ea_size_l);
151 ano = fnode->ea_anode; 151 ano = fnode_in_anode(fnode);
152 pos = 0; 152 pos = 0;
153 while (pos < len) { 153 while (pos < len) {
154 char ex[4 + 255 + 1 + 8]; 154 char ex[4 + 255 + 1 + 8];
@@ -159,11 +159,11 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
159 return NULL; 159 return NULL;
160 } 160 }
161 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return NULL; 161 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return NULL;
162 if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4)) 162 if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
163 return NULL; 163 return NULL;
164 if (!strcmp(ea->name, key)) { 164 if (!strcmp(ea->name, key)) {
165 if (ea->indirect) 165 if (ea_indirect(ea))
166 return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); 166 return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
167 if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { 167 if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
168 printk("HPFS: out of memory for EA\n"); 168 printk("HPFS: out of memory for EA\n");
169 return NULL; 169 return NULL;
@@ -199,9 +199,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
199 struct extended_attribute *ea_end = fnode_end_ea(fnode); 199 struct extended_attribute *ea_end = fnode_end_ea(fnode);
200 for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) 200 for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
201 if (!strcmp(ea->name, key)) { 201 if (!strcmp(ea->name, key)) {
202 if (ea->indirect) { 202 if (ea_indirect(ea)) {
203 if (ea_len(ea) == size) 203 if (ea_len(ea) == size)
204 set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); 204 set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size);
205 } else if (ea_valuelen(ea) == size) { 205 } else if (ea_valuelen(ea) == size) {
206 memcpy(ea_data(ea), data, size); 206 memcpy(ea_data(ea), data, size);
207 } 207 }
@@ -209,7 +209,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
209 } 209 }
210 a = le32_to_cpu(fnode->ea_secno); 210 a = le32_to_cpu(fnode->ea_secno);
211 len = le32_to_cpu(fnode->ea_size_l); 211 len = le32_to_cpu(fnode->ea_size_l);
212 ano = fnode->ea_anode; 212 ano = fnode_in_anode(fnode);
213 pos = 0; 213 pos = 0;
214 while (pos < len) { 214 while (pos < len) {
215 char ex[4 + 255 + 1 + 8]; 215 char ex[4 + 255 + 1 + 8];
@@ -220,12 +220,12 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
220 return; 220 return;
221 } 221 }
222 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; 222 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return;
223 if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4)) 223 if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
224 return; 224 return;
225 if (!strcmp(ea->name, key)) { 225 if (!strcmp(ea->name, key)) {
226 if (ea->indirect) { 226 if (ea_indirect(ea)) {
227 if (ea_len(ea) == size) 227 if (ea_len(ea) == size)
228 set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); 228 set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size);
229 } 229 }
230 else { 230 else {
231 if (ea_valuelen(ea) == size) 231 if (ea_valuelen(ea) == size)
@@ -246,7 +246,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
246 if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) { 246 if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) {
247 hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x", 247 hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x",
248 (unsigned long)inode->i_ino, 248 (unsigned long)inode->i_ino,
249 le32_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); 249 le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s));
250 return; 250 return;
251 } 251 }
252 if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) && 252 if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) &&
@@ -276,7 +276,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
276 fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s)); 276 fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s));
277 fnode->ea_size_s = cpu_to_le16(0); 277 fnode->ea_size_s = cpu_to_le16(0);
278 fnode->ea_secno = cpu_to_le32(n); 278 fnode->ea_secno = cpu_to_le32(n);
279 fnode->ea_anode = cpu_to_le32(0); 279 fnode->flags &= ~FNODE_anode;
280 mark_buffer_dirty(bh); 280 mark_buffer_dirty(bh);
281 brelse(bh); 281 brelse(bh);
282 } 282 }
@@ -288,9 +288,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
288 secno q = hpfs_alloc_sector(s, fno, 1, 0); 288 secno q = hpfs_alloc_sector(s, fno, 1, 0);
289 if (!q) goto bail; 289 if (!q) goto bail;
290 fnode->ea_secno = cpu_to_le32(q); 290 fnode->ea_secno = cpu_to_le32(q);
291 fnode->ea_anode = 0; 291 fnode->flags &= ~FNODE_anode;
292 len++; 292 len++;
293 } else if (!fnode->ea_anode) { 293 } else if (!fnode_in_anode(fnode)) {
294 if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) { 294 if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) {
295 len++; 295 len++;
296 } else { 296 } else {
@@ -310,7 +310,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
310 anode->u.external[0].length = cpu_to_le32(len); 310 anode->u.external[0].length = cpu_to_le32(len);
311 mark_buffer_dirty(bh); 311 mark_buffer_dirty(bh);
312 brelse(bh); 312 brelse(bh);
313 fnode->ea_anode = 1; 313 fnode->flags |= FNODE_anode;
314 fnode->ea_secno = cpu_to_le32(a_s);*/ 314 fnode->ea_secno = cpu_to_le32(a_s);*/
315 secno new_sec; 315 secno new_sec;
316 int i; 316 int i;
@@ -338,7 +338,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
338 len = (pos + 511) >> 9; 338 len = (pos + 511) >> 9;
339 } 339 }
340 } 340 }
341 if (fnode->ea_anode) { 341 if (fnode_in_anode(fnode)) {
342 if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno), 342 if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno),
343 0, len) != -1) { 343 0, len) != -1) {
344 len++; 344 len++;
@@ -351,16 +351,16 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
351 h[1] = strlen(key); 351 h[1] = strlen(key);
352 h[2] = size & 0xff; 352 h[2] = size & 0xff;
353 h[3] = size >> 8; 353 h[3] = size >> 8;
354 if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail; 354 if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail;
355 if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail; 355 if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail;
356 if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail; 356 if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail;
357 fnode->ea_size_l = cpu_to_le32(pos); 357 fnode->ea_size_l = cpu_to_le32(pos);
358 ret: 358 ret:
359 hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size; 359 hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size;
360 return; 360 return;
361 bail: 361 bail:
362 if (le32_to_cpu(fnode->ea_secno)) 362 if (le32_to_cpu(fnode->ea_secno))
363 if (fnode->ea_anode) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9); 363 if (fnode_in_anode(fnode)) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9);
364 else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9)); 364 else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9));
365 else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0); 365 else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0);
366} 366}
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h
index 8b0650aae328..cce025aff1b1 100644
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -51,11 +51,11 @@ struct hpfs_boot_block
51 u8 n_rootdir_entries[2]; 51 u8 n_rootdir_entries[2];
52 u8 n_sectors_s[2]; 52 u8 n_sectors_s[2];
53 u8 media_byte; 53 u8 media_byte;
54 u16 sectors_per_fat; 54 __le16 sectors_per_fat;
55 u16 sectors_per_track; 55 __le16 sectors_per_track;
56 u16 heads_per_cyl; 56 __le16 heads_per_cyl;
57 u32 n_hidden_sectors; 57 __le32 n_hidden_sectors;
58 u32 n_sectors_l; /* size of partition */ 58 __le32 n_sectors_l; /* size of partition */
59 u8 drive_number; 59 u8 drive_number;
60 u8 mbz; 60 u8 mbz;
61 u8 sig_28h; /* 28h */ 61 u8 sig_28h; /* 28h */
@@ -63,7 +63,7 @@ struct hpfs_boot_block
63 u8 vol_label[11]; 63 u8 vol_label[11];
64 u8 sig_hpfs[8]; /* "HPFS " */ 64 u8 sig_hpfs[8]; /* "HPFS " */
65 u8 pad[448]; 65 u8 pad[448];
66 u16 magic; /* aa55 */ 66 __le16 magic; /* aa55 */
67}; 67};
68 68
69 69
@@ -75,28 +75,28 @@ struct hpfs_boot_block
75 75
76struct hpfs_super_block 76struct hpfs_super_block
77{ 77{
78 u32 magic; /* f995 e849 */ 78 __le32 magic; /* f995 e849 */
79 u32 magic1; /* fa53 e9c5, more magic? */ 79 __le32 magic1; /* fa53 e9c5, more magic? */
80 u8 version; /* version of a filesystem usually 2 */ 80 u8 version; /* version of a filesystem usually 2 */
81 u8 funcversion; /* functional version - oldest version 81 u8 funcversion; /* functional version - oldest version
82 of filesystem that can understand 82 of filesystem that can understand
83 this disk */ 83 this disk */
84 u16 zero; /* 0 */ 84 __le16 zero; /* 0 */
85 fnode_secno root; /* fnode of root directory */ 85 __le32 root; /* fnode of root directory */
86 secno n_sectors; /* size of filesystem */ 86 __le32 n_sectors; /* size of filesystem */
87 u32 n_badblocks; /* number of bad blocks */ 87 __le32 n_badblocks; /* number of bad blocks */
88 secno bitmaps; /* pointers to free space bit maps */ 88 __le32 bitmaps; /* pointers to free space bit maps */
89 u32 zero1; /* 0 */ 89 __le32 zero1; /* 0 */
90 secno badblocks; /* bad block list */ 90 __le32 badblocks; /* bad block list */
91 u32 zero3; /* 0 */ 91 __le32 zero3; /* 0 */
92 time32_t last_chkdsk; /* date last checked, 0 if never */ 92 __le32 last_chkdsk; /* date last checked, 0 if never */
93 time32_t last_optimize; /* date last optimized, 0 if never */ 93 __le32 last_optimize; /* date last optimized, 0 if never */
94 secno n_dir_band; /* number of sectors in dir band */ 94 __le32 n_dir_band; /* number of sectors in dir band */
95 secno dir_band_start; /* first sector in dir band */ 95 __le32 dir_band_start; /* first sector in dir band */
96 secno dir_band_end; /* last sector in dir band */ 96 __le32 dir_band_end; /* last sector in dir band */
97 secno dir_band_bitmap; /* free space map, 1 dnode per bit */ 97 __le32 dir_band_bitmap; /* free space map, 1 dnode per bit */
98 u8 volume_name[32]; /* not used */ 98 u8 volume_name[32]; /* not used */
99 secno user_id_table; /* 8 preallocated sectors - user id */ 99 __le32 user_id_table; /* 8 preallocated sectors - user id */
100 u32 zero6[103]; /* 0 */ 100 u32 zero6[103]; /* 0 */
101}; 101};
102 102
@@ -109,8 +109,8 @@ struct hpfs_super_block
109 109
110struct hpfs_spare_block 110struct hpfs_spare_block
111{ 111{
112 u32 magic; /* f991 1849 */ 112 __le32 magic; /* f991 1849 */
113 u32 magic1; /* fa52 29c5, more magic? */ 113 __le32 magic1; /* fa52 29c5, more magic? */
114 114
115#ifdef __LITTLE_ENDIAN 115#ifdef __LITTLE_ENDIAN
116 u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */ 116 u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */
@@ -153,21 +153,21 @@ struct hpfs_spare_block
153 u8 mm_contlgulty; 153 u8 mm_contlgulty;
154 u8 unused; 154 u8 unused;
155 155
156 secno hotfix_map; /* info about remapped bad sectors */ 156 __le32 hotfix_map; /* info about remapped bad sectors */
157 u32 n_spares_used; /* number of hotfixes */ 157 __le32 n_spares_used; /* number of hotfixes */
158 u32 n_spares; /* number of spares in hotfix map */ 158 __le32 n_spares; /* number of spares in hotfix map */
159 u32 n_dnode_spares_free; /* spare dnodes unused */ 159 __le32 n_dnode_spares_free; /* spare dnodes unused */
160 u32 n_dnode_spares; /* length of spare_dnodes[] list, 160 __le32 n_dnode_spares; /* length of spare_dnodes[] list,
161 follows in this block*/ 161 follows in this block*/
162 secno code_page_dir; /* code page directory block */ 162 __le32 code_page_dir; /* code page directory block */
163 u32 n_code_pages; /* number of code pages */ 163 __le32 n_code_pages; /* number of code pages */
164 u32 super_crc; /* on HPFS386 and LAN Server this is 164 __le32 super_crc; /* on HPFS386 and LAN Server this is
165 checksum of superblock, on normal 165 checksum of superblock, on normal
166 OS/2 unused */ 166 OS/2 unused */
167 u32 spare_crc; /* on HPFS386 checksum of spareblock */ 167 __le32 spare_crc; /* on HPFS386 checksum of spareblock */
168 u32 zero1[15]; /* unused */ 168 __le32 zero1[15]; /* unused */
169 dnode_secno spare_dnodes[100]; /* emergency free dnode list */ 169 __le32 spare_dnodes[100]; /* emergency free dnode list */
170 u32 zero2[1]; /* room for more? */ 170 __le32 zero2[1]; /* room for more? */
171}; 171};
172 172
173/* The bad block list is 4 sectors long. The first word must be zero, 173/* The bad block list is 4 sectors long. The first word must be zero,
@@ -202,18 +202,18 @@ struct hpfs_spare_block
202 202
203struct code_page_directory 203struct code_page_directory
204{ 204{
205 u32 magic; /* 4945 21f7 */ 205 __le32 magic; /* 4945 21f7 */
206 u32 n_code_pages; /* number of pointers following */ 206 __le32 n_code_pages; /* number of pointers following */
207 u32 zero1[2]; 207 __le32 zero1[2];
208 struct { 208 struct {
209 u16 ix; /* index */ 209 __le16 ix; /* index */
210 u16 code_page_number; /* code page number */ 210 __le16 code_page_number; /* code page number */
211 u32 bounds; /* matches corresponding word 211 __le32 bounds; /* matches corresponding word
212 in data block */ 212 in data block */
213 secno code_page_data; /* sector number of a code_page_data 213 __le32 code_page_data; /* sector number of a code_page_data
214 containing c.p. array */ 214 containing c.p. array */
215 u16 index; /* index in c.p. array in that sector*/ 215 __le16 index; /* index in c.p. array in that sector*/
216 u16 unknown; /* some unknown value; usually 0; 216 __le16 unknown; /* some unknown value; usually 0;
217 2 in Japanese version */ 217 2 in Japanese version */
218 } array[31]; /* unknown length */ 218 } array[31]; /* unknown length */
219}; 219};
@@ -224,19 +224,19 @@ struct code_page_directory
224 224
225struct code_page_data 225struct code_page_data
226{ 226{
227 u32 magic; /* 8945 21f7 */ 227 __le32 magic; /* 8945 21f7 */
228 u32 n_used; /* # elements used in c_p_data[] */ 228 __le32 n_used; /* # elements used in c_p_data[] */
229 u32 bounds[3]; /* looks a bit like 229 __le32 bounds[3]; /* looks a bit like
230 (beg1,end1), (beg2,end2) 230 (beg1,end1), (beg2,end2)
231 one byte each */ 231 one byte each */
232 u16 offs[3]; /* offsets from start of sector 232 __le16 offs[3]; /* offsets from start of sector
233 to start of c_p_data[ix] */ 233 to start of c_p_data[ix] */
234 struct { 234 struct {
235 u16 ix; /* index */ 235 __le16 ix; /* index */
236 u16 code_page_number; /* code page number */ 236 __le16 code_page_number; /* code page number */
237 u16 unknown; /* the same as in cp directory */ 237 __le16 unknown; /* the same as in cp directory */
238 u8 map[128]; /* upcase table for chars 80..ff */ 238 u8 map[128]; /* upcase table for chars 80..ff */
239 u16 zero2; 239 __le16 zero2;
240 } code_page[3]; 240 } code_page[3];
241 u8 incognita[78]; 241 u8 incognita[78];
242}; 242};
@@ -278,8 +278,8 @@ struct code_page_data
278#define DNODE_MAGIC 0x77e40aae 278#define DNODE_MAGIC 0x77e40aae
279 279
280struct dnode { 280struct dnode {
281 u32 magic; /* 77e4 0aae */ 281 __le32 magic; /* 77e4 0aae */
282 u32 first_free; /* offset from start of dnode to 282 __le32 first_free; /* offset from start of dnode to
283 first free dir entry */ 283 first free dir entry */
284#ifdef __LITTLE_ENDIAN 284#ifdef __LITTLE_ENDIAN
285 u8 root_dnode: 1; /* Is it root dnode? */ 285 u8 root_dnode: 1; /* Is it root dnode? */
@@ -293,14 +293,14 @@ struct dnode {
293 u8 root_dnode: 1; /* Is it root dnode? */ 293 u8 root_dnode: 1; /* Is it root dnode? */
294#endif 294#endif
295 u8 increment_me2[3]; 295 u8 increment_me2[3];
296 secno up; /* (root dnode) directory's fnode 296 __le32 up; /* (root dnode) directory's fnode
297 (nonroot) parent dnode */ 297 (nonroot) parent dnode */
298 dnode_secno self; /* pointer to this dnode */ 298 __le32 self; /* pointer to this dnode */
299 u8 dirent[2028]; /* one or more dirents */ 299 u8 dirent[2028]; /* one or more dirents */
300}; 300};
301 301
302struct hpfs_dirent { 302struct hpfs_dirent {
303 u16 length; /* offset to next dirent */ 303 __le16 length; /* offset to next dirent */
304 304
305#ifdef __LITTLE_ENDIAN 305#ifdef __LITTLE_ENDIAN
306 u8 first: 1; /* set on phony ^A^A (".") entry */ 306 u8 first: 1; /* set on phony ^A^A (".") entry */
@@ -346,12 +346,12 @@ struct hpfs_dirent {
346 u8 read_only: 1; /* dos attrib */ 346 u8 read_only: 1; /* dos attrib */
347#endif 347#endif
348 348
349 fnode_secno fnode; /* fnode giving allocation info */ 349 __le32 fnode; /* fnode giving allocation info */
350 time32_t write_date; /* mtime */ 350 __le32 write_date; /* mtime */
351 u32 file_size; /* file length, bytes */ 351 __le32 file_size; /* file length, bytes */
352 time32_t read_date; /* atime */ 352 __le32 read_date; /* atime */
353 time32_t creation_date; /* ctime */ 353 __le32 creation_date; /* ctime */
354 u32 ea_size; /* total EA length, bytes */ 354 __le32 ea_size; /* total EA length, bytes */
355 u8 no_of_acls; /* number of ACL's (low 3 bits) */ 355 u8 no_of_acls; /* number of ACL's (low 3 bits) */
356 u8 ix; /* code page index (of filename), see 356 u8 ix; /* code page index (of filename), see
357 struct code_page_data */ 357 struct code_page_data */
@@ -375,50 +375,36 @@ struct hpfs_dirent {
375 375
376struct bplus_leaf_node 376struct bplus_leaf_node
377{ 377{
378 u32 file_secno; /* first file sector in extent */ 378 __le32 file_secno; /* first file sector in extent */
379 u32 length; /* length, sectors */ 379 __le32 length; /* length, sectors */
380 secno disk_secno; /* first corresponding disk sector */ 380 __le32 disk_secno; /* first corresponding disk sector */
381}; 381};
382 382
383struct bplus_internal_node 383struct bplus_internal_node
384{ 384{
385 u32 file_secno; /* subtree maps sectors < this */ 385 __le32 file_secno; /* subtree maps sectors < this */
386 anode_secno down; /* pointer to subtree */ 386 __le32 down; /* pointer to subtree */
387}; 387};
388 388
389enum {
390 BP_hbff = 1,
391 BP_fnode_parent = 0x20,
392 BP_binary_search = 0x40,
393 BP_internal = 0x80
394};
389struct bplus_header 395struct bplus_header
390{ 396{
391#ifdef __LITTLE_ENDIAN 397 u8 flags; /* bit 0 - high bit of first free entry offset
392 u8 hbff: 1; /* high bit of first free entry offset */ 398 bit 5 - we're pointed to by an fnode,
393 u8 flag1234: 4;
394 u8 fnode_parent: 1; /* ? we're pointed to by an fnode,
395 the data btree or some ea or the
396 main ea bootage pointer ea_secno */
397 /* also can get set in fnodes, which
398 may be a chkdsk glitch or may mean
399 this bit is irrelevant in fnodes,
400 or this interpretation is all wet */
401 u8 binary_search: 1; /* suggest binary search (unused) */
402 u8 internal: 1; /* 1 -> (internal) tree of anodes
403 0 -> (leaf) list of extents */
404#else
405 u8 internal: 1; /* 1 -> (internal) tree of anodes
406 0 -> (leaf) list of extents */
407 u8 binary_search: 1; /* suggest binary search (unused) */
408 u8 fnode_parent: 1; /* ? we're pointed to by an fnode,
409 the data btree or some ea or the 399 the data btree or some ea or the
410 main ea bootage pointer ea_secno */ 400 main ea bootage pointer ea_secno
411 /* also can get set in fnodes, which 401 bit 6 - suggest binary search (unused)
412 may be a chkdsk glitch or may mean 402 bit 7 - 1 -> (internal) tree of anodes
413 this bit is irrelevant in fnodes, 403 0 -> (leaf) list of extents */
414 or this interpretation is all wet */
415 u8 flag1234: 4;
416 u8 hbff: 1; /* high bit of first free entry offset */
417#endif
418 u8 fill[3]; 404 u8 fill[3];
419 u8 n_free_nodes; /* free nodes in following array */ 405 u8 n_free_nodes; /* free nodes in following array */
420 u8 n_used_nodes; /* used nodes in following array */ 406 u8 n_used_nodes; /* used nodes in following array */
421 u16 first_free; /* offset from start of header to 407 __le16 first_free; /* offset from start of header to
422 first free node in array */ 408 first free node in array */
423 union { 409 union {
424 struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving 410 struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving
@@ -428,6 +414,16 @@ struct bplus_header
428 } u; 414 } u;
429}; 415};
430 416
417static inline bool bp_internal(struct bplus_header *bp)
418{
419 return bp->flags & BP_internal;
420}
421
422static inline bool bp_fnode_parent(struct bplus_header *bp)
423{
424 return bp->flags & BP_fnode_parent;
425}
426
431/* fnode: root of allocation b+ tree, and EA's */ 427/* fnode: root of allocation b+ tree, and EA's */
432 428
433/* Every file and every directory has one fnode, pointed to by the directory 429/* Every file and every directory has one fnode, pointed to by the directory
@@ -436,62 +432,56 @@ struct bplus_header
436 432
437#define FNODE_MAGIC 0xf7e40aae 433#define FNODE_MAGIC 0xf7e40aae
438 434
435enum {FNODE_anode = cpu_to_le16(2), FNODE_dir = cpu_to_le16(256)};
439struct fnode 436struct fnode
440{ 437{
441 u32 magic; /* f7e4 0aae */ 438 __le32 magic; /* f7e4 0aae */
442 u32 zero1[2]; /* read history */ 439 __le32 zero1[2]; /* read history */
443 u8 len, name[15]; /* true length, truncated name */ 440 u8 len, name[15]; /* true length, truncated name */
444 fnode_secno up; /* pointer to file's directory fnode */ 441 __le32 up; /* pointer to file's directory fnode */
445 secno acl_size_l; 442 __le32 acl_size_l;
446 secno acl_secno; 443 __le32 acl_secno;
447 u16 acl_size_s; 444 __le16 acl_size_s;
448 u8 acl_anode; 445 u8 acl_anode;
449 u8 zero2; /* history bit count */ 446 u8 zero2; /* history bit count */
450 u32 ea_size_l; /* length of disk-resident ea's */ 447 __le32 ea_size_l; /* length of disk-resident ea's */
451 secno ea_secno; /* first sector of disk-resident ea's*/ 448 __le32 ea_secno; /* first sector of disk-resident ea's*/
452 u16 ea_size_s; /* length of fnode-resident ea's */ 449 __le16 ea_size_s; /* length of fnode-resident ea's */
453
454#ifdef __LITTLE_ENDIAN
455 u8 flag0: 1;
456 u8 ea_anode: 1; /* 1 -> ea_secno is an anode */
457 u8 flag234567: 6;
458#else
459 u8 flag234567: 6;
460 u8 ea_anode: 1; /* 1 -> ea_secno is an anode */
461 u8 flag0: 1;
462#endif
463 450
464#ifdef __LITTLE_ENDIAN 451 __le16 flags; /* bit 1 set -> ea_secno is an anode */
465 u8 dirflag: 1; /* 1 -> directory. first & only extent 452 /* bit 8 set -> directory. first & only extent
466 points to dnode. */
467 u8 flag9012345: 7;
468#else
469 u8 flag9012345: 7;
470 u8 dirflag: 1; /* 1 -> directory. first & only extent
471 points to dnode. */ 453 points to dnode. */
472#endif
473
474 struct bplus_header btree; /* b+ tree, 8 extents or 12 subtrees */ 454 struct bplus_header btree; /* b+ tree, 8 extents or 12 subtrees */
475 union { 455 union {
476 struct bplus_leaf_node external[8]; 456 struct bplus_leaf_node external[8];
477 struct bplus_internal_node internal[12]; 457 struct bplus_internal_node internal[12];
478 } u; 458 } u;
479 459
480 u32 file_size; /* file length, bytes */ 460 __le32 file_size; /* file length, bytes */
481 u32 n_needea; /* number of EA's with NEEDEA set */ 461 __le32 n_needea; /* number of EA's with NEEDEA set */
482 u8 user_id[16]; /* unused */ 462 u8 user_id[16]; /* unused */
483 u16 ea_offs; /* offset from start of fnode 463 __le16 ea_offs; /* offset from start of fnode
484 to first fnode-resident ea */ 464 to first fnode-resident ea */
485 u8 dasd_limit_treshhold; 465 u8 dasd_limit_treshhold;
486 u8 dasd_limit_delta; 466 u8 dasd_limit_delta;
487 u32 dasd_limit; 467 __le32 dasd_limit;
488 u32 dasd_usage; 468 __le32 dasd_usage;
489 u8 ea[316]; /* zero or more EA's, packed together 469 u8 ea[316]; /* zero or more EA's, packed together
490 with no alignment padding. 470 with no alignment padding.
491 (Do not use this name, get here 471 (Do not use this name, get here
492 via fnode + ea_offs. I think.) */ 472 via fnode + ea_offs. I think.) */
493}; 473};
494 474
475static inline bool fnode_in_anode(struct fnode *p)
476{
477 return (p->flags & FNODE_anode) != 0;
478}
479
480static inline bool fnode_is_dir(struct fnode *p)
481{
482 return (p->flags & FNODE_dir) != 0;
483}
484
495 485
496/* anode: 99.44% pure allocation tree */ 486/* anode: 99.44% pure allocation tree */
497 487
@@ -499,9 +489,9 @@ struct fnode
499 489
500struct anode 490struct anode
501{ 491{
502 u32 magic; /* 37e4 0aae */ 492 __le32 magic; /* 37e4 0aae */
503 anode_secno self; /* pointer to this anode */ 493 __le32 self; /* pointer to this anode */
504 secno up; /* parent anode or fnode */ 494 __le32 up; /* parent anode or fnode */
505 495
506 struct bplus_header btree; /* b+tree, 40 extents or 60 subtrees */ 496 struct bplus_header btree; /* b+tree, 40 extents or 60 subtrees */
507 union { 497 union {
@@ -509,7 +499,7 @@ struct anode
509 struct bplus_internal_node internal[60]; 499 struct bplus_internal_node internal[60];
510 } u; 500 } u;
511 501
512 u32 fill[3]; /* unused */ 502 __le32 fill[3]; /* unused */
513}; 503};
514 504
515 505
@@ -528,32 +518,23 @@ struct anode
528 run, or in multiple runs. Flags in the fnode tell whether the EA list 518 run, or in multiple runs. Flags in the fnode tell whether the EA list
529 is immediate, in a single run, or in multiple runs. */ 519 is immediate, in a single run, or in multiple runs. */
530 520
521enum {EA_indirect = 1, EA_anode = 2, EA_needea = 128 };
531struct extended_attribute 522struct extended_attribute
532{ 523{
533#ifdef __LITTLE_ENDIAN 524 u8 flags; /* bit 0 set -> value gives sector number
534 u8 indirect: 1; /* 1 -> value gives sector number
535 where real value starts */ 525 where real value starts */
536 u8 anode: 1; /* 1 -> sector is an anode 526 /* bit 1 set -> sector is an anode
537 that points to fragmented value */ 527 that points to fragmented value */
538 u8 flag23456: 5; 528 /* bit 7 set -> required ea */
539 u8 needea: 1; /* required ea */
540#else
541 u8 needea: 1; /* required ea */
542 u8 flag23456: 5;
543 u8 anode: 1; /* 1 -> sector is an anode
544 that points to fragmented value */
545 u8 indirect: 1; /* 1 -> value gives sector number
546 where real value starts */
547#endif
548 u8 namelen; /* length of name, bytes */ 529 u8 namelen; /* length of name, bytes */
549 u8 valuelen_lo; /* length of value, bytes */ 530 u8 valuelen_lo; /* length of value, bytes */
550 u8 valuelen_hi; /* length of value, bytes */ 531 u8 valuelen_hi; /* length of value, bytes */
551 u8 name[0]; 532 u8 name[];
552 /* 533 /*
553 u8 name[namelen]; ascii attrib name 534 u8 name[namelen]; ascii attrib name
554 u8 nul; terminating '\0', not counted 535 u8 nul; terminating '\0', not counted
555 u8 value[valuelen]; value, arbitrary 536 u8 value[valuelen]; value, arbitrary
556 if this.indirect, valuelen is 8 and the value is 537 if this.flags & 1, valuelen is 8 and the value is
557 u32 length; real length of value, bytes 538 u32 length; real length of value, bytes
558 secno secno; sector address where it starts 539 secno secno; sector address where it starts
559 if this.anode, the above sector number is the root of an anode tree 540 if this.anode, the above sector number is the root of an anode tree
@@ -561,6 +542,16 @@ struct extended_attribute
561 */ 542 */
562}; 543};
563 544
545static inline bool ea_indirect(struct extended_attribute *ea)
546{
547 return ea->flags & EA_indirect;
548}
549
550static inline bool ea_in_anode(struct extended_attribute *ea)
551{
552 return ea->flags & EA_anode;
553}
554
564/* 555/*
565 Local Variables: 556 Local Variables:
566 comment-column: 40 557 comment-column: 40
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 6d2d5008fa43..c07ef1f1ced6 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -75,7 +75,7 @@ struct hpfs_sb_info {
75 unsigned char *sb_cp_table; /* code page tables: */ 75 unsigned char *sb_cp_table; /* code page tables: */
76 /* 128 bytes uppercasing table & */ 76 /* 128 bytes uppercasing table & */
77 /* 128 bytes lowercasing table */ 77 /* 128 bytes lowercasing table */
78 unsigned *sb_bmp_dir; /* main bitmap directory */ 78 __le32 *sb_bmp_dir; /* main bitmap directory */
79 unsigned sb_c_bitmap; /* current bitmap */ 79 unsigned sb_c_bitmap; /* current bitmap */
80 unsigned sb_max_fwd_alloc; /* max forwad allocation */ 80 unsigned sb_max_fwd_alloc; /* max forwad allocation */
81 int sb_timeshift; 81 int sb_timeshift;
@@ -93,7 +93,7 @@ struct quad_buffer_head {
93static inline dnode_secno de_down_pointer (struct hpfs_dirent *de) 93static inline dnode_secno de_down_pointer (struct hpfs_dirent *de)
94{ 94{
95 CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n")); 95 CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n"));
96 return le32_to_cpu(*(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4)); 96 return le32_to_cpu(*(__le32 *) ((void *) de + le16_to_cpu(de->length) - 4));
97} 97}
98 98
99/* The first dir entry in a dnode */ 99/* The first dir entry in a dnode */
@@ -141,12 +141,12 @@ static inline struct extended_attribute *next_ea(struct extended_attribute *ea)
141 141
142static inline secno ea_sec(struct extended_attribute *ea) 142static inline secno ea_sec(struct extended_attribute *ea)
143{ 143{
144 return le32_to_cpu(get_unaligned((secno *)((char *)ea + 9 + ea->namelen))); 144 return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 9 + ea->namelen)));
145} 145}
146 146
147static inline secno ea_len(struct extended_attribute *ea) 147static inline secno ea_len(struct extended_attribute *ea)
148{ 148{
149 return le32_to_cpu(get_unaligned((secno *)((char *)ea + 5 + ea->namelen))); 149 return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 5 + ea->namelen)));
150} 150}
151 151
152static inline char *ea_data(struct extended_attribute *ea) 152static inline char *ea_data(struct extended_attribute *ea)
@@ -171,7 +171,7 @@ static inline void copy_de(struct hpfs_dirent *dst, struct hpfs_dirent *src)
171 dst->not_8x3 = n; 171 dst->not_8x3 = n;
172} 172}
173 173
174static inline unsigned tstbits(u32 *bmp, unsigned b, unsigned n) 174static inline unsigned tstbits(__le32 *bmp, unsigned b, unsigned n)
175{ 175{
176 int i; 176 int i;
177 if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n; 177 if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n;
@@ -268,10 +268,10 @@ void hpfs_evict_inode(struct inode *);
268 268
269/* map.c */ 269/* map.c */
270 270
271unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); 271__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
272unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); 272__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
273unsigned char *hpfs_load_code_page(struct super_block *, secno); 273unsigned char *hpfs_load_code_page(struct super_block *, secno);
274secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp); 274__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
275struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); 275struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
276struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); 276struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
277struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *); 277struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index b43066cbdc6a..ed671e0ea784 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -110,7 +110,7 @@ void hpfs_read_inode(struct inode *i)
110 } 110 }
111 } 111 }
112 } 112 }
113 if (fnode->dirflag) { 113 if (fnode_is_dir(fnode)) {
114 int n_dnodes, n_subdirs; 114 int n_dnodes, n_subdirs;
115 i->i_mode |= S_IFDIR; 115 i->i_mode |= S_IFDIR;
116 i->i_op = &hpfs_dir_iops; 116 i->i_op = &hpfs_dir_iops;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index a790821366a7..4acb19d78359 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -8,12 +8,12 @@
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10 10
11unsigned *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh) 11__le32 *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh)
12{ 12{
13 return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0); 13 return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0);
14} 14}
15 15
16unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, 16__le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
17 struct quad_buffer_head *qbh, char *id) 17 struct quad_buffer_head *qbh, char *id)
18{ 18{
19 secno sec; 19 secno sec;
@@ -89,18 +89,18 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
89 return cp_table; 89 return cp_table;
90} 90}
91 91
92secno *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) 92__le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
93{ 93{
94 struct buffer_head *bh; 94 struct buffer_head *bh;
95 int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21; 95 int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21;
96 int i; 96 int i;
97 secno *b; 97 __le32 *b;
98 if (!(b = kmalloc(n * 512, GFP_KERNEL))) { 98 if (!(b = kmalloc(n * 512, GFP_KERNEL))) {
99 printk("HPFS: can't allocate memory for bitmap directory\n"); 99 printk("HPFS: can't allocate memory for bitmap directory\n");
100 return NULL; 100 return NULL;
101 } 101 }
102 for (i=0;i<n;i++) { 102 for (i=0;i<n;i++) {
103 secno *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1); 103 __le32 *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1);
104 if (!d) { 104 if (!d) {
105 kfree(b); 105 kfree(b);
106 return NULL; 106 return NULL;
@@ -130,16 +130,16 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea
130 (unsigned long)ino); 130 (unsigned long)ino);
131 goto bail; 131 goto bail;
132 } 132 }
133 if (!fnode->dirflag) { 133 if (!fnode_is_dir(fnode)) {
134 if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes != 134 if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes !=
135 (fnode->btree.internal ? 12 : 8)) { 135 (bp_internal(&fnode->btree) ? 12 : 8)) {
136 hpfs_error(s, 136 hpfs_error(s,
137 "bad number of nodes in fnode %08lx", 137 "bad number of nodes in fnode %08lx",
138 (unsigned long)ino); 138 (unsigned long)ino);
139 goto bail; 139 goto bail;
140 } 140 }
141 if (le16_to_cpu(fnode->btree.first_free) != 141 if (le16_to_cpu(fnode->btree.first_free) !=
142 8 + fnode->btree.n_used_nodes * (fnode->btree.internal ? 8 : 12)) { 142 8 + fnode->btree.n_used_nodes * (bp_internal(&fnode->btree) ? 8 : 12)) {
143 hpfs_error(s, 143 hpfs_error(s,
144 "bad first_free pointer in fnode %08lx", 144 "bad first_free pointer in fnode %08lx",
145 (unsigned long)ino); 145 (unsigned long)ino);
@@ -187,12 +187,12 @@ struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buff
187 goto bail; 187 goto bail;
188 } 188 }
189 if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes != 189 if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes !=
190 (anode->btree.internal ? 60 : 40)) { 190 (bp_internal(&anode->btree) ? 60 : 40)) {
191 hpfs_error(s, "bad number of nodes in anode %08x", ano); 191 hpfs_error(s, "bad number of nodes in anode %08x", ano);
192 goto bail; 192 goto bail;
193 } 193 }
194 if (le16_to_cpu(anode->btree.first_free) != 194 if (le16_to_cpu(anode->btree.first_free) !=
195 8 + anode->btree.n_used_nodes * (anode->btree.internal ? 8 : 12)) { 195 8 + anode->btree.n_used_nodes * (bp_internal(&anode->btree) ? 8 : 12)) {
196 hpfs_error(s, "bad first_free pointer in anode %08x", ano); 196 hpfs_error(s, "bad first_free pointer in anode %08x", ano);
197 goto bail; 197 goto bail;
198 } 198 }
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 30dd7b10b507..9083ef8af58c 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -70,7 +70,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
70 fnode->len = len; 70 fnode->len = len;
71 memcpy(fnode->name, name, len > 15 ? 15 : len); 71 memcpy(fnode->name, name, len > 15 ? 15 : len);
72 fnode->up = cpu_to_le32(dir->i_ino); 72 fnode->up = cpu_to_le32(dir->i_ino);
73 fnode->dirflag = 1; 73 fnode->flags |= FNODE_dir;
74 fnode->btree.n_free_nodes = 7; 74 fnode->btree.n_free_nodes = 7;
75 fnode->btree.n_used_nodes = 1; 75 fnode->btree.n_used_nodes = 1;
76 fnode->btree.first_free = cpu_to_le16(0x14); 76 fnode->btree.first_free = cpu_to_le16(0x14);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 54f6eccb79d9..706a12c083ea 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -572,7 +572,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
572 mark_buffer_dirty(bh2); 572 mark_buffer_dirty(bh2);
573 } 573 }
574 574
575 if (le32_to_cpu(spareblock->hotfixes_used) || le32_to_cpu(spareblock->n_spares_used)) { 575 if (spareblock->hotfixes_used || spareblock->n_spares_used) {
576 if (errs >= 2) { 576 if (errs >= 2) {
577 printk("HPFS: Hotfixes not supported here, try chkdsk\n"); 577 printk("HPFS: Hotfixes not supported here, try chkdsk\n");
578 mark_dirty(s, 0); 578 mark_dirty(s, 0);
@@ -645,7 +645,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
645 root->i_mtime.tv_nsec = 0; 645 root->i_mtime.tv_nsec = 0;
646 root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date)); 646 root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date));
647 root->i_ctime.tv_nsec = 0; 647 root->i_ctime.tv_nsec = 0;
648 hpfs_i(root)->i_ea_size = le16_to_cpu(de->ea_size); 648 hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size);
649 hpfs_i(root)->i_parent_dir = root->i_ino; 649 hpfs_i(root)->i_parent_dir = root->i_ino;
650 if (root->i_size == -1) 650 if (root->i_size == -1)
651 root->i_size = 2048; 651 root->i_size = 2048;
diff --git a/fs/inode.c b/fs/inode.c
index c474c1d7062b..c99163b1b310 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1487,10 +1487,30 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1487 return 0; 1487 return 0;
1488} 1488}
1489 1489
1490/*
1491 * This does the actual work of updating an inodes time or version. Must have
1492 * had called mnt_want_write() before calling this.
1493 */
1494static int update_time(struct inode *inode, struct timespec *time, int flags)
1495{
1496 if (inode->i_op->update_time)
1497 return inode->i_op->update_time(inode, time, flags);
1498
1499 if (flags & S_ATIME)
1500 inode->i_atime = *time;
1501 if (flags & S_VERSION)
1502 inode_inc_iversion(inode);
1503 if (flags & S_CTIME)
1504 inode->i_ctime = *time;
1505 if (flags & S_MTIME)
1506 inode->i_mtime = *time;
1507 mark_inode_dirty_sync(inode);
1508 return 0;
1509}
1510
1490/** 1511/**
1491 * touch_atime - update the access time 1512 * touch_atime - update the access time
1492 * @mnt: mount the inode is accessed on 1513 * @path: the &struct path to update
1493 * @dentry: dentry accessed
1494 * 1514 *
1495 * Update the accessed time on an inode and mark it for writeback. 1515 * Update the accessed time on an inode and mark it for writeback.
1496 * This function automatically handles read only file systems and media, 1516 * This function automatically handles read only file systems and media,
@@ -1525,12 +1545,83 @@ void touch_atime(struct path *path)
1525 if (mnt_want_write(mnt)) 1545 if (mnt_want_write(mnt))
1526 return; 1546 return;
1527 1547
1528 inode->i_atime = now; 1548 /*
1529 mark_inode_dirty_sync(inode); 1549 * File systems can error out when updating inodes if they need to
1550 * allocate new space to modify an inode (such is the case for
1551 * Btrfs), but since we touch atime while walking down the path we
1552 * really don't care if we failed to update the atime of the file,
1553 * so just ignore the return value.
1554 */
1555 update_time(inode, &now, S_ATIME);
1530 mnt_drop_write(mnt); 1556 mnt_drop_write(mnt);
1531} 1557}
1532EXPORT_SYMBOL(touch_atime); 1558EXPORT_SYMBOL(touch_atime);
1533 1559
1560/*
1561 * The logic we want is
1562 *
1563 * if suid or (sgid and xgrp)
1564 * remove privs
1565 */
1566int should_remove_suid(struct dentry *dentry)
1567{
1568 umode_t mode = dentry->d_inode->i_mode;
1569 int kill = 0;
1570
1571 /* suid always must be killed */
1572 if (unlikely(mode & S_ISUID))
1573 kill = ATTR_KILL_SUID;
1574
1575 /*
1576 * sgid without any exec bits is just a mandatory locking mark; leave
1577 * it alone. If some exec bits are set, it's a real sgid; kill it.
1578 */
1579 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1580 kill |= ATTR_KILL_SGID;
1581
1582 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1583 return kill;
1584
1585 return 0;
1586}
1587EXPORT_SYMBOL(should_remove_suid);
1588
1589static int __remove_suid(struct dentry *dentry, int kill)
1590{
1591 struct iattr newattrs;
1592
1593 newattrs.ia_valid = ATTR_FORCE | kill;
1594 return notify_change(dentry, &newattrs);
1595}
1596
1597int file_remove_suid(struct file *file)
1598{
1599 struct dentry *dentry = file->f_path.dentry;
1600 struct inode *inode = dentry->d_inode;
1601 int killsuid;
1602 int killpriv;
1603 int error = 0;
1604
1605 /* Fast path for nothing security related */
1606 if (IS_NOSEC(inode))
1607 return 0;
1608
1609 killsuid = should_remove_suid(dentry);
1610 killpriv = security_inode_need_killpriv(dentry);
1611
1612 if (killpriv < 0)
1613 return killpriv;
1614 if (killpriv)
1615 error = security_inode_killpriv(dentry);
1616 if (!error && killsuid)
1617 error = __remove_suid(dentry, killsuid);
1618 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
1619 inode->i_flags |= S_NOSEC;
1620
1621 return error;
1622}
1623EXPORT_SYMBOL(file_remove_suid);
1624
1534/** 1625/**
1535 * file_update_time - update mtime and ctime time 1626 * file_update_time - update mtime and ctime time
1536 * @file: file accessed 1627 * @file: file accessed
@@ -1540,18 +1631,20 @@ EXPORT_SYMBOL(touch_atime);
1540 * usage in the file write path of filesystems, and filesystems may 1631 * usage in the file write path of filesystems, and filesystems may
1541 * choose to explicitly ignore update via this function with the 1632 * choose to explicitly ignore update via this function with the
1542 * S_NOCMTIME inode flag, e.g. for network filesystem where these 1633 * S_NOCMTIME inode flag, e.g. for network filesystem where these
1543 * timestamps are handled by the server. 1634 * timestamps are handled by the server. This can return an error for
1635 * file systems who need to allocate space in order to update an inode.
1544 */ 1636 */
1545 1637
1546void file_update_time(struct file *file) 1638int file_update_time(struct file *file)
1547{ 1639{
1548 struct inode *inode = file->f_path.dentry->d_inode; 1640 struct inode *inode = file->f_path.dentry->d_inode;
1549 struct timespec now; 1641 struct timespec now;
1550 enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; 1642 int sync_it = 0;
1643 int ret;
1551 1644
1552 /* First try to exhaust all avenues to not sync */ 1645 /* First try to exhaust all avenues to not sync */
1553 if (IS_NOCMTIME(inode)) 1646 if (IS_NOCMTIME(inode))
1554 return; 1647 return 0;
1555 1648
1556 now = current_fs_time(inode->i_sb); 1649 now = current_fs_time(inode->i_sb);
1557 if (!timespec_equal(&inode->i_mtime, &now)) 1650 if (!timespec_equal(&inode->i_mtime, &now))
@@ -1564,21 +1657,16 @@ void file_update_time(struct file *file)
1564 sync_it |= S_VERSION; 1657 sync_it |= S_VERSION;
1565 1658
1566 if (!sync_it) 1659 if (!sync_it)
1567 return; 1660 return 0;
1568 1661
1569 /* Finally allowed to write? Takes lock. */ 1662 /* Finally allowed to write? Takes lock. */
1570 if (mnt_want_write_file(file)) 1663 if (mnt_want_write_file(file))
1571 return; 1664 return 0;
1572 1665
1573 /* Only change inode inside the lock region */ 1666 ret = update_time(inode, &now, sync_it);
1574 if (sync_it & S_VERSION)
1575 inode_inc_iversion(inode);
1576 if (sync_it & S_CTIME)
1577 inode->i_ctime = now;
1578 if (sync_it & S_MTIME)
1579 inode->i_mtime = now;
1580 mark_inode_dirty_sync(inode);
1581 mnt_drop_write_file(file); 1667 mnt_drop_write_file(file);
1668
1669 return ret;
1582} 1670}
1583EXPORT_SYMBOL(file_update_time); 1671EXPORT_SYMBOL(file_update_time);
1584 1672
diff --git a/fs/internal.h b/fs/internal.h
index 9962c59ba280..18bc216ea09d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -56,7 +56,7 @@ extern int sb_prepare_remount_readonly(struct super_block *);
56 56
57extern void __init mnt_init(void); 57extern void __init mnt_init(void);
58 58
59DECLARE_BRLOCK(vfsmount_lock); 59extern struct lglock vfsmount_lock;
60 60
61 61
62/* 62/*
@@ -100,6 +100,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
100 100
101extern long do_handle_open(int mountdirfd, 101extern long do_handle_open(int mountdirfd,
102 struct file_handle __user *ufh, int open_flag); 102 struct file_handle __user *ufh, int open_flag);
103extern int open_check_o_direct(struct file *f);
103 104
104/* 105/*
105 * inode.c 106 * inode.c
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index dd4687ff30d0..aa4356d09eee 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -107,12 +107,11 @@ static struct dentry *isofs_export_get_parent(struct dentry *child)
107} 107}
108 108
109static int 109static int
110isofs_export_encode_fh(struct dentry *dentry, 110isofs_export_encode_fh(struct inode *inode,
111 __u32 *fh32, 111 __u32 *fh32,
112 int *max_len, 112 int *max_len,
113 int connectable) 113 struct inode *parent)
114{ 114{
115 struct inode * inode = dentry->d_inode;
116 struct iso_inode_info * ei = ISOFS_I(inode); 115 struct iso_inode_info * ei = ISOFS_I(inode);
117 int len = *max_len; 116 int len = *max_len;
118 int type = 1; 117 int type = 1;
@@ -124,7 +123,7 @@ isofs_export_encode_fh(struct dentry *dentry,
124 * offset of the inode and the upper 16 bits of fh32[1] to 123 * offset of the inode and the upper 16 bits of fh32[1] to
125 * hold the offset of the parent. 124 * hold the offset of the parent.
126 */ 125 */
127 if (connectable && (len < 5)) { 126 if (parent && (len < 5)) {
128 *max_len = 5; 127 *max_len = 5;
129 return 255; 128 return 255;
130 } else if (len < 3) { 129 } else if (len < 3) {
@@ -136,16 +135,12 @@ isofs_export_encode_fh(struct dentry *dentry,
136 fh32[0] = ei->i_iget5_block; 135 fh32[0] = ei->i_iget5_block;
137 fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */ 136 fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */
138 fh32[2] = inode->i_generation; 137 fh32[2] = inode->i_generation;
139 if (connectable && !S_ISDIR(inode->i_mode)) { 138 if (parent) {
140 struct inode *parent;
141 struct iso_inode_info *eparent; 139 struct iso_inode_info *eparent;
142 spin_lock(&dentry->d_lock);
143 parent = dentry->d_parent->d_inode;
144 eparent = ISOFS_I(parent); 140 eparent = ISOFS_I(parent);
145 fh32[3] = eparent->i_iget5_block; 141 fh32[3] = eparent->i_iget5_block;
146 fh16[3] = (__u16)eparent->i_iget5_offset; /* fh16 [sic] */ 142 fh16[3] = (__u16)eparent->i_iget5_offset; /* fh16 [sic] */
147 fh32[4] = parent->i_generation; 143 fh32[4] = parent->i_generation;
148 spin_unlock(&dentry->d_lock);
149 len = 5; 144 len = 5;
150 type = 2; 145 type = 2;
151 } 146 }
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index f32f346f4b0a..69a48c2944da 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -1,6 +1,8 @@
1config JBD2 1config JBD2
2 tristate 2 tristate
3 select CRC32 3 select CRC32
4 select CRYPTO
5 select CRYPTO_CRC32C
4 help 6 help
5 This is a generic journaling layer for block devices that support 7 This is a generic journaling layer for block devices that support
6 both 32-bit and 64-bit block numbers. It is currently used by 8 both 32-bit and 64-bit block numbers. It is currently used by
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 840f70f50792..216f4299f65e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -85,6 +85,24 @@ nope:
85 __brelse(bh); 85 __brelse(bh);
86} 86}
87 87
88static void jbd2_commit_block_csum_set(journal_t *j,
89 struct journal_head *descriptor)
90{
91 struct commit_header *h;
92 __u32 csum;
93
94 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
95 return;
96
97 h = (struct commit_header *)(jh2bh(descriptor)->b_data);
98 h->h_chksum_type = 0;
99 h->h_chksum_size = 0;
100 h->h_chksum[0] = 0;
101 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
102 j->j_blocksize);
103 h->h_chksum[0] = cpu_to_be32(csum);
104}
105
88/* 106/*
89 * Done it all: now submit the commit record. We should have 107 * Done it all: now submit the commit record. We should have
90 * cleaned up our previous buffers by now, so if we are in abort 108 * cleaned up our previous buffers by now, so if we are in abort
@@ -128,6 +146,7 @@ static int journal_submit_commit_record(journal_t *journal,
128 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 146 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
129 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 147 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
130 } 148 }
149 jbd2_commit_block_csum_set(journal, descriptor);
131 150
132 JBUFFER_TRACE(descriptor, "submit commit block"); 151 JBUFFER_TRACE(descriptor, "submit commit block");
133 lock_buffer(bh); 152 lock_buffer(bh);
@@ -301,6 +320,44 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
301 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 320 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
302} 321}
303 322
323static void jbd2_descr_block_csum_set(journal_t *j,
324 struct journal_head *descriptor)
325{
326 struct jbd2_journal_block_tail *tail;
327 __u32 csum;
328
329 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
330 return;
331
332 tail = (struct jbd2_journal_block_tail *)
333 (jh2bh(descriptor)->b_data + j->j_blocksize -
334 sizeof(struct jbd2_journal_block_tail));
335 tail->t_checksum = 0;
336 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
337 j->j_blocksize);
338 tail->t_checksum = cpu_to_be32(csum);
339}
340
341static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
342 struct buffer_head *bh, __u32 sequence)
343{
344 struct page *page = bh->b_page;
345 __u8 *addr;
346 __u32 csum;
347
348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
349 return;
350
351 sequence = cpu_to_be32(sequence);
352 addr = kmap_atomic(page, KM_USER0);
353 csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
354 sizeof(sequence));
355 csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
356 bh->b_size);
357 kunmap_atomic(addr, KM_USER0);
358
359 tag->t_checksum = cpu_to_be32(csum);
360}
304/* 361/*
305 * jbd2_journal_commit_transaction 362 * jbd2_journal_commit_transaction
306 * 363 *
@@ -334,6 +391,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
334 unsigned long first_block; 391 unsigned long first_block;
335 tid_t first_tid; 392 tid_t first_tid;
336 int update_tail; 393 int update_tail;
394 int csum_size = 0;
395
396 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
397 csum_size = sizeof(struct jbd2_journal_block_tail);
337 398
338 /* 399 /*
339 * First job: lock down the current transaction and wait for 400 * First job: lock down the current transaction and wait for
@@ -627,7 +688,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
627 688
628 tag = (journal_block_tag_t *) tagp; 689 tag = (journal_block_tag_t *) tagp;
629 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 690 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
630 tag->t_flags = cpu_to_be32(tag_flag); 691 tag->t_flags = cpu_to_be16(tag_flag);
692 jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
693 commit_transaction->t_tid);
631 tagp += tag_bytes; 694 tagp += tag_bytes;
632 space_left -= tag_bytes; 695 space_left -= tag_bytes;
633 696
@@ -643,7 +706,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
643 706
644 if (bufs == journal->j_wbufsize || 707 if (bufs == journal->j_wbufsize ||
645 commit_transaction->t_buffers == NULL || 708 commit_transaction->t_buffers == NULL ||
646 space_left < tag_bytes + 16) { 709 space_left < tag_bytes + 16 + csum_size) {
647 710
648 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); 711 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
649 712
@@ -651,8 +714,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
651 submitting the IOs. "tag" still points to 714 submitting the IOs. "tag" still points to
652 the last tag we set up. */ 715 the last tag we set up. */
653 716
654 tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); 717 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
655 718
719 jbd2_descr_block_csum_set(journal, descriptor);
656start_journal_io: 720start_journal_io:
657 for (i = 0; i < bufs; i++) { 721 for (i = 0; i < bufs; i++) {
658 struct buffer_head *bh = wbuf[i]; 722 struct buffer_head *bh = wbuf[i];
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1afb701622b0..e9a3c4c85594 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -97,6 +97,43 @@ EXPORT_SYMBOL(jbd2_inode_cache);
97static void __journal_abort_soft (journal_t *journal, int errno); 97static void __journal_abort_soft (journal_t *journal, int errno);
98static int jbd2_journal_create_slab(size_t slab_size); 98static int jbd2_journal_create_slab(size_t slab_size);
99 99
100/* Checksumming functions */
101int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
102{
103 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
104 return 1;
105
106 return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
107}
108
109static __u32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
110{
111 __u32 csum, old_csum;
112
113 old_csum = sb->s_checksum;
114 sb->s_checksum = 0;
115 csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
116 sb->s_checksum = old_csum;
117
118 return cpu_to_be32(csum);
119}
120
121int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
122{
123 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
124 return 1;
125
126 return sb->s_checksum == jbd2_superblock_csum(j, sb);
127}
128
129void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
130{
131 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
132 return;
133
134 sb->s_checksum = jbd2_superblock_csum(j, sb);
135}
136
100/* 137/*
101 * Helper function used to manage commit timeouts 138 * Helper function used to manage commit timeouts
102 */ 139 */
@@ -1348,6 +1385,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal)
1348 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", 1385 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
1349 journal->j_errno); 1386 journal->j_errno);
1350 sb->s_errno = cpu_to_be32(journal->j_errno); 1387 sb->s_errno = cpu_to_be32(journal->j_errno);
1388 jbd2_superblock_csum_set(journal, sb);
1351 read_unlock(&journal->j_state_lock); 1389 read_unlock(&journal->j_state_lock);
1352 1390
1353 jbd2_write_superblock(journal, WRITE_SYNC); 1391 jbd2_write_superblock(journal, WRITE_SYNC);
@@ -1376,6 +1414,9 @@ static int journal_get_superblock(journal_t *journal)
1376 } 1414 }
1377 } 1415 }
1378 1416
1417 if (buffer_verified(bh))
1418 return 0;
1419
1379 sb = journal->j_superblock; 1420 sb = journal->j_superblock;
1380 1421
1381 err = -EINVAL; 1422 err = -EINVAL;
@@ -1413,6 +1454,43 @@ static int journal_get_superblock(journal_t *journal)
1413 goto out; 1454 goto out;
1414 } 1455 }
1415 1456
1457 if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
1458 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
1459 /* Can't have checksum v1 and v2 on at the same time! */
1460 printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 "
1461 "at the same time!\n");
1462 goto out;
1463 }
1464
1465 if (!jbd2_verify_csum_type(journal, sb)) {
1466 printk(KERN_ERR "JBD: Unknown checksum type\n");
1467 goto out;
1468 }
1469
1470 /* Load the checksum driver */
1471 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
1472 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
1473 if (IS_ERR(journal->j_chksum_driver)) {
1474 printk(KERN_ERR "JBD: Cannot load crc32c driver.\n");
1475 err = PTR_ERR(journal->j_chksum_driver);
1476 journal->j_chksum_driver = NULL;
1477 goto out;
1478 }
1479 }
1480
1481 /* Check superblock checksum */
1482 if (!jbd2_superblock_csum_verify(journal, sb)) {
1483 printk(KERN_ERR "JBD: journal checksum error\n");
1484 goto out;
1485 }
1486
1487 /* Precompute checksum seed for all metadata */
1488 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
1489 journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
1490 sizeof(sb->s_uuid));
1491
1492 set_buffer_verified(bh);
1493
1416 return 0; 1494 return 0;
1417 1495
1418out: 1496out:
@@ -1564,6 +1642,8 @@ int jbd2_journal_destroy(journal_t *journal)
1564 iput(journal->j_inode); 1642 iput(journal->j_inode);
1565 if (journal->j_revoke) 1643 if (journal->j_revoke)
1566 jbd2_journal_destroy_revoke(journal); 1644 jbd2_journal_destroy_revoke(journal);
1645 if (journal->j_chksum_driver)
1646 crypto_free_shash(journal->j_chksum_driver);
1567 kfree(journal->j_wbuf); 1647 kfree(journal->j_wbuf);
1568 kfree(journal); 1648 kfree(journal);
1569 1649
@@ -1653,6 +1733,10 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
1653int jbd2_journal_set_features (journal_t *journal, unsigned long compat, 1733int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1654 unsigned long ro, unsigned long incompat) 1734 unsigned long ro, unsigned long incompat)
1655{ 1735{
1736#define INCOMPAT_FEATURE_ON(f) \
1737 ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
1738#define COMPAT_FEATURE_ON(f) \
1739 ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
1656 journal_superblock_t *sb; 1740 journal_superblock_t *sb;
1657 1741
1658 if (jbd2_journal_check_used_features(journal, compat, ro, incompat)) 1742 if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
@@ -1661,16 +1745,54 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1661 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) 1745 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1662 return 0; 1746 return 0;
1663 1747
1748 /* Asking for checksumming v2 and v1? Only give them v2. */
1749 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 &&
1750 compat & JBD2_FEATURE_COMPAT_CHECKSUM)
1751 compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
1752
1664 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", 1753 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1665 compat, ro, incompat); 1754 compat, ro, incompat);
1666 1755
1667 sb = journal->j_superblock; 1756 sb = journal->j_superblock;
1668 1757
1758 /* If enabling v2 checksums, update superblock */
1759 if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
1760 sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
1761 sb->s_feature_compat &=
1762 ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
1763
1764 /* Load the checksum driver */
1765 if (journal->j_chksum_driver == NULL) {
1766 journal->j_chksum_driver = crypto_alloc_shash("crc32c",
1767 0, 0);
1768 if (IS_ERR(journal->j_chksum_driver)) {
1769 printk(KERN_ERR "JBD: Cannot load crc32c "
1770 "driver.\n");
1771 journal->j_chksum_driver = NULL;
1772 return 0;
1773 }
1774 }
1775
1776 /* Precompute checksum seed for all metadata */
1777 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
1778 JBD2_FEATURE_INCOMPAT_CSUM_V2))
1779 journal->j_csum_seed = jbd2_chksum(journal, ~0,
1780 sb->s_uuid,
1781 sizeof(sb->s_uuid));
1782 }
1783
1784 /* If enabling v1 checksums, downgrade superblock */
1785 if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
1786 sb->s_feature_incompat &=
1787 ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2);
1788
1669 sb->s_feature_compat |= cpu_to_be32(compat); 1789 sb->s_feature_compat |= cpu_to_be32(compat);
1670 sb->s_feature_ro_compat |= cpu_to_be32(ro); 1790 sb->s_feature_ro_compat |= cpu_to_be32(ro);
1671 sb->s_feature_incompat |= cpu_to_be32(incompat); 1791 sb->s_feature_incompat |= cpu_to_be32(incompat);
1672 1792
1673 return 1; 1793 return 1;
1794#undef COMPAT_FEATURE_ON
1795#undef INCOMPAT_FEATURE_ON
1674} 1796}
1675 1797
1676/* 1798/*
@@ -1975,10 +2097,16 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
1975 */ 2097 */
1976size_t journal_tag_bytes(journal_t *journal) 2098size_t journal_tag_bytes(journal_t *journal)
1977{ 2099{
2100 journal_block_tag_t tag;
2101 size_t x = 0;
2102
2103 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
2104 x += sizeof(tag.t_checksum);
2105
1978 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) 2106 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
1979 return JBD2_TAG_SIZE64; 2107 return x + JBD2_TAG_SIZE64;
1980 else 2108 else
1981 return JBD2_TAG_SIZE32; 2109 return x + JBD2_TAG_SIZE32;
1982} 2110}
1983 2111
1984/* 2112/*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index c1a03354a22f..0131e4362534 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -174,6 +174,25 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
174 return 0; 174 return 0;
175} 175}
176 176
177static int jbd2_descr_block_csum_verify(journal_t *j,
178 void *buf)
179{
180 struct jbd2_journal_block_tail *tail;
181 __u32 provided, calculated;
182
183 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
184 return 1;
185
186 tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
187 sizeof(struct jbd2_journal_block_tail));
188 provided = tail->t_checksum;
189 tail->t_checksum = 0;
190 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
191 tail->t_checksum = provided;
192
193 provided = be32_to_cpu(provided);
194 return provided == calculated;
195}
177 196
178/* 197/*
179 * Count the number of in-use tags in a journal descriptor block. 198 * Count the number of in-use tags in a journal descriptor block.
@@ -186,6 +205,9 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
186 int nr = 0, size = journal->j_blocksize; 205 int nr = 0, size = journal->j_blocksize;
187 int tag_bytes = journal_tag_bytes(journal); 206 int tag_bytes = journal_tag_bytes(journal);
188 207
208 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
209 size -= sizeof(struct jbd2_journal_block_tail);
210
189 tagp = &bh->b_data[sizeof(journal_header_t)]; 211 tagp = &bh->b_data[sizeof(journal_header_t)];
190 212
191 while ((tagp - bh->b_data + tag_bytes) <= size) { 213 while ((tagp - bh->b_data + tag_bytes) <= size) {
@@ -193,10 +215,10 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
193 215
194 nr++; 216 nr++;
195 tagp += tag_bytes; 217 tagp += tag_bytes;
196 if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID))) 218 if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
197 tagp += 16; 219 tagp += 16;
198 220
199 if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG)) 221 if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
200 break; 222 break;
201 } 223 }
202 224
@@ -353,6 +375,41 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
353 return 0; 375 return 0;
354} 376}
355 377
378static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
379{
380 struct commit_header *h;
381 __u32 provided, calculated;
382
383 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
384 return 1;
385
386 h = buf;
387 provided = h->h_chksum[0];
388 h->h_chksum[0] = 0;
389 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
390 h->h_chksum[0] = provided;
391
392 provided = be32_to_cpu(provided);
393 return provided == calculated;
394}
395
396static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
397 void *buf, __u32 sequence)
398{
399 __u32 provided, calculated;
400
401 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
402 return 1;
403
404 sequence = cpu_to_be32(sequence);
405 calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
406 sizeof(sequence));
407 calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize);
408 provided = be32_to_cpu(tag->t_checksum);
409
410 return provided == cpu_to_be32(calculated);
411}
412
356static int do_one_pass(journal_t *journal, 413static int do_one_pass(journal_t *journal,
357 struct recovery_info *info, enum passtype pass) 414 struct recovery_info *info, enum passtype pass)
358{ 415{
@@ -366,6 +423,7 @@ static int do_one_pass(journal_t *journal,
366 int blocktype; 423 int blocktype;
367 int tag_bytes = journal_tag_bytes(journal); 424 int tag_bytes = journal_tag_bytes(journal);
368 __u32 crc32_sum = ~0; /* Transactional Checksums */ 425 __u32 crc32_sum = ~0; /* Transactional Checksums */
426 int descr_csum_size = 0;
369 427
370 /* 428 /*
371 * First thing is to establish what we expect to find in the log 429 * First thing is to establish what we expect to find in the log
@@ -451,6 +509,18 @@ static int do_one_pass(journal_t *journal,
451 509
452 switch(blocktype) { 510 switch(blocktype) {
453 case JBD2_DESCRIPTOR_BLOCK: 511 case JBD2_DESCRIPTOR_BLOCK:
512 /* Verify checksum first */
513 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
514 JBD2_FEATURE_INCOMPAT_CSUM_V2))
515 descr_csum_size =
516 sizeof(struct jbd2_journal_block_tail);
517 if (descr_csum_size > 0 &&
518 !jbd2_descr_block_csum_verify(journal,
519 bh->b_data)) {
520 err = -EIO;
521 goto failed;
522 }
523
454 /* If it is a valid descriptor block, replay it 524 /* If it is a valid descriptor block, replay it
455 * in pass REPLAY; if journal_checksums enabled, then 525 * in pass REPLAY; if journal_checksums enabled, then
456 * calculate checksums in PASS_SCAN, otherwise, 526 * calculate checksums in PASS_SCAN, otherwise,
@@ -481,11 +551,11 @@ static int do_one_pass(journal_t *journal,
481 551
482 tagp = &bh->b_data[sizeof(journal_header_t)]; 552 tagp = &bh->b_data[sizeof(journal_header_t)];
483 while ((tagp - bh->b_data + tag_bytes) 553 while ((tagp - bh->b_data + tag_bytes)
484 <= journal->j_blocksize) { 554 <= journal->j_blocksize - descr_csum_size) {
485 unsigned long io_block; 555 unsigned long io_block;
486 556
487 tag = (journal_block_tag_t *) tagp; 557 tag = (journal_block_tag_t *) tagp;
488 flags = be32_to_cpu(tag->t_flags); 558 flags = be16_to_cpu(tag->t_flags);
489 559
490 io_block = next_log_block++; 560 io_block = next_log_block++;
491 wrap(journal, next_log_block); 561 wrap(journal, next_log_block);
@@ -516,6 +586,19 @@ static int do_one_pass(journal_t *journal,
516 goto skip_write; 586 goto skip_write;
517 } 587 }
518 588
589 /* Look for block corruption */
590 if (!jbd2_block_tag_csum_verify(
591 journal, tag, obh->b_data,
592 be32_to_cpu(tmp->h_sequence))) {
593 brelse(obh);
594 success = -EIO;
595 printk(KERN_ERR "JBD: Invalid "
596 "checksum recovering "
597 "block %llu in log\n",
598 blocknr);
599 continue;
600 }
601
519 /* Find a buffer for the new 602 /* Find a buffer for the new
520 * data being restored */ 603 * data being restored */
521 nbh = __getblk(journal->j_fs_dev, 604 nbh = __getblk(journal->j_fs_dev,
@@ -650,6 +733,19 @@ static int do_one_pass(journal_t *journal,
650 } 733 }
651 crc32_sum = ~0; 734 crc32_sum = ~0;
652 } 735 }
736 if (pass == PASS_SCAN &&
737 !jbd2_commit_block_csum_verify(journal,
738 bh->b_data)) {
739 info->end_transaction = next_commit_ID;
740
741 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
742 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
743 journal->j_failed_commit =
744 next_commit_ID;
745 brelse(bh);
746 break;
747 }
748 }
653 brelse(bh); 749 brelse(bh);
654 next_commit_ID++; 750 next_commit_ID++;
655 continue; 751 continue;
@@ -706,6 +802,25 @@ static int do_one_pass(journal_t *journal,
706 return err; 802 return err;
707} 803}
708 804
805static int jbd2_revoke_block_csum_verify(journal_t *j,
806 void *buf)
807{
808 struct jbd2_journal_revoke_tail *tail;
809 __u32 provided, calculated;
810
811 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
812 return 1;
813
814 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
815 sizeof(struct jbd2_journal_revoke_tail));
816 provided = tail->r_checksum;
817 tail->r_checksum = 0;
818 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
819 tail->r_checksum = provided;
820
821 provided = be32_to_cpu(provided);
822 return provided == calculated;
823}
709 824
710/* Scan a revoke record, marking all blocks mentioned as revoked. */ 825/* Scan a revoke record, marking all blocks mentioned as revoked. */
711 826
@@ -720,6 +835,9 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
720 offset = sizeof(jbd2_journal_revoke_header_t); 835 offset = sizeof(jbd2_journal_revoke_header_t);
721 max = be32_to_cpu(header->r_count); 836 max = be32_to_cpu(header->r_count);
722 837
838 if (!jbd2_revoke_block_csum_verify(journal, header))
839 return -EINVAL;
840
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) 841 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
724 record_len = 8; 842 record_len = 8;
725 843
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 6973705d6a3d..f30b80b4ce8b 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -578,6 +578,7 @@ static void write_one_revoke_record(journal_t *journal,
578 struct jbd2_revoke_record_s *record, 578 struct jbd2_revoke_record_s *record,
579 int write_op) 579 int write_op)
580{ 580{
581 int csum_size = 0;
581 struct journal_head *descriptor; 582 struct journal_head *descriptor;
582 int offset; 583 int offset;
583 journal_header_t *header; 584 journal_header_t *header;
@@ -592,9 +593,13 @@ static void write_one_revoke_record(journal_t *journal,
592 descriptor = *descriptorp; 593 descriptor = *descriptorp;
593 offset = *offsetp; 594 offset = *offsetp;
594 595
596 /* Do we need to leave space at the end for a checksum? */
597 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
598 csum_size = sizeof(struct jbd2_journal_revoke_tail);
599
595 /* Make sure we have a descriptor with space left for the record */ 600 /* Make sure we have a descriptor with space left for the record */
596 if (descriptor) { 601 if (descriptor) {
597 if (offset == journal->j_blocksize) { 602 if (offset >= journal->j_blocksize - csum_size) {
598 flush_descriptor(journal, descriptor, offset, write_op); 603 flush_descriptor(journal, descriptor, offset, write_op);
599 descriptor = NULL; 604 descriptor = NULL;
600 } 605 }
@@ -631,6 +636,24 @@ static void write_one_revoke_record(journal_t *journal,
631 *offsetp = offset; 636 *offsetp = offset;
632} 637}
633 638
639static void jbd2_revoke_csum_set(journal_t *j,
640 struct journal_head *descriptor)
641{
642 struct jbd2_journal_revoke_tail *tail;
643 __u32 csum;
644
645 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
646 return;
647
648 tail = (struct jbd2_journal_revoke_tail *)
649 (jh2bh(descriptor)->b_data + j->j_blocksize -
650 sizeof(struct jbd2_journal_revoke_tail));
651 tail->r_checksum = 0;
652 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
653 j->j_blocksize);
654 tail->r_checksum = cpu_to_be32(csum);
655}
656
634/* 657/*
635 * Flush a revoke descriptor out to the journal. If we are aborting, 658 * Flush a revoke descriptor out to the journal. If we are aborting,
636 * this is a noop; otherwise we are generating a buffer which needs to 659 * this is a noop; otherwise we are generating a buffer which needs to
@@ -652,6 +675,8 @@ static void flush_descriptor(journal_t *journal,
652 675
653 header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; 676 header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
654 header->r_count = cpu_to_be32(offset); 677 header->r_count = cpu_to_be32(offset);
678 jbd2_revoke_csum_set(journal, descriptor);
679
655 set_buffer_jwrite(bh); 680 set_buffer_jwrite(bh);
656 BUFFER_TRACE(bh, "write"); 681 BUFFER_TRACE(bh, "write");
657 set_buffer_dirty(bh); 682 set_buffer_dirty(bh);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index ddcd3549c6c2..fb1ab9533b67 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -162,8 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
162 162
163alloc_transaction: 163alloc_transaction:
164 if (!journal->j_running_transaction) { 164 if (!journal->j_running_transaction) {
165 new_transaction = kmem_cache_alloc(transaction_cache, 165 new_transaction = kmem_cache_zalloc(transaction_cache,
166 gfp_mask | __GFP_ZERO); 166 gfp_mask);
167 if (!new_transaction) { 167 if (!new_transaction) {
168 /* 168 /*
169 * If __GFP_FS is not present, then we may be 169 * If __GFP_FS is not present, then we may be
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 55a0c1dceadf..44dca1f041c5 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -126,6 +126,10 @@ struct jffs2_sb_info {
126 struct jffs2_inodirty *wbuf_inodes; 126 struct jffs2_inodirty *wbuf_inodes;
127 struct rw_semaphore wbuf_sem; /* Protects the write buffer */ 127 struct rw_semaphore wbuf_sem; /* Protects the write buffer */
128 128
129 struct delayed_work wbuf_dwork; /* write-buffer write-out work */
130 int wbuf_queued; /* non-zero delayed work is queued */
131 spinlock_t wbuf_dwork_lock; /* protects wbuf_dwork and and wbuf_queued */
132
129 unsigned char *oobbuf; 133 unsigned char *oobbuf;
130 int oobavail; /* How many bytes are available for JFFS2 in OOB */ 134 int oobavail; /* How many bytes are available for JFFS2 in OOB */
131#endif 135#endif
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 1cd3aec9d9ae..bcd983d7e7f9 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -95,6 +95,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
95#define jffs2_ubivol(c) (0) 95#define jffs2_ubivol(c) (0)
96#define jffs2_ubivol_setup(c) (0) 96#define jffs2_ubivol_setup(c) (0)
97#define jffs2_ubivol_cleanup(c) do {} while (0) 97#define jffs2_ubivol_cleanup(c) do {} while (0)
98#define jffs2_dirty_trigger(c) do {} while (0)
98 99
99#else /* NAND and/or ECC'd NOR support present */ 100#else /* NAND and/or ECC'd NOR support present */
100 101
@@ -135,14 +136,10 @@ void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
135#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) 136#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
136int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); 137int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
137void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c); 138void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
139void jffs2_dirty_trigger(struct jffs2_sb_info *c);
138 140
139#endif /* WRITEBUFFER */ 141#endif /* WRITEBUFFER */
140 142
141static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
142{
143 OFNI_BS_2SFFJ(c)->s_dirt = 1;
144}
145
146/* background.c */ 143/* background.c */
147int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c); 144int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c);
148void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c); 145void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index f9916f312bd8..bc586f204228 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,21 +63,6 @@ static void jffs2_i_init_once(void *foo)
63 inode_init_once(&f->vfs_inode); 63 inode_init_once(&f->vfs_inode);
64} 64}
65 65
66static void jffs2_write_super(struct super_block *sb)
67{
68 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
69
70 lock_super(sb);
71 sb->s_dirt = 0;
72
73 if (!(sb->s_flags & MS_RDONLY)) {
74 jffs2_dbg(1, "%s()\n", __func__);
75 jffs2_flush_wbuf_gc(c, 0);
76 }
77
78 unlock_super(sb);
79}
80
81static const char *jffs2_compr_name(unsigned int compr) 66static const char *jffs2_compr_name(unsigned int compr)
82{ 67{
83 switch (compr) { 68 switch (compr) {
@@ -113,8 +98,6 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
113{ 98{
114 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 99 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
115 100
116 jffs2_write_super(sb);
117
118 mutex_lock(&c->alloc_sem); 101 mutex_lock(&c->alloc_sem);
119 jffs2_flush_wbuf_pad(c); 102 jffs2_flush_wbuf_pad(c);
120 mutex_unlock(&c->alloc_sem); 103 mutex_unlock(&c->alloc_sem);
@@ -251,7 +234,6 @@ static const struct super_operations jffs2_super_operations =
251 .alloc_inode = jffs2_alloc_inode, 234 .alloc_inode = jffs2_alloc_inode,
252 .destroy_inode =jffs2_destroy_inode, 235 .destroy_inode =jffs2_destroy_inode,
253 .put_super = jffs2_put_super, 236 .put_super = jffs2_put_super,
254 .write_super = jffs2_write_super,
255 .statfs = jffs2_statfs, 237 .statfs = jffs2_statfs,
256 .remount_fs = jffs2_remount_fs, 238 .remount_fs = jffs2_remount_fs,
257 .evict_inode = jffs2_evict_inode, 239 .evict_inode = jffs2_evict_inode,
@@ -319,9 +301,6 @@ static void jffs2_put_super (struct super_block *sb)
319 301
320 jffs2_dbg(2, "%s()\n", __func__); 302 jffs2_dbg(2, "%s()\n", __func__);
321 303
322 if (sb->s_dirt)
323 jffs2_write_super(sb);
324
325 mutex_lock(&c->alloc_sem); 304 mutex_lock(&c->alloc_sem);
326 jffs2_flush_wbuf_pad(c); 305 jffs2_flush_wbuf_pad(c);
327 mutex_unlock(&c->alloc_sem); 306 mutex_unlock(&c->alloc_sem);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 74d9be19df3f..6f4529d3697f 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -20,6 +20,7 @@
20#include <linux/mtd/nand.h> 20#include <linux/mtd/nand.h>
21#include <linux/jiffies.h> 21#include <linux/jiffies.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/writeback.h>
23 24
24#include "nodelist.h" 25#include "nodelist.h"
25 26
@@ -85,7 +86,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
85{ 86{
86 struct jffs2_inodirty *new; 87 struct jffs2_inodirty *new;
87 88
88 /* Mark the superblock dirty so that kupdated will flush... */ 89 /* Schedule delayed write-buffer write-out */
89 jffs2_dirty_trigger(c); 90 jffs2_dirty_trigger(c);
90 91
91 if (jffs2_wbuf_pending_for_ino(c, ino)) 92 if (jffs2_wbuf_pending_for_ino(c, ino))
@@ -1148,6 +1149,47 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
1148 return 1; 1149 return 1;
1149} 1150}
1150 1151
1152static struct jffs2_sb_info *work_to_sb(struct work_struct *work)
1153{
1154 struct delayed_work *dwork;
1155
1156 dwork = container_of(work, struct delayed_work, work);
1157 return container_of(dwork, struct jffs2_sb_info, wbuf_dwork);
1158}
1159
1160static void delayed_wbuf_sync(struct work_struct *work)
1161{
1162 struct jffs2_sb_info *c = work_to_sb(work);
1163 struct super_block *sb = OFNI_BS_2SFFJ(c);
1164
1165 spin_lock(&c->wbuf_dwork_lock);
1166 c->wbuf_queued = 0;
1167 spin_unlock(&c->wbuf_dwork_lock);
1168
1169 if (!(sb->s_flags & MS_RDONLY)) {
1170 jffs2_dbg(1, "%s()\n", __func__);
1171 jffs2_flush_wbuf_gc(c, 0);
1172 }
1173}
1174
1175void jffs2_dirty_trigger(struct jffs2_sb_info *c)
1176{
1177 struct super_block *sb = OFNI_BS_2SFFJ(c);
1178 unsigned long delay;
1179
1180 if (sb->s_flags & MS_RDONLY)
1181 return;
1182
1183 spin_lock(&c->wbuf_dwork_lock);
1184 if (!c->wbuf_queued) {
1185 jffs2_dbg(1, "%s()\n", __func__);
1186 delay = msecs_to_jiffies(dirty_writeback_interval * 10);
1187 queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay);
1188 c->wbuf_queued = 1;
1189 }
1190 spin_unlock(&c->wbuf_dwork_lock);
1191}
1192
1151int jffs2_nand_flash_setup(struct jffs2_sb_info *c) 1193int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
1152{ 1194{
1153 struct nand_ecclayout *oinfo = c->mtd->ecclayout; 1195 struct nand_ecclayout *oinfo = c->mtd->ecclayout;
@@ -1169,6 +1211,8 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
1169 1211
1170 /* Initialise write buffer */ 1212 /* Initialise write buffer */
1171 init_rwsem(&c->wbuf_sem); 1213 init_rwsem(&c->wbuf_sem);
1214 spin_lock_init(&c->wbuf_dwork_lock);
1215 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
1172 c->wbuf_pagesize = c->mtd->writesize; 1216 c->wbuf_pagesize = c->mtd->writesize;
1173 c->wbuf_ofs = 0xFFFFFFFF; 1217 c->wbuf_ofs = 0xFFFFFFFF;
1174 1218
@@ -1207,8 +1251,8 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
1207 1251
1208 /* Initialize write buffer */ 1252 /* Initialize write buffer */
1209 init_rwsem(&c->wbuf_sem); 1253 init_rwsem(&c->wbuf_sem);
1210 1254 spin_lock_init(&c->wbuf_dwork_lock);
1211 1255 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
1212 c->wbuf_pagesize = c->mtd->erasesize; 1256 c->wbuf_pagesize = c->mtd->erasesize;
1213 1257
1214 /* Find a suitable c->sector_size 1258 /* Find a suitable c->sector_size
@@ -1267,6 +1311,9 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
1267 1311
1268 /* Initialize write buffer */ 1312 /* Initialize write buffer */
1269 init_rwsem(&c->wbuf_sem); 1313 init_rwsem(&c->wbuf_sem);
1314 spin_lock_init(&c->wbuf_dwork_lock);
1315 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
1316
1270 c->wbuf_pagesize = c->mtd->writesize; 1317 c->wbuf_pagesize = c->mtd->writesize;
1271 c->wbuf_ofs = 0xFFFFFFFF; 1318 c->wbuf_ofs = 0xFFFFFFFF;
1272 1319
@@ -1299,6 +1346,8 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
1299 return 0; 1346 return 0;
1300 1347
1301 init_rwsem(&c->wbuf_sem); 1348 init_rwsem(&c->wbuf_sem);
1349 spin_lock_init(&c->wbuf_dwork_lock);
1350 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
1302 1351
1303 c->wbuf_pagesize = c->mtd->writesize; 1352 c->wbuf_pagesize = c->mtd->writesize;
1304 c->wbuf_ofs = 0xFFFFFFFF; 1353 c->wbuf_ofs = 0xFFFFFFFF;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 1ead0750cdbb..80938fda67e0 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -251,39 +251,40 @@ out_err:
251 return err; 251 return err;
252} 252}
253 253
254static int lockd_up_net(struct net *net) 254static int lockd_up_net(struct svc_serv *serv, struct net *net)
255{ 255{
256 struct lockd_net *ln = net_generic(net, lockd_net_id); 256 struct lockd_net *ln = net_generic(net, lockd_net_id);
257 struct svc_serv *serv = nlmsvc_rqst->rq_server;
258 int error; 257 int error;
259 258
260 if (ln->nlmsvc_users) 259 if (ln->nlmsvc_users++)
261 return 0; 260 return 0;
262 261
263 error = svc_rpcb_setup(serv, net); 262 error = svc_bind(serv, net);
264 if (error) 263 if (error)
265 goto err_rpcb; 264 goto err_bind;
266 265
267 error = make_socks(serv, net); 266 error = make_socks(serv, net);
268 if (error < 0) 267 if (error < 0)
269 goto err_socks; 268 goto err_socks;
269 dprintk("lockd_up_net: per-net data created; net=%p\n", net);
270 return 0; 270 return 0;
271 271
272err_socks: 272err_socks:
273 svc_rpcb_cleanup(serv, net); 273 svc_rpcb_cleanup(serv, net);
274err_rpcb: 274err_bind:
275 ln->nlmsvc_users--;
275 return error; 276 return error;
276} 277}
277 278
278static void lockd_down_net(struct net *net) 279static void lockd_down_net(struct svc_serv *serv, struct net *net)
279{ 280{
280 struct lockd_net *ln = net_generic(net, lockd_net_id); 281 struct lockd_net *ln = net_generic(net, lockd_net_id);
281 struct svc_serv *serv = nlmsvc_rqst->rq_server;
282 282
283 if (ln->nlmsvc_users) { 283 if (ln->nlmsvc_users) {
284 if (--ln->nlmsvc_users == 0) { 284 if (--ln->nlmsvc_users == 0) {
285 nlm_shutdown_hosts_net(net); 285 nlm_shutdown_hosts_net(net);
286 svc_shutdown_net(serv, net); 286 svc_shutdown_net(serv, net);
287 dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
287 } 288 }
288 } else { 289 } else {
289 printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n", 290 printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n",
@@ -292,21 +293,60 @@ static void lockd_down_net(struct net *net)
292 } 293 }
293} 294}
294 295
295/* 296static int lockd_start_svc(struct svc_serv *serv)
296 * Bring up the lockd process if it's not already up. 297{
297 */ 298 int error;
298int lockd_up(struct net *net) 299
300 if (nlmsvc_rqst)
301 return 0;
302
303 /*
304 * Create the kernel thread and wait for it to start.
305 */
306 nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
307 if (IS_ERR(nlmsvc_rqst)) {
308 error = PTR_ERR(nlmsvc_rqst);
309 printk(KERN_WARNING
310 "lockd_up: svc_rqst allocation failed, error=%d\n",
311 error);
312 goto out_rqst;
313 }
314
315 svc_sock_update_bufs(serv);
316 serv->sv_maxconn = nlm_max_connections;
317
318 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
319 if (IS_ERR(nlmsvc_task)) {
320 error = PTR_ERR(nlmsvc_task);
321 printk(KERN_WARNING
322 "lockd_up: kthread_run failed, error=%d\n", error);
323 goto out_task;
324 }
325 dprintk("lockd_up: service started\n");
326 return 0;
327
328out_task:
329 svc_exit_thread(nlmsvc_rqst);
330 nlmsvc_task = NULL;
331out_rqst:
332 nlmsvc_rqst = NULL;
333 return error;
334}
335
336static struct svc_serv *lockd_create_svc(void)
299{ 337{
300 struct svc_serv *serv; 338 struct svc_serv *serv;
301 int error = 0;
302 339
303 mutex_lock(&nlmsvc_mutex);
304 /* 340 /*
305 * Check whether we're already up and running. 341 * Check whether we're already up and running.
306 */ 342 */
307 if (nlmsvc_rqst) { 343 if (nlmsvc_rqst) {
308 error = lockd_up_net(net); 344 /*
309 goto out; 345 * Note: increase service usage, because later in case of error
346 * svc_destroy() will be called.
347 */
348 svc_get(nlmsvc_rqst->rq_server);
349 return nlmsvc_rqst->rq_server;
310 } 350 }
311 351
312 /* 352 /*
@@ -317,59 +357,53 @@ int lockd_up(struct net *net)
317 printk(KERN_WARNING 357 printk(KERN_WARNING
318 "lockd_up: no pid, %d users??\n", nlmsvc_users); 358 "lockd_up: no pid, %d users??\n", nlmsvc_users);
319 359
320 error = -ENOMEM;
321 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); 360 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
322 if (!serv) { 361 if (!serv) {
323 printk(KERN_WARNING "lockd_up: create service failed\n"); 362 printk(KERN_WARNING "lockd_up: create service failed\n");
324 goto out; 363 return ERR_PTR(-ENOMEM);
325 } 364 }
365 dprintk("lockd_up: service created\n");
366 return serv;
367}
326 368
327 error = make_socks(serv, net); 369/*
328 if (error < 0) 370 * Bring up the lockd process if it's not already up.
329 goto destroy_and_out; 371 */
372int lockd_up(struct net *net)
373{
374 struct svc_serv *serv;
375 int error;
330 376
331 /* 377 mutex_lock(&nlmsvc_mutex);
332 * Create the kernel thread and wait for it to start. 378
333 */ 379 serv = lockd_create_svc();
334 nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); 380 if (IS_ERR(serv)) {
335 if (IS_ERR(nlmsvc_rqst)) { 381 error = PTR_ERR(serv);
336 error = PTR_ERR(nlmsvc_rqst); 382 goto err_create;
337 nlmsvc_rqst = NULL;
338 printk(KERN_WARNING
339 "lockd_up: svc_rqst allocation failed, error=%d\n",
340 error);
341 goto destroy_and_out;
342 } 383 }
343 384
344 svc_sock_update_bufs(serv); 385 error = lockd_up_net(serv, net);
345 serv->sv_maxconn = nlm_max_connections; 386 if (error < 0)
387 goto err_net;
346 388
347 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); 389 error = lockd_start_svc(serv);
348 if (IS_ERR(nlmsvc_task)) { 390 if (error < 0)
349 error = PTR_ERR(nlmsvc_task); 391 goto err_start;
350 svc_exit_thread(nlmsvc_rqst);
351 nlmsvc_task = NULL;
352 nlmsvc_rqst = NULL;
353 printk(KERN_WARNING
354 "lockd_up: kthread_run failed, error=%d\n", error);
355 goto destroy_and_out;
356 }
357 392
393 nlmsvc_users++;
358 /* 394 /*
359 * Note: svc_serv structures have an initial use count of 1, 395 * Note: svc_serv structures have an initial use count of 1,
360 * so we exit through here on both success and failure. 396 * so we exit through here on both success and failure.
361 */ 397 */
362destroy_and_out: 398err_net:
363 svc_destroy(serv); 399 svc_destroy(serv);
364out: 400err_create:
365 if (!error) {
366 struct lockd_net *ln = net_generic(net, lockd_net_id);
367
368 ln->nlmsvc_users++;
369 nlmsvc_users++;
370 }
371 mutex_unlock(&nlmsvc_mutex); 401 mutex_unlock(&nlmsvc_mutex);
372 return error; 402 return error;
403
404err_start:
405 lockd_down_net(serv, net);
406 goto err_net;
373} 407}
374EXPORT_SYMBOL_GPL(lockd_up); 408EXPORT_SYMBOL_GPL(lockd_up);
375 409
@@ -380,11 +414,10 @@ void
380lockd_down(struct net *net) 414lockd_down(struct net *net)
381{ 415{
382 mutex_lock(&nlmsvc_mutex); 416 mutex_lock(&nlmsvc_mutex);
417 lockd_down_net(nlmsvc_rqst->rq_server, net);
383 if (nlmsvc_users) { 418 if (nlmsvc_users) {
384 if (--nlmsvc_users) { 419 if (--nlmsvc_users)
385 lockd_down_net(net);
386 goto out; 420 goto out;
387 }
388 } else { 421 } else {
389 printk(KERN_ERR "lockd_down: no users! task=%p\n", 422 printk(KERN_ERR "lockd_down: no users! task=%p\n",
390 nlmsvc_task); 423 nlmsvc_task);
@@ -396,7 +429,9 @@ lockd_down(struct net *net)
396 BUG(); 429 BUG();
397 } 430 }
398 kthread_stop(nlmsvc_task); 431 kthread_stop(nlmsvc_task);
432 dprintk("lockd_down: service stopped\n");
399 svc_exit_thread(nlmsvc_rqst); 433 svc_exit_thread(nlmsvc_rqst);
434 dprintk("lockd_down: service destroyed\n");
400 nlmsvc_task = NULL; 435 nlmsvc_task = NULL;
401 nlmsvc_rqst = NULL; 436 nlmsvc_rqst = NULL;
402out: 437out:
diff --git a/fs/locks.c b/fs/locks.c
index 4f441e46cef4..814c51d0de47 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1636,12 +1636,13 @@ EXPORT_SYMBOL(flock_lock_file_wait);
1636SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) 1636SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
1637{ 1637{
1638 struct file *filp; 1638 struct file *filp;
1639 int fput_needed;
1639 struct file_lock *lock; 1640 struct file_lock *lock;
1640 int can_sleep, unlock; 1641 int can_sleep, unlock;
1641 int error; 1642 int error;
1642 1643
1643 error = -EBADF; 1644 error = -EBADF;
1644 filp = fget(fd); 1645 filp = fget_light(fd, &fput_needed);
1645 if (!filp) 1646 if (!filp)
1646 goto out; 1647 goto out;
1647 1648
@@ -1674,7 +1675,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
1674 locks_free_lock(lock); 1675 locks_free_lock(lock);
1675 1676
1676 out_putf: 1677 out_putf:
1677 fput(filp); 1678 fput_light(filp, fput_needed);
1678 out: 1679 out:
1679 return error; 1680 return error;
1680} 1681}
diff --git a/fs/namei.c b/fs/namei.c
index c651f02c9fec..7d694194024a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -449,7 +449,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
449 mntget(nd->path.mnt); 449 mntget(nd->path.mnt);
450 450
451 rcu_read_unlock(); 451 rcu_read_unlock();
452 br_read_unlock(vfsmount_lock); 452 br_read_unlock(&vfsmount_lock);
453 nd->flags &= ~LOOKUP_RCU; 453 nd->flags &= ~LOOKUP_RCU;
454 return 0; 454 return 0;
455 455
@@ -507,14 +507,14 @@ static int complete_walk(struct nameidata *nd)
507 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { 507 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
508 spin_unlock(&dentry->d_lock); 508 spin_unlock(&dentry->d_lock);
509 rcu_read_unlock(); 509 rcu_read_unlock();
510 br_read_unlock(vfsmount_lock); 510 br_read_unlock(&vfsmount_lock);
511 return -ECHILD; 511 return -ECHILD;
512 } 512 }
513 BUG_ON(nd->inode != dentry->d_inode); 513 BUG_ON(nd->inode != dentry->d_inode);
514 spin_unlock(&dentry->d_lock); 514 spin_unlock(&dentry->d_lock);
515 mntget(nd->path.mnt); 515 mntget(nd->path.mnt);
516 rcu_read_unlock(); 516 rcu_read_unlock();
517 br_read_unlock(vfsmount_lock); 517 br_read_unlock(&vfsmount_lock);
518 } 518 }
519 519
520 if (likely(!(nd->flags & LOOKUP_JUMPED))) 520 if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -681,15 +681,15 @@ int follow_up(struct path *path)
681 struct mount *parent; 681 struct mount *parent;
682 struct dentry *mountpoint; 682 struct dentry *mountpoint;
683 683
684 br_read_lock(vfsmount_lock); 684 br_read_lock(&vfsmount_lock);
685 parent = mnt->mnt_parent; 685 parent = mnt->mnt_parent;
686 if (&parent->mnt == path->mnt) { 686 if (&parent->mnt == path->mnt) {
687 br_read_unlock(vfsmount_lock); 687 br_read_unlock(&vfsmount_lock);
688 return 0; 688 return 0;
689 } 689 }
690 mntget(&parent->mnt); 690 mntget(&parent->mnt);
691 mountpoint = dget(mnt->mnt_mountpoint); 691 mountpoint = dget(mnt->mnt_mountpoint);
692 br_read_unlock(vfsmount_lock); 692 br_read_unlock(&vfsmount_lock);
693 dput(path->dentry); 693 dput(path->dentry);
694 path->dentry = mountpoint; 694 path->dentry = mountpoint;
695 mntput(path->mnt); 695 mntput(path->mnt);
@@ -947,7 +947,7 @@ failed:
947 if (!(nd->flags & LOOKUP_ROOT)) 947 if (!(nd->flags & LOOKUP_ROOT))
948 nd->root.mnt = NULL; 948 nd->root.mnt = NULL;
949 rcu_read_unlock(); 949 rcu_read_unlock();
950 br_read_unlock(vfsmount_lock); 950 br_read_unlock(&vfsmount_lock);
951 return -ECHILD; 951 return -ECHILD;
952} 952}
953 953
@@ -1125,8 +1125,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
1125 * small and for now I'd prefer to have fast path as straight as possible. 1125 * small and for now I'd prefer to have fast path as straight as possible.
1126 * It _is_ time-critical. 1126 * It _is_ time-critical.
1127 */ 1127 */
1128static int do_lookup(struct nameidata *nd, struct qstr *name, 1128static int lookup_fast(struct nameidata *nd, struct qstr *name,
1129 struct path *path, struct inode **inode) 1129 struct path *path, struct inode **inode)
1130{ 1130{
1131 struct vfsmount *mnt = nd->path.mnt; 1131 struct vfsmount *mnt = nd->path.mnt;
1132 struct dentry *dentry, *parent = nd->path.dentry; 1132 struct dentry *dentry, *parent = nd->path.dentry;
@@ -1208,7 +1208,7 @@ unlazy:
1208 goto need_lookup; 1208 goto need_lookup;
1209 } 1209 }
1210 } 1210 }
1211done: 1211
1212 path->mnt = mnt; 1212 path->mnt = mnt;
1213 path->dentry = dentry; 1213 path->dentry = dentry;
1214 err = follow_managed(path, nd->flags); 1214 err = follow_managed(path, nd->flags);
@@ -1222,6 +1222,17 @@ done:
1222 return 0; 1222 return 0;
1223 1223
1224need_lookup: 1224need_lookup:
1225 return 1;
1226}
1227
1228/* Fast lookup failed, do it the slow way */
1229static int lookup_slow(struct nameidata *nd, struct qstr *name,
1230 struct path *path)
1231{
1232 struct dentry *dentry, *parent;
1233 int err;
1234
1235 parent = nd->path.dentry;
1225 BUG_ON(nd->inode != parent->d_inode); 1236 BUG_ON(nd->inode != parent->d_inode);
1226 1237
1227 mutex_lock(&parent->d_inode->i_mutex); 1238 mutex_lock(&parent->d_inode->i_mutex);
@@ -1229,7 +1240,16 @@ need_lookup:
1229 mutex_unlock(&parent->d_inode->i_mutex); 1240 mutex_unlock(&parent->d_inode->i_mutex);
1230 if (IS_ERR(dentry)) 1241 if (IS_ERR(dentry))
1231 return PTR_ERR(dentry); 1242 return PTR_ERR(dentry);
1232 goto done; 1243 path->mnt = nd->path.mnt;
1244 path->dentry = dentry;
1245 err = follow_managed(path, nd->flags);
1246 if (unlikely(err < 0)) {
1247 path_put_conditional(path, nd);
1248 return err;
1249 }
1250 if (err)
1251 nd->flags |= LOOKUP_JUMPED;
1252 return 0;
1233} 1253}
1234 1254
1235static inline int may_lookup(struct nameidata *nd) 1255static inline int may_lookup(struct nameidata *nd)
@@ -1265,7 +1285,7 @@ static void terminate_walk(struct nameidata *nd)
1265 if (!(nd->flags & LOOKUP_ROOT)) 1285 if (!(nd->flags & LOOKUP_ROOT))
1266 nd->root.mnt = NULL; 1286 nd->root.mnt = NULL;
1267 rcu_read_unlock(); 1287 rcu_read_unlock();
1268 br_read_unlock(vfsmount_lock); 1288 br_read_unlock(&vfsmount_lock);
1269 } 1289 }
1270} 1290}
1271 1291
@@ -1301,21 +1321,26 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
1301 */ 1321 */
1302 if (unlikely(type != LAST_NORM)) 1322 if (unlikely(type != LAST_NORM))
1303 return handle_dots(nd, type); 1323 return handle_dots(nd, type);
1304 err = do_lookup(nd, name, path, &inode); 1324 err = lookup_fast(nd, name, path, &inode);
1305 if (unlikely(err)) { 1325 if (unlikely(err)) {
1306 terminate_walk(nd); 1326 if (err < 0)
1307 return err; 1327 goto out_err;
1308 } 1328
1309 if (!inode) { 1329 err = lookup_slow(nd, name, path);
1310 path_to_nameidata(path, nd); 1330 if (err < 0)
1311 terminate_walk(nd); 1331 goto out_err;
1312 return -ENOENT; 1332
1333 inode = path->dentry->d_inode;
1313 } 1334 }
1335 err = -ENOENT;
1336 if (!inode)
1337 goto out_path_put;
1338
1314 if (should_follow_link(inode, follow)) { 1339 if (should_follow_link(inode, follow)) {
1315 if (nd->flags & LOOKUP_RCU) { 1340 if (nd->flags & LOOKUP_RCU) {
1316 if (unlikely(unlazy_walk(nd, path->dentry))) { 1341 if (unlikely(unlazy_walk(nd, path->dentry))) {
1317 terminate_walk(nd); 1342 err = -ECHILD;
1318 return -ECHILD; 1343 goto out_err;
1319 } 1344 }
1320 } 1345 }
1321 BUG_ON(inode != path->dentry->d_inode); 1346 BUG_ON(inode != path->dentry->d_inode);
@@ -1324,6 +1349,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
1324 path_to_nameidata(path, nd); 1349 path_to_nameidata(path, nd);
1325 nd->inode = inode; 1350 nd->inode = inode;
1326 return 0; 1351 return 0;
1352
1353out_path_put:
1354 path_to_nameidata(path, nd);
1355out_err:
1356 terminate_walk(nd);
1357 return err;
1327} 1358}
1328 1359
1329/* 1360/*
@@ -1620,7 +1651,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1620 nd->path = nd->root; 1651 nd->path = nd->root;
1621 nd->inode = inode; 1652 nd->inode = inode;
1622 if (flags & LOOKUP_RCU) { 1653 if (flags & LOOKUP_RCU) {
1623 br_read_lock(vfsmount_lock); 1654 br_read_lock(&vfsmount_lock);
1624 rcu_read_lock(); 1655 rcu_read_lock();
1625 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1656 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1626 } else { 1657 } else {
@@ -1633,7 +1664,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1633 1664
1634 if (*name=='/') { 1665 if (*name=='/') {
1635 if (flags & LOOKUP_RCU) { 1666 if (flags & LOOKUP_RCU) {
1636 br_read_lock(vfsmount_lock); 1667 br_read_lock(&vfsmount_lock);
1637 rcu_read_lock(); 1668 rcu_read_lock();
1638 set_root_rcu(nd); 1669 set_root_rcu(nd);
1639 } else { 1670 } else {
@@ -1646,7 +1677,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1646 struct fs_struct *fs = current->fs; 1677 struct fs_struct *fs = current->fs;
1647 unsigned seq; 1678 unsigned seq;
1648 1679
1649 br_read_lock(vfsmount_lock); 1680 br_read_lock(&vfsmount_lock);
1650 rcu_read_lock(); 1681 rcu_read_lock();
1651 1682
1652 do { 1683 do {
@@ -1682,7 +1713,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1682 if (fput_needed) 1713 if (fput_needed)
1683 *fp = file; 1714 *fp = file;
1684 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1715 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1685 br_read_lock(vfsmount_lock); 1716 br_read_lock(&vfsmount_lock);
1686 rcu_read_lock(); 1717 rcu_read_lock();
1687 } else { 1718 } else {
1688 path_get(&file->f_path); 1719 path_get(&file->f_path);
@@ -2169,6 +2200,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2169 int want_write = 0; 2200 int want_write = 0;
2170 int acc_mode = op->acc_mode; 2201 int acc_mode = op->acc_mode;
2171 struct file *filp; 2202 struct file *filp;
2203 struct inode *inode;
2204 int symlink_ok = 0;
2205 struct path save_parent = { .dentry = NULL, .mnt = NULL };
2206 bool retried = false;
2172 int error; 2207 int error;
2173 2208
2174 nd->flags &= ~LOOKUP_PARENT; 2209 nd->flags &= ~LOOKUP_PARENT;
@@ -2200,30 +2235,23 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2200 } 2235 }
2201 2236
2202 if (!(open_flag & O_CREAT)) { 2237 if (!(open_flag & O_CREAT)) {
2203 int symlink_ok = 0;
2204 if (nd->last.name[nd->last.len]) 2238 if (nd->last.name[nd->last.len])
2205 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; 2239 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2206 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) 2240 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
2207 symlink_ok = 1; 2241 symlink_ok = 1;
2208 /* we _can_ be in RCU mode here */ 2242 /* we _can_ be in RCU mode here */
2209 error = walk_component(nd, path, &nd->last, LAST_NORM, 2243 error = lookup_fast(nd, &nd->last, path, &inode);
2210 !symlink_ok); 2244 if (unlikely(error)) {
2211 if (error < 0) 2245 if (error < 0)
2212 return ERR_PTR(error); 2246 goto exit;
2213 if (error) /* symlink */
2214 return NULL;
2215 /* sayonara */
2216 error = complete_walk(nd);
2217 if (error)
2218 return ERR_PTR(error);
2219 2247
2220 error = -ENOTDIR; 2248 error = lookup_slow(nd, &nd->last, path);
2221 if (nd->flags & LOOKUP_DIRECTORY) { 2249 if (error < 0)
2222 if (!nd->inode->i_op->lookup)
2223 goto exit; 2250 goto exit;
2251
2252 inode = path->dentry->d_inode;
2224 } 2253 }
2225 audit_inode(pathname, nd->path.dentry); 2254 goto finish_lookup;
2226 goto ok;
2227 } 2255 }
2228 2256
2229 /* create side of things */ 2257 /* create side of things */
@@ -2241,6 +2269,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2241 if (nd->last.name[nd->last.len]) 2269 if (nd->last.name[nd->last.len])
2242 goto exit; 2270 goto exit;
2243 2271
2272retry_lookup:
2244 mutex_lock(&dir->d_inode->i_mutex); 2273 mutex_lock(&dir->d_inode->i_mutex);
2245 2274
2246 dentry = lookup_hash(nd); 2275 dentry = lookup_hash(nd);
@@ -2302,22 +2331,49 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2302 if (error) 2331 if (error)
2303 nd->flags |= LOOKUP_JUMPED; 2332 nd->flags |= LOOKUP_JUMPED;
2304 2333
2334 BUG_ON(nd->flags & LOOKUP_RCU);
2335 inode = path->dentry->d_inode;
2336finish_lookup:
2337 /* we _can_ be in RCU mode here */
2305 error = -ENOENT; 2338 error = -ENOENT;
2306 if (!path->dentry->d_inode) 2339 if (!inode) {
2307 goto exit_dput; 2340 path_to_nameidata(path, nd);
2341 goto exit;
2342 }
2308 2343
2309 if (path->dentry->d_inode->i_op->follow_link) 2344 if (should_follow_link(inode, !symlink_ok)) {
2345 if (nd->flags & LOOKUP_RCU) {
2346 if (unlikely(unlazy_walk(nd, path->dentry))) {
2347 error = -ECHILD;
2348 goto exit;
2349 }
2350 }
2351 BUG_ON(inode != path->dentry->d_inode);
2310 return NULL; 2352 return NULL;
2353 }
2311 2354
2312 path_to_nameidata(path, nd); 2355 if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
2313 nd->inode = path->dentry->d_inode; 2356 path_to_nameidata(path, nd);
2357 } else {
2358 save_parent.dentry = nd->path.dentry;
2359 save_parent.mnt = mntget(path->mnt);
2360 nd->path.dentry = path->dentry;
2361
2362 }
2363 nd->inode = inode;
2314 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ 2364 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
2315 error = complete_walk(nd); 2365 error = complete_walk(nd);
2316 if (error) 2366 if (error) {
2367 path_put(&save_parent);
2317 return ERR_PTR(error); 2368 return ERR_PTR(error);
2369 }
2318 error = -EISDIR; 2370 error = -EISDIR;
2319 if (S_ISDIR(nd->inode->i_mode)) 2371 if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
2372 goto exit;
2373 error = -ENOTDIR;
2374 if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
2320 goto exit; 2375 goto exit;
2376 audit_inode(pathname, nd->path.dentry);
2321ok: 2377ok:
2322 if (!S_ISREG(nd->inode->i_mode)) 2378 if (!S_ISREG(nd->inode->i_mode))
2323 will_truncate = 0; 2379 will_truncate = 0;
@@ -2333,6 +2389,20 @@ common:
2333 if (error) 2389 if (error)
2334 goto exit; 2390 goto exit;
2335 filp = nameidata_to_filp(nd); 2391 filp = nameidata_to_filp(nd);
2392 if (filp == ERR_PTR(-EOPENSTALE) && save_parent.dentry && !retried) {
2393 BUG_ON(save_parent.dentry != dir);
2394 path_put(&nd->path);
2395 nd->path = save_parent;
2396 nd->inode = dir->d_inode;
2397 save_parent.mnt = NULL;
2398 save_parent.dentry = NULL;
2399 if (want_write) {
2400 mnt_drop_write(nd->path.mnt);
2401 want_write = 0;
2402 }
2403 retried = true;
2404 goto retry_lookup;
2405 }
2336 if (!IS_ERR(filp)) { 2406 if (!IS_ERR(filp)) {
2337 error = ima_file_check(filp, op->acc_mode); 2407 error = ima_file_check(filp, op->acc_mode);
2338 if (error) { 2408 if (error) {
@@ -2352,7 +2422,8 @@ common:
2352out: 2422out:
2353 if (want_write) 2423 if (want_write)
2354 mnt_drop_write(nd->path.mnt); 2424 mnt_drop_write(nd->path.mnt);
2355 path_put(&nd->path); 2425 path_put(&save_parent);
2426 terminate_walk(nd);
2356 return filp; 2427 return filp;
2357 2428
2358exit_mutex_unlock: 2429exit_mutex_unlock:
@@ -2415,6 +2486,12 @@ out:
2415 if (base) 2486 if (base)
2416 fput(base); 2487 fput(base);
2417 release_open_intent(nd); 2488 release_open_intent(nd);
2489 if (filp == ERR_PTR(-EOPENSTALE)) {
2490 if (flags & LOOKUP_RCU)
2491 filp = ERR_PTR(-ECHILD);
2492 else
2493 filp = ERR_PTR(-ESTALE);
2494 }
2418 return filp; 2495 return filp;
2419 2496
2420out_filp: 2497out_filp:
diff --git a/fs/namespace.c b/fs/namespace.c
index e6081996c9a2..1e4a5fe3d7b7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -397,7 +397,7 @@ static int mnt_make_readonly(struct mount *mnt)
397{ 397{
398 int ret = 0; 398 int ret = 0;
399 399
400 br_write_lock(vfsmount_lock); 400 br_write_lock(&vfsmount_lock);
401 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; 401 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
402 /* 402 /*
403 * After storing MNT_WRITE_HOLD, we'll read the counters. This store 403 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -431,15 +431,15 @@ static int mnt_make_readonly(struct mount *mnt)
431 */ 431 */
432 smp_wmb(); 432 smp_wmb();
433 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; 433 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
434 br_write_unlock(vfsmount_lock); 434 br_write_unlock(&vfsmount_lock);
435 return ret; 435 return ret;
436} 436}
437 437
438static void __mnt_unmake_readonly(struct mount *mnt) 438static void __mnt_unmake_readonly(struct mount *mnt)
439{ 439{
440 br_write_lock(vfsmount_lock); 440 br_write_lock(&vfsmount_lock);
441 mnt->mnt.mnt_flags &= ~MNT_READONLY; 441 mnt->mnt.mnt_flags &= ~MNT_READONLY;
442 br_write_unlock(vfsmount_lock); 442 br_write_unlock(&vfsmount_lock);
443} 443}
444 444
445int sb_prepare_remount_readonly(struct super_block *sb) 445int sb_prepare_remount_readonly(struct super_block *sb)
@@ -451,7 +451,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
451 if (atomic_long_read(&sb->s_remove_count)) 451 if (atomic_long_read(&sb->s_remove_count))
452 return -EBUSY; 452 return -EBUSY;
453 453
454 br_write_lock(vfsmount_lock); 454 br_write_lock(&vfsmount_lock);
455 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { 455 list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
456 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { 456 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
457 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; 457 mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -473,7 +473,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
473 if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) 473 if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
474 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; 474 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
475 } 475 }
476 br_write_unlock(vfsmount_lock); 476 br_write_unlock(&vfsmount_lock);
477 477
478 return err; 478 return err;
479} 479}
@@ -522,14 +522,14 @@ struct vfsmount *lookup_mnt(struct path *path)
522{ 522{
523 struct mount *child_mnt; 523 struct mount *child_mnt;
524 524
525 br_read_lock(vfsmount_lock); 525 br_read_lock(&vfsmount_lock);
526 child_mnt = __lookup_mnt(path->mnt, path->dentry, 1); 526 child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
527 if (child_mnt) { 527 if (child_mnt) {
528 mnt_add_count(child_mnt, 1); 528 mnt_add_count(child_mnt, 1);
529 br_read_unlock(vfsmount_lock); 529 br_read_unlock(&vfsmount_lock);
530 return &child_mnt->mnt; 530 return &child_mnt->mnt;
531 } else { 531 } else {
532 br_read_unlock(vfsmount_lock); 532 br_read_unlock(&vfsmount_lock);
533 return NULL; 533 return NULL;
534 } 534 }
535} 535}
@@ -714,9 +714,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
714 mnt->mnt.mnt_sb = root->d_sb; 714 mnt->mnt.mnt_sb = root->d_sb;
715 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 715 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
716 mnt->mnt_parent = mnt; 716 mnt->mnt_parent = mnt;
717 br_write_lock(vfsmount_lock); 717 br_write_lock(&vfsmount_lock);
718 list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); 718 list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
719 br_write_unlock(vfsmount_lock); 719 br_write_unlock(&vfsmount_lock);
720 return &mnt->mnt; 720 return &mnt->mnt;
721} 721}
722EXPORT_SYMBOL_GPL(vfs_kern_mount); 722EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -745,9 +745,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
745 mnt->mnt.mnt_root = dget(root); 745 mnt->mnt.mnt_root = dget(root);
746 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 746 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
747 mnt->mnt_parent = mnt; 747 mnt->mnt_parent = mnt;
748 br_write_lock(vfsmount_lock); 748 br_write_lock(&vfsmount_lock);
749 list_add_tail(&mnt->mnt_instance, &sb->s_mounts); 749 list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
750 br_write_unlock(vfsmount_lock); 750 br_write_unlock(&vfsmount_lock);
751 751
752 if (flag & CL_SLAVE) { 752 if (flag & CL_SLAVE) {
753 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 753 list_add(&mnt->mnt_slave, &old->mnt_slave_list);
@@ -803,35 +803,36 @@ static void mntput_no_expire(struct mount *mnt)
803{ 803{
804put_again: 804put_again:
805#ifdef CONFIG_SMP 805#ifdef CONFIG_SMP
806 br_read_lock(vfsmount_lock); 806 br_read_lock(&vfsmount_lock);
807 if (likely(atomic_read(&mnt->mnt_longterm))) { 807 if (likely(atomic_read(&mnt->mnt_longterm))) {
808 mnt_add_count(mnt, -1); 808 mnt_add_count(mnt, -1);
809 br_read_unlock(vfsmount_lock); 809 br_read_unlock(&vfsmount_lock);
810 return; 810 return;
811 } 811 }
812 br_read_unlock(vfsmount_lock); 812 br_read_unlock(&vfsmount_lock);
813 813
814 br_write_lock(vfsmount_lock); 814 br_write_lock(&vfsmount_lock);
815 mnt_add_count(mnt, -1); 815 mnt_add_count(mnt, -1);
816 if (mnt_get_count(mnt)) { 816 if (mnt_get_count(mnt)) {
817 br_write_unlock(vfsmount_lock); 817 br_write_unlock(&vfsmount_lock);
818 return; 818 return;
819 } 819 }
820#else 820#else
821 mnt_add_count(mnt, -1); 821 mnt_add_count(mnt, -1);
822 if (likely(mnt_get_count(mnt))) 822 if (likely(mnt_get_count(mnt)))
823 return; 823 return;
824 br_write_lock(vfsmount_lock); 824 br_write_lock(&vfsmount_lock);
825#endif 825#endif
826 if (unlikely(mnt->mnt_pinned)) { 826 if (unlikely(mnt->mnt_pinned)) {
827 mnt_add_count(mnt, mnt->mnt_pinned + 1); 827 mnt_add_count(mnt, mnt->mnt_pinned + 1);
828 mnt->mnt_pinned = 0; 828 mnt->mnt_pinned = 0;
829 br_write_unlock(vfsmount_lock); 829 br_write_unlock(&vfsmount_lock);
830 acct_auto_close_mnt(&mnt->mnt); 830 acct_auto_close_mnt(&mnt->mnt);
831 goto put_again; 831 goto put_again;
832 } 832 }
833
833 list_del(&mnt->mnt_instance); 834 list_del(&mnt->mnt_instance);
834 br_write_unlock(vfsmount_lock); 835 br_write_unlock(&vfsmount_lock);
835 mntfree(mnt); 836 mntfree(mnt);
836} 837}
837 838
@@ -857,21 +858,21 @@ EXPORT_SYMBOL(mntget);
857 858
858void mnt_pin(struct vfsmount *mnt) 859void mnt_pin(struct vfsmount *mnt)
859{ 860{
860 br_write_lock(vfsmount_lock); 861 br_write_lock(&vfsmount_lock);
861 real_mount(mnt)->mnt_pinned++; 862 real_mount(mnt)->mnt_pinned++;
862 br_write_unlock(vfsmount_lock); 863 br_write_unlock(&vfsmount_lock);
863} 864}
864EXPORT_SYMBOL(mnt_pin); 865EXPORT_SYMBOL(mnt_pin);
865 866
866void mnt_unpin(struct vfsmount *m) 867void mnt_unpin(struct vfsmount *m)
867{ 868{
868 struct mount *mnt = real_mount(m); 869 struct mount *mnt = real_mount(m);
869 br_write_lock(vfsmount_lock); 870 br_write_lock(&vfsmount_lock);
870 if (mnt->mnt_pinned) { 871 if (mnt->mnt_pinned) {
871 mnt_add_count(mnt, 1); 872 mnt_add_count(mnt, 1);
872 mnt->mnt_pinned--; 873 mnt->mnt_pinned--;
873 } 874 }
874 br_write_unlock(vfsmount_lock); 875 br_write_unlock(&vfsmount_lock);
875} 876}
876EXPORT_SYMBOL(mnt_unpin); 877EXPORT_SYMBOL(mnt_unpin);
877 878
@@ -988,12 +989,12 @@ int may_umount_tree(struct vfsmount *m)
988 BUG_ON(!m); 989 BUG_ON(!m);
989 990
990 /* write lock needed for mnt_get_count */ 991 /* write lock needed for mnt_get_count */
991 br_write_lock(vfsmount_lock); 992 br_write_lock(&vfsmount_lock);
992 for (p = mnt; p; p = next_mnt(p, mnt)) { 993 for (p = mnt; p; p = next_mnt(p, mnt)) {
993 actual_refs += mnt_get_count(p); 994 actual_refs += mnt_get_count(p);
994 minimum_refs += 2; 995 minimum_refs += 2;
995 } 996 }
996 br_write_unlock(vfsmount_lock); 997 br_write_unlock(&vfsmount_lock);
997 998
998 if (actual_refs > minimum_refs) 999 if (actual_refs > minimum_refs)
999 return 0; 1000 return 0;
@@ -1020,10 +1021,10 @@ int may_umount(struct vfsmount *mnt)
1020{ 1021{
1021 int ret = 1; 1022 int ret = 1;
1022 down_read(&namespace_sem); 1023 down_read(&namespace_sem);
1023 br_write_lock(vfsmount_lock); 1024 br_write_lock(&vfsmount_lock);
1024 if (propagate_mount_busy(real_mount(mnt), 2)) 1025 if (propagate_mount_busy(real_mount(mnt), 2))
1025 ret = 0; 1026 ret = 0;
1026 br_write_unlock(vfsmount_lock); 1027 br_write_unlock(&vfsmount_lock);
1027 up_read(&namespace_sem); 1028 up_read(&namespace_sem);
1028 return ret; 1029 return ret;
1029} 1030}
@@ -1040,13 +1041,13 @@ void release_mounts(struct list_head *head)
1040 struct dentry *dentry; 1041 struct dentry *dentry;
1041 struct mount *m; 1042 struct mount *m;
1042 1043
1043 br_write_lock(vfsmount_lock); 1044 br_write_lock(&vfsmount_lock);
1044 dentry = mnt->mnt_mountpoint; 1045 dentry = mnt->mnt_mountpoint;
1045 m = mnt->mnt_parent; 1046 m = mnt->mnt_parent;
1046 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 1047 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1047 mnt->mnt_parent = mnt; 1048 mnt->mnt_parent = mnt;
1048 m->mnt_ghosts--; 1049 m->mnt_ghosts--;
1049 br_write_unlock(vfsmount_lock); 1050 br_write_unlock(&vfsmount_lock);
1050 dput(dentry); 1051 dput(dentry);
1051 mntput(&m->mnt); 1052 mntput(&m->mnt);
1052 } 1053 }
@@ -1073,8 +1074,9 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
1073 list_del_init(&p->mnt_expire); 1074 list_del_init(&p->mnt_expire);
1074 list_del_init(&p->mnt_list); 1075 list_del_init(&p->mnt_list);
1075 __touch_mnt_namespace(p->mnt_ns); 1076 __touch_mnt_namespace(p->mnt_ns);
1077 if (p->mnt_ns)
1078 __mnt_make_shortterm(p);
1076 p->mnt_ns = NULL; 1079 p->mnt_ns = NULL;
1077 __mnt_make_shortterm(p);
1078 list_del_init(&p->mnt_child); 1080 list_del_init(&p->mnt_child);
1079 if (mnt_has_parent(p)) { 1081 if (mnt_has_parent(p)) {
1080 p->mnt_parent->mnt_ghosts++; 1082 p->mnt_parent->mnt_ghosts++;
@@ -1112,12 +1114,12 @@ static int do_umount(struct mount *mnt, int flags)
1112 * probably don't strictly need the lock here if we examined 1114 * probably don't strictly need the lock here if we examined
1113 * all race cases, but it's a slowpath. 1115 * all race cases, but it's a slowpath.
1114 */ 1116 */
1115 br_write_lock(vfsmount_lock); 1117 br_write_lock(&vfsmount_lock);
1116 if (mnt_get_count(mnt) != 2) { 1118 if (mnt_get_count(mnt) != 2) {
1117 br_write_unlock(vfsmount_lock); 1119 br_write_unlock(&vfsmount_lock);
1118 return -EBUSY; 1120 return -EBUSY;
1119 } 1121 }
1120 br_write_unlock(vfsmount_lock); 1122 br_write_unlock(&vfsmount_lock);
1121 1123
1122 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1124 if (!xchg(&mnt->mnt_expiry_mark, 1))
1123 return -EAGAIN; 1125 return -EAGAIN;
@@ -1159,7 +1161,7 @@ static int do_umount(struct mount *mnt, int flags)
1159 } 1161 }
1160 1162
1161 down_write(&namespace_sem); 1163 down_write(&namespace_sem);
1162 br_write_lock(vfsmount_lock); 1164 br_write_lock(&vfsmount_lock);
1163 event++; 1165 event++;
1164 1166
1165 if (!(flags & MNT_DETACH)) 1167 if (!(flags & MNT_DETACH))
@@ -1171,7 +1173,7 @@ static int do_umount(struct mount *mnt, int flags)
1171 umount_tree(mnt, 1, &umount_list); 1173 umount_tree(mnt, 1, &umount_list);
1172 retval = 0; 1174 retval = 0;
1173 } 1175 }
1174 br_write_unlock(vfsmount_lock); 1176 br_write_unlock(&vfsmount_lock);
1175 up_write(&namespace_sem); 1177 up_write(&namespace_sem);
1176 release_mounts(&umount_list); 1178 release_mounts(&umount_list);
1177 return retval; 1179 return retval;
@@ -1286,19 +1288,19 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1286 q = clone_mnt(p, p->mnt.mnt_root, flag); 1288 q = clone_mnt(p, p->mnt.mnt_root, flag);
1287 if (!q) 1289 if (!q)
1288 goto Enomem; 1290 goto Enomem;
1289 br_write_lock(vfsmount_lock); 1291 br_write_lock(&vfsmount_lock);
1290 list_add_tail(&q->mnt_list, &res->mnt_list); 1292 list_add_tail(&q->mnt_list, &res->mnt_list);
1291 attach_mnt(q, &path); 1293 attach_mnt(q, &path);
1292 br_write_unlock(vfsmount_lock); 1294 br_write_unlock(&vfsmount_lock);
1293 } 1295 }
1294 } 1296 }
1295 return res; 1297 return res;
1296Enomem: 1298Enomem:
1297 if (res) { 1299 if (res) {
1298 LIST_HEAD(umount_list); 1300 LIST_HEAD(umount_list);
1299 br_write_lock(vfsmount_lock); 1301 br_write_lock(&vfsmount_lock);
1300 umount_tree(res, 0, &umount_list); 1302 umount_tree(res, 0, &umount_list);
1301 br_write_unlock(vfsmount_lock); 1303 br_write_unlock(&vfsmount_lock);
1302 release_mounts(&umount_list); 1304 release_mounts(&umount_list);
1303 } 1305 }
1304 return NULL; 1306 return NULL;
@@ -1318,9 +1320,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
1318{ 1320{
1319 LIST_HEAD(umount_list); 1321 LIST_HEAD(umount_list);
1320 down_write(&namespace_sem); 1322 down_write(&namespace_sem);
1321 br_write_lock(vfsmount_lock); 1323 br_write_lock(&vfsmount_lock);
1322 umount_tree(real_mount(mnt), 0, &umount_list); 1324 umount_tree(real_mount(mnt), 0, &umount_list);
1323 br_write_unlock(vfsmount_lock); 1325 br_write_unlock(&vfsmount_lock);
1324 up_write(&namespace_sem); 1326 up_write(&namespace_sem);
1325 release_mounts(&umount_list); 1327 release_mounts(&umount_list);
1326} 1328}
@@ -1448,7 +1450,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
1448 if (err) 1450 if (err)
1449 goto out_cleanup_ids; 1451 goto out_cleanup_ids;
1450 1452
1451 br_write_lock(vfsmount_lock); 1453 br_write_lock(&vfsmount_lock);
1452 1454
1453 if (IS_MNT_SHARED(dest_mnt)) { 1455 if (IS_MNT_SHARED(dest_mnt)) {
1454 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1456 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1467,7 +1469,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
1467 list_del_init(&child->mnt_hash); 1469 list_del_init(&child->mnt_hash);
1468 commit_tree(child); 1470 commit_tree(child);
1469 } 1471 }
1470 br_write_unlock(vfsmount_lock); 1472 br_write_unlock(&vfsmount_lock);
1471 1473
1472 return 0; 1474 return 0;
1473 1475
@@ -1565,10 +1567,10 @@ static int do_change_type(struct path *path, int flag)
1565 goto out_unlock; 1567 goto out_unlock;
1566 } 1568 }
1567 1569
1568 br_write_lock(vfsmount_lock); 1570 br_write_lock(&vfsmount_lock);
1569 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 1571 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
1570 change_mnt_propagation(m, type); 1572 change_mnt_propagation(m, type);
1571 br_write_unlock(vfsmount_lock); 1573 br_write_unlock(&vfsmount_lock);
1572 1574
1573 out_unlock: 1575 out_unlock:
1574 up_write(&namespace_sem); 1576 up_write(&namespace_sem);
@@ -1617,9 +1619,9 @@ static int do_loopback(struct path *path, char *old_name,
1617 1619
1618 err = graft_tree(mnt, path); 1620 err = graft_tree(mnt, path);
1619 if (err) { 1621 if (err) {
1620 br_write_lock(vfsmount_lock); 1622 br_write_lock(&vfsmount_lock);
1621 umount_tree(mnt, 0, &umount_list); 1623 umount_tree(mnt, 0, &umount_list);
1622 br_write_unlock(vfsmount_lock); 1624 br_write_unlock(&vfsmount_lock);
1623 } 1625 }
1624out2: 1626out2:
1625 unlock_mount(path); 1627 unlock_mount(path);
@@ -1677,16 +1679,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1677 else 1679 else
1678 err = do_remount_sb(sb, flags, data, 0); 1680 err = do_remount_sb(sb, flags, data, 0);
1679 if (!err) { 1681 if (!err) {
1680 br_write_lock(vfsmount_lock); 1682 br_write_lock(&vfsmount_lock);
1681 mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; 1683 mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
1682 mnt->mnt.mnt_flags = mnt_flags; 1684 mnt->mnt.mnt_flags = mnt_flags;
1683 br_write_unlock(vfsmount_lock); 1685 br_write_unlock(&vfsmount_lock);
1684 } 1686 }
1685 up_write(&sb->s_umount); 1687 up_write(&sb->s_umount);
1686 if (!err) { 1688 if (!err) {
1687 br_write_lock(vfsmount_lock); 1689 br_write_lock(&vfsmount_lock);
1688 touch_mnt_namespace(mnt->mnt_ns); 1690 touch_mnt_namespace(mnt->mnt_ns);
1689 br_write_unlock(vfsmount_lock); 1691 br_write_unlock(&vfsmount_lock);
1690 } 1692 }
1691 return err; 1693 return err;
1692} 1694}
@@ -1893,9 +1895,9 @@ fail:
1893 /* remove m from any expiration list it may be on */ 1895 /* remove m from any expiration list it may be on */
1894 if (!list_empty(&mnt->mnt_expire)) { 1896 if (!list_empty(&mnt->mnt_expire)) {
1895 down_write(&namespace_sem); 1897 down_write(&namespace_sem);
1896 br_write_lock(vfsmount_lock); 1898 br_write_lock(&vfsmount_lock);
1897 list_del_init(&mnt->mnt_expire); 1899 list_del_init(&mnt->mnt_expire);
1898 br_write_unlock(vfsmount_lock); 1900 br_write_unlock(&vfsmount_lock);
1899 up_write(&namespace_sem); 1901 up_write(&namespace_sem);
1900 } 1902 }
1901 mntput(m); 1903 mntput(m);
@@ -1911,11 +1913,11 @@ fail:
1911void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) 1913void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
1912{ 1914{
1913 down_write(&namespace_sem); 1915 down_write(&namespace_sem);
1914 br_write_lock(vfsmount_lock); 1916 br_write_lock(&vfsmount_lock);
1915 1917
1916 list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); 1918 list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
1917 1919
1918 br_write_unlock(vfsmount_lock); 1920 br_write_unlock(&vfsmount_lock);
1919 up_write(&namespace_sem); 1921 up_write(&namespace_sem);
1920} 1922}
1921EXPORT_SYMBOL(mnt_set_expiry); 1923EXPORT_SYMBOL(mnt_set_expiry);
@@ -1935,7 +1937,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1935 return; 1937 return;
1936 1938
1937 down_write(&namespace_sem); 1939 down_write(&namespace_sem);
1938 br_write_lock(vfsmount_lock); 1940 br_write_lock(&vfsmount_lock);
1939 1941
1940 /* extract from the expiration list every vfsmount that matches the 1942 /* extract from the expiration list every vfsmount that matches the
1941 * following criteria: 1943 * following criteria:
@@ -1954,7 +1956,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1954 touch_mnt_namespace(mnt->mnt_ns); 1956 touch_mnt_namespace(mnt->mnt_ns);
1955 umount_tree(mnt, 1, &umounts); 1957 umount_tree(mnt, 1, &umounts);
1956 } 1958 }
1957 br_write_unlock(vfsmount_lock); 1959 br_write_unlock(&vfsmount_lock);
1958 up_write(&namespace_sem); 1960 up_write(&namespace_sem);
1959 1961
1960 release_mounts(&umounts); 1962 release_mounts(&umounts);
@@ -2218,9 +2220,9 @@ void mnt_make_shortterm(struct vfsmount *m)
2218 struct mount *mnt = real_mount(m); 2220 struct mount *mnt = real_mount(m);
2219 if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) 2221 if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
2220 return; 2222 return;
2221 br_write_lock(vfsmount_lock); 2223 br_write_lock(&vfsmount_lock);
2222 atomic_dec(&mnt->mnt_longterm); 2224 atomic_dec(&mnt->mnt_longterm);
2223 br_write_unlock(vfsmount_lock); 2225 br_write_unlock(&vfsmount_lock);
2224#endif 2226#endif
2225} 2227}
2226 2228
@@ -2250,9 +2252,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2250 return ERR_PTR(-ENOMEM); 2252 return ERR_PTR(-ENOMEM);
2251 } 2253 }
2252 new_ns->root = new; 2254 new_ns->root = new;
2253 br_write_lock(vfsmount_lock); 2255 br_write_lock(&vfsmount_lock);
2254 list_add_tail(&new_ns->list, &new->mnt_list); 2256 list_add_tail(&new_ns->list, &new->mnt_list);
2255 br_write_unlock(vfsmount_lock); 2257 br_write_unlock(&vfsmount_lock);
2256 2258
2257 /* 2259 /*
2258 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 2260 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2416,9 +2418,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
2416int path_is_under(struct path *path1, struct path *path2) 2418int path_is_under(struct path *path1, struct path *path2)
2417{ 2419{
2418 int res; 2420 int res;
2419 br_read_lock(vfsmount_lock); 2421 br_read_lock(&vfsmount_lock);
2420 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); 2422 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
2421 br_read_unlock(vfsmount_lock); 2423 br_read_unlock(&vfsmount_lock);
2422 return res; 2424 return res;
2423} 2425}
2424EXPORT_SYMBOL(path_is_under); 2426EXPORT_SYMBOL(path_is_under);
@@ -2505,7 +2507,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2505 /* make sure we can reach put_old from new_root */ 2507 /* make sure we can reach put_old from new_root */
2506 if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new)) 2508 if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
2507 goto out4; 2509 goto out4;
2508 br_write_lock(vfsmount_lock); 2510 br_write_lock(&vfsmount_lock);
2509 detach_mnt(new_mnt, &parent_path); 2511 detach_mnt(new_mnt, &parent_path);
2510 detach_mnt(root_mnt, &root_parent); 2512 detach_mnt(root_mnt, &root_parent);
2511 /* mount old root on put_old */ 2513 /* mount old root on put_old */
@@ -2513,7 +2515,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2513 /* mount new_root on / */ 2515 /* mount new_root on / */
2514 attach_mnt(new_mnt, &root_parent); 2516 attach_mnt(new_mnt, &root_parent);
2515 touch_mnt_namespace(current->nsproxy->mnt_ns); 2517 touch_mnt_namespace(current->nsproxy->mnt_ns);
2516 br_write_unlock(vfsmount_lock); 2518 br_write_unlock(&vfsmount_lock);
2517 chroot_fs_refs(&root, &new); 2519 chroot_fs_refs(&root, &new);
2518 error = 0; 2520 error = 0;
2519out4: 2521out4:
@@ -2576,7 +2578,7 @@ void __init mnt_init(void)
2576 for (u = 0; u < HASH_SIZE; u++) 2578 for (u = 0; u < HASH_SIZE; u++)
2577 INIT_LIST_HEAD(&mount_hashtable[u]); 2579 INIT_LIST_HEAD(&mount_hashtable[u]);
2578 2580
2579 br_lock_init(vfsmount_lock); 2581 br_lock_init(&vfsmount_lock);
2580 2582
2581 err = sysfs_init(); 2583 err = sysfs_init();
2582 if (err) 2584 if (err)
@@ -2596,9 +2598,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
2596 if (!atomic_dec_and_test(&ns->count)) 2598 if (!atomic_dec_and_test(&ns->count))
2597 return; 2599 return;
2598 down_write(&namespace_sem); 2600 down_write(&namespace_sem);
2599 br_write_lock(vfsmount_lock); 2601 br_write_lock(&vfsmount_lock);
2600 umount_tree(ns->root, 0, &umount_list); 2602 umount_tree(ns->root, 0, &umount_list);
2601 br_write_unlock(vfsmount_lock); 2603 br_write_unlock(&vfsmount_lock);
2602 up_write(&namespace_sem); 2604 up_write(&namespace_sem);
2603 release_mounts(&umount_list); 2605 release_mounts(&umount_list);
2604 kfree(ns); 2606 kfree(ns);
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 3ff5fcc1528f..122e260247f5 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -221,6 +221,10 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
221 221
222 already_written = 0; 222 already_written = 0;
223 223
224 errno = file_update_time(file);
225 if (errno)
226 goto outrel;
227
224 bouncebuffer = vmalloc(bufsize); 228 bouncebuffer = vmalloc(bufsize);
225 if (!bouncebuffer) { 229 if (!bouncebuffer) {
226 errno = -EIO; /* -ENOMEM */ 230 errno = -EIO; /* -ENOMEM */
@@ -252,8 +256,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
252 } 256 }
253 vfree(bouncebuffer); 257 vfree(bouncebuffer);
254 258
255 file_update_time(file);
256
257 *ppos = pos; 259 *ppos = pos;
258 260
259 if (pos > i_size_read(inode)) { 261 if (pos > i_size_read(inode)) {
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index 4af803f13516..54cc0cdb3dcb 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -23,17 +23,17 @@ struct ncp_mount_data_kernel {
23 unsigned long flags; /* NCP_MOUNT_* flags */ 23 unsigned long flags; /* NCP_MOUNT_* flags */
24 unsigned int int_flags; /* internal flags */ 24 unsigned int int_flags; /* internal flags */
25#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 25#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001
26 __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */ 26 uid_t mounted_uid; /* Who may umount() this filesystem? */
27 struct pid *wdog_pid; /* Who cares for our watchdog packets? */ 27 struct pid *wdog_pid; /* Who cares for our watchdog packets? */
28 unsigned int ncp_fd; /* The socket to the ncp port */ 28 unsigned int ncp_fd; /* The socket to the ncp port */
29 unsigned int time_out; /* How long should I wait after 29 unsigned int time_out; /* How long should I wait after
30 sending a NCP request? */ 30 sending a NCP request? */
31 unsigned int retry_count; /* And how often should I retry? */ 31 unsigned int retry_count; /* And how often should I retry? */
32 unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; 32 unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
33 __kernel_uid32_t uid; 33 uid_t uid;
34 __kernel_gid32_t gid; 34 gid_t gid;
35 __kernel_mode_t file_mode; 35 umode_t file_mode;
36 __kernel_mode_t dir_mode; 36 umode_t dir_mode;
37 int info_fd; 37 int info_fd;
38}; 38};
39 39
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index eb95f5091c1a..970659daa323 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -17,6 +17,7 @@
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/sunrpc/svcauth_gss.h> 18#include <linux/sunrpc/svcauth_gss.h>
19#include <linux/sunrpc/bc_xprt.h> 19#include <linux/sunrpc/bc_xprt.h>
20#include <linux/nsproxy.h>
20 21
21#include <net/inet_sock.h> 22#include <net/inet_sock.h>
22 23
@@ -253,6 +254,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
253 char svc_name[12]; 254 char svc_name[12];
254 int ret = 0; 255 int ret = 0;
255 int minorversion_setup; 256 int minorversion_setup;
257 struct net *net = current->nsproxy->net_ns;
256 258
257 mutex_lock(&nfs_callback_mutex); 259 mutex_lock(&nfs_callback_mutex);
258 if (cb_info->users++ || cb_info->task != NULL) { 260 if (cb_info->users++ || cb_info->task != NULL) {
@@ -265,6 +267,12 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
265 goto out_err; 267 goto out_err;
266 } 268 }
267 269
270 ret = svc_bind(serv, net);
271 if (ret < 0) {
272 printk(KERN_WARNING "NFS: bind callback service failed\n");
273 goto out_err;
274 }
275
268 minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, 276 minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion,
269 serv, xprt, &rqstp, &callback_svc); 277 serv, xprt, &rqstp, &callback_svc);
270 if (!minorversion_setup) { 278 if (!minorversion_setup) {
@@ -306,6 +314,8 @@ out_err:
306 dprintk("NFS: Couldn't create callback socket or server thread; " 314 dprintk("NFS: Couldn't create callback socket or server thread; "
307 "err = %d\n", ret); 315 "err = %d\n", ret);
308 cb_info->users--; 316 cb_info->users--;
317 if (serv)
318 svc_shutdown_net(serv, net);
309 goto out; 319 goto out;
310} 320}
311 321
@@ -320,6 +330,7 @@ void nfs_callback_down(int minorversion)
320 cb_info->users--; 330 cb_info->users--;
321 if (cb_info->users == 0 && cb_info->task != NULL) { 331 if (cb_info->users == 0 && cb_info->task != NULL) {
322 kthread_stop(cb_info->task); 332 kthread_stop(cb_info->task);
333 svc_shutdown_net(cb_info->serv, current->nsproxy->net_ns);
323 svc_exit_thread(cb_info->rqst); 334 svc_exit_thread(cb_info->rqst);
324 cb_info->serv = NULL; 335 cb_info->serv = NULL;
325 cb_info->rqst = NULL; 336 cb_info->rqst = NULL;
@@ -332,7 +343,7 @@ void nfs_callback_down(int minorversion)
332int 343int
333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) 344check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
334{ 345{
335 char *p = svc_gss_principal(rqstp); 346 char *p = rqstp->rq_cred.cr_principal;
336 347
337 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) 348 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
338 return 1; 349 return 1;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0989a2099688..f430057ff3b3 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1354,10 +1354,10 @@ out:
1354} 1354}
1355 1355
1356#ifdef CONFIG_NFS_V4 1356#ifdef CONFIG_NFS_V4
1357static int nfs_open_revalidate(struct dentry *, struct nameidata *); 1357static int nfs4_lookup_revalidate(struct dentry *, struct nameidata *);
1358 1358
1359const struct dentry_operations nfs4_dentry_operations = { 1359const struct dentry_operations nfs4_dentry_operations = {
1360 .d_revalidate = nfs_open_revalidate, 1360 .d_revalidate = nfs4_lookup_revalidate,
1361 .d_delete = nfs_dentry_delete, 1361 .d_delete = nfs_dentry_delete,
1362 .d_iput = nfs_dentry_iput, 1362 .d_iput = nfs_dentry_iput,
1363 .d_automount = nfs_d_automount, 1363 .d_automount = nfs_d_automount,
@@ -1519,13 +1519,11 @@ no_open:
1519 return nfs_lookup(dir, dentry, nd); 1519 return nfs_lookup(dir, dentry, nd);
1520} 1520}
1521 1521
1522static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) 1522static int nfs4_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
1523{ 1523{
1524 struct dentry *parent = NULL; 1524 struct dentry *parent = NULL;
1525 struct inode *inode; 1525 struct inode *inode;
1526 struct inode *dir; 1526 struct inode *dir;
1527 struct nfs_open_context *ctx;
1528 struct iattr attr;
1529 int openflags, ret = 0; 1527 int openflags, ret = 0;
1530 1528
1531 if (nd->flags & LOOKUP_RCU) 1529 if (nd->flags & LOOKUP_RCU)
@@ -1554,57 +1552,13 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1554 /* We cannot do exclusive creation on a positive dentry */ 1552 /* We cannot do exclusive creation on a positive dentry */
1555 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) 1553 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1556 goto no_open_dput; 1554 goto no_open_dput;
1557 /* We can't create new files here */
1558 openflags &= ~(O_CREAT|O_EXCL);
1559
1560 ctx = create_nfs_open_context(dentry, openflags);
1561 ret = PTR_ERR(ctx);
1562 if (IS_ERR(ctx))
1563 goto out;
1564 1555
1565 attr.ia_valid = ATTR_OPEN; 1556 /* Let f_op->open() actually open (and revalidate) the file */
1566 if (openflags & O_TRUNC) { 1557 ret = 1;
1567 attr.ia_valid |= ATTR_SIZE;
1568 attr.ia_size = 0;
1569 nfs_wb_all(inode);
1570 }
1571
1572 /*
1573 * Note: we're not holding inode->i_mutex and so may be racing with
1574 * operations that change the directory. We therefore save the
1575 * change attribute *before* we do the RPC call.
1576 */
1577 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
1578 if (IS_ERR(inode)) {
1579 ret = PTR_ERR(inode);
1580 switch (ret) {
1581 case -EPERM:
1582 case -EACCES:
1583 case -EDQUOT:
1584 case -ENOSPC:
1585 case -EROFS:
1586 goto out_put_ctx;
1587 default:
1588 goto out_drop;
1589 }
1590 }
1591 iput(inode);
1592 if (inode != dentry->d_inode)
1593 goto out_drop;
1594 1558
1595 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1596 ret = nfs_intent_set_file(nd, ctx);
1597 if (ret >= 0)
1598 ret = 1;
1599out: 1559out:
1600 dput(parent); 1560 dput(parent);
1601 return ret; 1561 return ret;
1602out_drop:
1603 d_drop(dentry);
1604 ret = 0;
1605out_put_ctx:
1606 put_nfs_open_context(ctx);
1607 goto out;
1608 1562
1609no_open_dput: 1563no_open_dput:
1610 dput(parent); 1564 dput(parent);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 56311ca5f9f8..a6708e6b438d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -879,12 +879,81 @@ const struct file_operations nfs_file_operations = {
879static int 879static int
880nfs4_file_open(struct inode *inode, struct file *filp) 880nfs4_file_open(struct inode *inode, struct file *filp)
881{ 881{
882 struct nfs_open_context *ctx;
883 struct dentry *dentry = filp->f_path.dentry;
884 struct dentry *parent = NULL;
885 struct inode *dir;
886 unsigned openflags = filp->f_flags;
887 struct iattr attr;
888 int err;
889
890 BUG_ON(inode != dentry->d_inode);
882 /* 891 /*
883 * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to 892 * If no cached dentry exists or if it's negative, NFSv4 handled the
884 * this point, then something is very wrong 893 * opens in ->lookup() or ->create().
894 *
895 * We only get this far for a cached positive dentry. We skipped
896 * revalidation, so handle it here by dropping the dentry and returning
897 * -EOPENSTALE. The VFS will retry the lookup/create/open.
885 */ 898 */
886 dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp); 899
887 return -ENOTDIR; 900 dprintk("NFS: open file(%s/%s)\n",
901 dentry->d_parent->d_name.name,
902 dentry->d_name.name);
903
904 if ((openflags & O_ACCMODE) == 3)
905 openflags--;
906
907 /* We can't create new files here */
908 openflags &= ~(O_CREAT|O_EXCL);
909
910 parent = dget_parent(dentry);
911 dir = parent->d_inode;
912
913 ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
914 err = PTR_ERR(ctx);
915 if (IS_ERR(ctx))
916 goto out;
917
918 attr.ia_valid = ATTR_OPEN;
919 if (openflags & O_TRUNC) {
920 attr.ia_valid |= ATTR_SIZE;
921 attr.ia_size = 0;
922 nfs_wb_all(inode);
923 }
924
925 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
926 if (IS_ERR(inode)) {
927 err = PTR_ERR(inode);
928 switch (err) {
929 case -EPERM:
930 case -EACCES:
931 case -EDQUOT:
932 case -ENOSPC:
933 case -EROFS:
934 goto out_put_ctx;
935 default:
936 goto out_drop;
937 }
938 }
939 iput(inode);
940 if (inode != dentry->d_inode)
941 goto out_drop;
942
943 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
944 nfs_file_set_open_context(filp, ctx);
945 err = 0;
946
947out_put_ctx:
948 put_nfs_open_context(ctx);
949out:
950 dput(parent);
951 return err;
952
953out_drop:
954 d_drop(dentry);
955 err = -EOPENSTALE;
956 goto out_put_ctx;
888} 957}
889 958
890const struct file_operations nfs4_file_operations = { 959const struct file_operations nfs4_file_operations = {
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 204438cc914e..34a10d78b839 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -11,7 +11,7 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
11 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; 11 struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
12 12
13 for (f = exp->ex_flavors; f < end; f++) { 13 for (f = exp->ex_flavors; f < end; f++) {
14 if (f->pseudoflavor == rqstp->rq_flavor) 14 if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
15 return f->flags; 15 return f->flags;
16 } 16 }
17 return exp->ex_flags; 17 return exp->ex_flags;
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index dcb52b884519..ba233499b9a5 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -706,7 +706,7 @@ static struct cache_head *svc_export_alloc(void)
706 return NULL; 706 return NULL;
707} 707}
708 708
709struct cache_detail svc_export_cache_template = { 709static struct cache_detail svc_export_cache_template = {
710 .owner = THIS_MODULE, 710 .owner = THIS_MODULE,
711 .hash_size = EXPORT_HASHMAX, 711 .hash_size = EXPORT_HASHMAX,
712 .name = "nfsd.export", 712 .name = "nfsd.export",
@@ -904,13 +904,13 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
904 return 0; 904 return 0;
905 /* ip-address based client; check sec= export option: */ 905 /* ip-address based client; check sec= export option: */
906 for (f = exp->ex_flavors; f < end; f++) { 906 for (f = exp->ex_flavors; f < end; f++) {
907 if (f->pseudoflavor == rqstp->rq_flavor) 907 if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
908 return 0; 908 return 0;
909 } 909 }
910 /* defaults in absence of sec= options: */ 910 /* defaults in absence of sec= options: */
911 if (exp->ex_nflavors == 0) { 911 if (exp->ex_nflavors == 0) {
912 if (rqstp->rq_flavor == RPC_AUTH_NULL || 912 if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL ||
913 rqstp->rq_flavor == RPC_AUTH_UNIX) 913 rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)
914 return 0; 914 return 0;
915 } 915 }
916 return nfserr_wrongsec; 916 return nfserr_wrongsec;
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index 9559ce468732..e6c38159622f 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -58,6 +58,7 @@ static int nfsd_inject_set(void *op_ptr, u64 val)
58 58
59static int nfsd_inject_get(void *data, u64 *val) 59static int nfsd_inject_get(void *data, u64 *val)
60{ 60{
61 *val = 0;
61 return 0; 62 return 0;
62} 63}
63 64
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c8e9f637153a..a5fd6b982f27 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -650,9 +650,10 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
650 struct rpc_clnt *client; 650 struct rpc_clnt *client;
651 651
652 if (clp->cl_minorversion == 0) { 652 if (clp->cl_minorversion == 0) {
653 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 653 if (!clp->cl_cred.cr_principal &&
654 (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
654 return -EINVAL; 655 return -EINVAL;
655 args.client_name = clp->cl_principal; 656 args.client_name = clp->cl_cred.cr_principal;
656 args.prognumber = conn->cb_prog, 657 args.prognumber = conn->cb_prog,
657 args.protocol = XPRT_TRANSPORT_TCP; 658 args.protocol = XPRT_TRANSPORT_TCP;
658 args.authflavor = clp->cl_flavor; 659 args.authflavor = clp->cl_flavor;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 286a7f8f2024..dae36f1dee95 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -605,7 +605,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel
605static __be32 605static __be32
606do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) 606do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id)
607{ 607{
608 if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS) 608 if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
609 if (numeric_name_to_id(rqstp, type, name, namelen, id)) 609 if (numeric_name_to_id(rqstp, type, name, namelen, id))
610 return 0; 610 return 0;
611 /* 611 /*
@@ -618,7 +618,7 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u
618static int 618static int
619do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) 619do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
620{ 620{
621 if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS) 621 if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
622 return sprintf(name, "%u", id); 622 return sprintf(name, "%u", id);
623 return idmap_id_to_name(rqstp, type, id, name); 623 return idmap_id_to_name(rqstp, type, id, name);
624} 624}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index ed3f9206a0ee..5ff0b7b9fc08 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -570,7 +570,7 @@ static ssize_t
570cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) 570cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
571{ 571{
572 struct cld_upcall *tmp, *cup; 572 struct cld_upcall *tmp, *cup;
573 struct cld_msg *cmsg = (struct cld_msg *)src; 573 struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
574 uint32_t xid; 574 uint32_t xid;
575 struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, 575 struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
576 nfsd_net_id); 576 nfsd_net_id);
@@ -1029,7 +1029,7 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
1029 return ret; 1029 return ret;
1030} 1030}
1031 1031
1032struct notifier_block nfsd4_cld_block = { 1032static struct notifier_block nfsd4_cld_block = {
1033 .notifier_call = rpc_pipefs_event, 1033 .notifier_call = rpc_pipefs_event,
1034}; 1034};
1035 1035
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 03f82c0bc35d..8fdc9ec5c5d3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -42,6 +42,7 @@
42#include <linux/sunrpc/clnt.h> 42#include <linux/sunrpc/clnt.h>
43#include "xdr4.h" 43#include "xdr4.h"
44#include "vfs.h" 44#include "vfs.h"
45#include "current_stateid.h"
45 46
46#define NFSDDBG_FACILITY NFSDDBG_PROC 47#define NFSDDBG_FACILITY NFSDDBG_PROC
47 48
@@ -447,37 +448,69 @@ static struct list_head close_lru;
447 * 448 *
448 * which we should reject. 449 * which we should reject.
449 */ 450 */
450static void 451static unsigned int
451set_access(unsigned int *access, unsigned long bmap) { 452bmap_to_share_mode(unsigned long bmap) {
452 int i; 453 int i;
454 unsigned int access = 0;
453 455
454 *access = 0;
455 for (i = 1; i < 4; i++) { 456 for (i = 1; i < 4; i++) {
456 if (test_bit(i, &bmap)) 457 if (test_bit(i, &bmap))
457 *access |= i; 458 access |= i;
458 }
459}
460
461static void
462set_deny(unsigned int *deny, unsigned long bmap) {
463 int i;
464
465 *deny = 0;
466 for (i = 0; i < 4; i++) {
467 if (test_bit(i, &bmap))
468 *deny |= i ;
469 } 459 }
460 return access;
470} 461}
471 462
472static int 463static bool
473test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { 464test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
474 unsigned int access, deny; 465 unsigned int access, deny;
475 466
476 set_access(&access, stp->st_access_bmap); 467 access = bmap_to_share_mode(stp->st_access_bmap);
477 set_deny(&deny, stp->st_deny_bmap); 468 deny = bmap_to_share_mode(stp->st_deny_bmap);
478 if ((access & open->op_share_deny) || (deny & open->op_share_access)) 469 if ((access & open->op_share_deny) || (deny & open->op_share_access))
479 return 0; 470 return false;
480 return 1; 471 return true;
472}
473
474/* set share access for a given stateid */
475static inline void
476set_access(u32 access, struct nfs4_ol_stateid *stp)
477{
478 __set_bit(access, &stp->st_access_bmap);
479}
480
481/* clear share access for a given stateid */
482static inline void
483clear_access(u32 access, struct nfs4_ol_stateid *stp)
484{
485 __clear_bit(access, &stp->st_access_bmap);
486}
487
488/* test whether a given stateid has access */
489static inline bool
490test_access(u32 access, struct nfs4_ol_stateid *stp)
491{
492 return test_bit(access, &stp->st_access_bmap);
493}
494
495/* set share deny for a given stateid */
496static inline void
497set_deny(u32 access, struct nfs4_ol_stateid *stp)
498{
499 __set_bit(access, &stp->st_deny_bmap);
500}
501
502/* clear share deny for a given stateid */
503static inline void
504clear_deny(u32 access, struct nfs4_ol_stateid *stp)
505{
506 __clear_bit(access, &stp->st_deny_bmap);
507}
508
509/* test whether a given stateid is denying specific access */
510static inline bool
511test_deny(u32 access, struct nfs4_ol_stateid *stp)
512{
513 return test_bit(access, &stp->st_deny_bmap);
481} 514}
482 515
483static int nfs4_access_to_omode(u32 access) 516static int nfs4_access_to_omode(u32 access)
@@ -493,6 +526,20 @@ static int nfs4_access_to_omode(u32 access)
493 BUG(); 526 BUG();
494} 527}
495 528
529/* release all access and file references for a given stateid */
530static void
531release_all_access(struct nfs4_ol_stateid *stp)
532{
533 int i;
534
535 for (i = 1; i < 4; i++) {
536 if (test_access(i, stp))
537 nfs4_file_put_access(stp->st_file,
538 nfs4_access_to_omode(i));
539 clear_access(i, stp);
540 }
541}
542
496static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) 543static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
497{ 544{
498 list_del(&stp->st_perfile); 545 list_del(&stp->st_perfile);
@@ -501,16 +548,7 @@ static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
501 548
502static void close_generic_stateid(struct nfs4_ol_stateid *stp) 549static void close_generic_stateid(struct nfs4_ol_stateid *stp)
503{ 550{
504 int i; 551 release_all_access(stp);
505
506 if (stp->st_access_bmap) {
507 for (i = 1; i < 4; i++) {
508 if (test_bit(i, &stp->st_access_bmap))
509 nfs4_file_put_access(stp->st_file,
510 nfs4_access_to_omode(i));
511 __clear_bit(i, &stp->st_access_bmap);
512 }
513 }
514 put_nfs4_file(stp->st_file); 552 put_nfs4_file(stp->st_file);
515 stp->st_file = NULL; 553 stp->st_file = NULL;
516} 554}
@@ -885,7 +923,7 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
885 struct nfsd4_session *new; 923 struct nfsd4_session *new;
886 struct nfsd4_channel_attrs *fchan = &cses->fore_channel; 924 struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
887 int numslots, slotsize; 925 int numslots, slotsize;
888 int status; 926 __be32 status;
889 int idx; 927 int idx;
890 928
891 /* 929 /*
@@ -984,7 +1022,8 @@ static inline void
984renew_client_locked(struct nfs4_client *clp) 1022renew_client_locked(struct nfs4_client *clp)
985{ 1023{
986 if (is_client_expired(clp)) { 1024 if (is_client_expired(clp)) {
987 dprintk("%s: client (clientid %08x/%08x) already expired\n", 1025 WARN_ON(1);
1026 printk("%s: client (clientid %08x/%08x) already expired\n",
988 __func__, 1027 __func__,
989 clp->cl_clientid.cl_boot, 1028 clp->cl_clientid.cl_boot,
990 clp->cl_clientid.cl_id); 1029 clp->cl_clientid.cl_id);
@@ -1049,9 +1088,7 @@ free_client(struct nfs4_client *clp)
1049 list_del(&ses->se_perclnt); 1088 list_del(&ses->se_perclnt);
1050 nfsd4_put_session_locked(ses); 1089 nfsd4_put_session_locked(ses);
1051 } 1090 }
1052 if (clp->cl_cred.cr_group_info) 1091 free_svc_cred(&clp->cl_cred);
1053 put_group_info(clp->cl_cred.cr_group_info);
1054 kfree(clp->cl_principal);
1055 kfree(clp->cl_name.data); 1092 kfree(clp->cl_name.data);
1056 kfree(clp); 1093 kfree(clp);
1057} 1094}
@@ -1132,12 +1169,21 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
1132 target->cl_clientid.cl_id = source->cl_clientid.cl_id; 1169 target->cl_clientid.cl_id = source->cl_clientid.cl_id;
1133} 1170}
1134 1171
1135static void copy_cred(struct svc_cred *target, struct svc_cred *source) 1172static int copy_cred(struct svc_cred *target, struct svc_cred *source)
1136{ 1173{
1174 if (source->cr_principal) {
1175 target->cr_principal =
1176 kstrdup(source->cr_principal, GFP_KERNEL);
1177 if (target->cr_principal == NULL)
1178 return -ENOMEM;
1179 } else
1180 target->cr_principal = NULL;
1181 target->cr_flavor = source->cr_flavor;
1137 target->cr_uid = source->cr_uid; 1182 target->cr_uid = source->cr_uid;
1138 target->cr_gid = source->cr_gid; 1183 target->cr_gid = source->cr_gid;
1139 target->cr_group_info = source->cr_group_info; 1184 target->cr_group_info = source->cr_group_info;
1140 get_group_info(target->cr_group_info); 1185 get_group_info(target->cr_group_info);
1186 return 0;
1141} 1187}
1142 1188
1143static int same_name(const char *n1, const char *n2) 1189static int same_name(const char *n1, const char *n2)
@@ -1157,11 +1203,31 @@ same_clid(clientid_t *cl1, clientid_t *cl2)
1157 return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id); 1203 return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id);
1158} 1204}
1159 1205
1160/* XXX what about NGROUP */ 1206static bool groups_equal(struct group_info *g1, struct group_info *g2)
1207{
1208 int i;
1209
1210 if (g1->ngroups != g2->ngroups)
1211 return false;
1212 for (i=0; i<g1->ngroups; i++)
1213 if (GROUP_AT(g1, i) != GROUP_AT(g2, i))
1214 return false;
1215 return true;
1216}
1217
1161static int 1218static int
1162same_creds(struct svc_cred *cr1, struct svc_cred *cr2) 1219same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
1163{ 1220{
1164 return cr1->cr_uid == cr2->cr_uid; 1221 if ((cr1->cr_flavor != cr2->cr_flavor)
1222 || (cr1->cr_uid != cr2->cr_uid)
1223 || (cr1->cr_gid != cr2->cr_gid)
1224 || !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
1225 return false;
1226 if (cr1->cr_principal == cr2->cr_principal)
1227 return true;
1228 if (!cr1->cr_principal || !cr2->cr_principal)
1229 return false;
1230 return 0 == strcmp(cr1->cr_principal, cr1->cr_principal);
1165} 1231}
1166 1232
1167static void gen_clid(struct nfs4_client *clp) 1233static void gen_clid(struct nfs4_client *clp)
@@ -1204,25 +1270,20 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1204{ 1270{
1205 struct nfs4_client *clp; 1271 struct nfs4_client *clp;
1206 struct sockaddr *sa = svc_addr(rqstp); 1272 struct sockaddr *sa = svc_addr(rqstp);
1207 char *princ; 1273 int ret;
1208 1274
1209 clp = alloc_client(name); 1275 clp = alloc_client(name);
1210 if (clp == NULL) 1276 if (clp == NULL)
1211 return NULL; 1277 return NULL;
1212 1278
1213 INIT_LIST_HEAD(&clp->cl_sessions); 1279 INIT_LIST_HEAD(&clp->cl_sessions);
1214 1280 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
1215 princ = svc_gss_principal(rqstp); 1281 if (ret) {
1216 if (princ) { 1282 spin_lock(&client_lock);
1217 clp->cl_principal = kstrdup(princ, GFP_KERNEL); 1283 free_client(clp);
1218 if (clp->cl_principal == NULL) { 1284 spin_unlock(&client_lock);
1219 spin_lock(&client_lock); 1285 return NULL;
1220 free_client(clp);
1221 spin_unlock(&client_lock);
1222 return NULL;
1223 }
1224 } 1286 }
1225
1226 idr_init(&clp->cl_stateids); 1287 idr_init(&clp->cl_stateids);
1227 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 1288 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
1228 atomic_set(&clp->cl_refcount, 0); 1289 atomic_set(&clp->cl_refcount, 0);
@@ -1240,8 +1301,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1240 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1301 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1241 copy_verf(clp, verf); 1302 copy_verf(clp, verf);
1242 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); 1303 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
1243 clp->cl_flavor = rqstp->rq_flavor;
1244 copy_cred(&clp->cl_cred, &rqstp->rq_cred);
1245 gen_confirm(clp); 1304 gen_confirm(clp);
1246 clp->cl_cb_session = NULL; 1305 clp->cl_cb_session = NULL;
1247 return clp; 1306 return clp;
@@ -1470,18 +1529,32 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
1470 clid->flags = new->cl_exchange_flags; 1529 clid->flags = new->cl_exchange_flags;
1471} 1530}
1472 1531
1532static bool client_has_state(struct nfs4_client *clp)
1533{
1534 /*
1535 * Note clp->cl_openowners check isn't quite right: there's no
1536 * need to count owners without stateid's.
1537 *
1538 * Also note we should probably be using this in 4.0 case too.
1539 */
1540 return !list_empty(&clp->cl_openowners)
1541 || !list_empty(&clp->cl_delegations)
1542 || !list_empty(&clp->cl_sessions);
1543}
1544
1473__be32 1545__be32
1474nfsd4_exchange_id(struct svc_rqst *rqstp, 1546nfsd4_exchange_id(struct svc_rqst *rqstp,
1475 struct nfsd4_compound_state *cstate, 1547 struct nfsd4_compound_state *cstate,
1476 struct nfsd4_exchange_id *exid) 1548 struct nfsd4_exchange_id *exid)
1477{ 1549{
1478 struct nfs4_client *unconf, *conf, *new; 1550 struct nfs4_client *unconf, *conf, *new;
1479 int status; 1551 __be32 status;
1480 unsigned int strhashval; 1552 unsigned int strhashval;
1481 char dname[HEXDIR_LEN]; 1553 char dname[HEXDIR_LEN];
1482 char addr_str[INET6_ADDRSTRLEN]; 1554 char addr_str[INET6_ADDRSTRLEN];
1483 nfs4_verifier verf = exid->verifier; 1555 nfs4_verifier verf = exid->verifier;
1484 struct sockaddr *sa = svc_addr(rqstp); 1556 struct sockaddr *sa = svc_addr(rqstp);
1557 bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
1485 1558
1486 rpc_ntop(sa, addr_str, sizeof(addr_str)); 1559 rpc_ntop(sa, addr_str, sizeof(addr_str));
1487 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " 1560 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
@@ -1507,71 +1580,63 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1507 status = nfs4_make_rec_clidname(dname, &exid->clname); 1580 status = nfs4_make_rec_clidname(dname, &exid->clname);
1508 1581
1509 if (status) 1582 if (status)
1510 goto error; 1583 return status;
1511 1584
1512 strhashval = clientstr_hashval(dname); 1585 strhashval = clientstr_hashval(dname);
1513 1586
1587 /* Cases below refer to rfc 5661 section 18.35.4: */
1514 nfs4_lock_state(); 1588 nfs4_lock_state();
1515 status = nfs_ok;
1516
1517 conf = find_confirmed_client_by_str(dname, strhashval); 1589 conf = find_confirmed_client_by_str(dname, strhashval);
1518 if (conf) { 1590 if (conf) {
1519 if (!clp_used_exchangeid(conf)) { 1591 bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
1520 status = nfserr_clid_inuse; /* XXX: ? */ 1592 bool verfs_match = same_verf(&verf, &conf->cl_verifier);
1521 goto out; 1593
1522 } 1594 if (update) {
1523 if (!same_verf(&verf, &conf->cl_verifier)) { 1595 if (!clp_used_exchangeid(conf)) { /* buggy client */
1524 /* 18.35.4 case 8 */ 1596 status = nfserr_inval;
1525 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { 1597 goto out;
1598 }
1599 if (!creds_match) { /* case 9 */
1600 status = nfserr_perm;
1601 goto out;
1602 }
1603 if (!verfs_match) { /* case 8 */
1526 status = nfserr_not_same; 1604 status = nfserr_not_same;
1527 goto out; 1605 goto out;
1528 } 1606 }
1529 /* Client reboot: destroy old state */ 1607 /* case 6 */
1530 expire_client(conf); 1608 exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
1531 goto out_new; 1609 new = conf;
1610 goto out_copy;
1532 } 1611 }
1533 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { 1612 if (!creds_match) { /* case 3 */
1534 /* 18.35.4 case 9 */ 1613 if (client_has_state(conf)) {
1535 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { 1614 status = nfserr_clid_inuse;
1536 status = nfserr_perm;
1537 goto out; 1615 goto out;
1538 } 1616 }
1539 expire_client(conf); 1617 expire_client(conf);
1540 goto out_new; 1618 goto out_new;
1541 } 1619 }
1542 /* 1620 if (verfs_match) { /* case 2 */
1543 * Set bit when the owner id and verifier map to an already 1621 conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
1544 * confirmed client id (18.35.3). 1622 new = conf;
1545 */ 1623 goto out_copy;
1546 exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; 1624 }
1547 1625 /* case 5, client reboot */
1548 /* 1626 goto out_new;
1549 * Falling into 18.35.4 case 2, possible router replay.
1550 * Leave confirmed record intact and return same result.
1551 */
1552 copy_verf(conf, &verf);
1553 new = conf;
1554 goto out_copy;
1555 } 1627 }
1556 1628
1557 /* 18.35.4 case 7 */ 1629 if (update) { /* case 7 */
1558 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1559 status = nfserr_noent; 1630 status = nfserr_noent;
1560 goto out; 1631 goto out;
1561 } 1632 }
1562 1633
1563 unconf = find_unconfirmed_client_by_str(dname, strhashval); 1634 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1564 if (unconf) { 1635 if (unconf) /* case 4, possible retry or client restart */
1565 /*
1566 * Possible retry or client restart. Per 18.35.4 case 4,
1567 * a new unconfirmed record should be generated regardless
1568 * of whether any properties have changed.
1569 */
1570 expire_client(unconf); 1636 expire_client(unconf);
1571 }
1572 1637
1638 /* case 1 (normal case) */
1573out_new: 1639out_new:
1574 /* Normal case */
1575 new = create_client(exid->clname, dname, rqstp, &verf); 1640 new = create_client(exid->clname, dname, rqstp, &verf);
1576 if (new == NULL) { 1641 if (new == NULL) {
1577 status = nfserr_jukebox; 1642 status = nfserr_jukebox;
@@ -1584,7 +1649,7 @@ out_copy:
1584 exid->clientid.cl_boot = new->cl_clientid.cl_boot; 1649 exid->clientid.cl_boot = new->cl_clientid.cl_boot;
1585 exid->clientid.cl_id = new->cl_clientid.cl_id; 1650 exid->clientid.cl_id = new->cl_clientid.cl_id;
1586 1651
1587 exid->seqid = 1; 1652 exid->seqid = new->cl_cs_slot.sl_seqid + 1;
1588 nfsd4_set_ex_flags(new, exid); 1653 nfsd4_set_ex_flags(new, exid);
1589 1654
1590 dprintk("nfsd4_exchange_id seqid %d flags %x\n", 1655 dprintk("nfsd4_exchange_id seqid %d flags %x\n",
@@ -1593,12 +1658,10 @@ out_copy:
1593 1658
1594out: 1659out:
1595 nfs4_unlock_state(); 1660 nfs4_unlock_state();
1596error:
1597 dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
1598 return status; 1661 return status;
1599} 1662}
1600 1663
1601static int 1664static __be32
1602check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) 1665check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
1603{ 1666{
1604 dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid, 1667 dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
@@ -1626,7 +1689,7 @@ check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
1626 */ 1689 */
1627static void 1690static void
1628nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, 1691nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
1629 struct nfsd4_clid_slot *slot, int nfserr) 1692 struct nfsd4_clid_slot *slot, __be32 nfserr)
1630{ 1693{
1631 slot->sl_status = nfserr; 1694 slot->sl_status = nfserr;
1632 memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); 1695 memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
@@ -1657,7 +1720,7 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
1657 /* seqid, slotID, slotID, slotID, status */ \ 1720 /* seqid, slotID, slotID, slotID, status */ \
1658 5 ) * sizeof(__be32)) 1721 5 ) * sizeof(__be32))
1659 1722
1660static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs fchannel) 1723static bool check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
1661{ 1724{
1662 return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ 1725 return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ
1663 || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ; 1726 || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ;
@@ -1673,7 +1736,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1673 struct nfsd4_session *new; 1736 struct nfsd4_session *new;
1674 struct nfsd4_clid_slot *cs_slot = NULL; 1737 struct nfsd4_clid_slot *cs_slot = NULL;
1675 bool confirm_me = false; 1738 bool confirm_me = false;
1676 int status = 0; 1739 __be32 status = 0;
1677 1740
1678 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) 1741 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1679 return nfserr_inval; 1742 return nfserr_inval;
@@ -1686,16 +1749,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1686 cs_slot = &conf->cl_cs_slot; 1749 cs_slot = &conf->cl_cs_slot;
1687 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1750 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1688 if (status == nfserr_replay_cache) { 1751 if (status == nfserr_replay_cache) {
1689 dprintk("Got a create_session replay! seqid= %d\n",
1690 cs_slot->sl_seqid);
1691 /* Return the cached reply status */
1692 status = nfsd4_replay_create_session(cr_ses, cs_slot); 1752 status = nfsd4_replay_create_session(cr_ses, cs_slot);
1693 goto out; 1753 goto out;
1694 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { 1754 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
1695 status = nfserr_seq_misordered; 1755 status = nfserr_seq_misordered;
1696 dprintk("Sequence misordered!\n");
1697 dprintk("Expected seqid= %d but got seqid= %d\n",
1698 cs_slot->sl_seqid, cr_ses->seqid);
1699 goto out; 1756 goto out;
1700 } 1757 }
1701 } else if (unconf) { 1758 } else if (unconf) {
@@ -1704,7 +1761,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1704 status = nfserr_clid_inuse; 1761 status = nfserr_clid_inuse;
1705 goto out; 1762 goto out;
1706 } 1763 }
1707
1708 cs_slot = &unconf->cl_cs_slot; 1764 cs_slot = &unconf->cl_cs_slot;
1709 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1765 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1710 if (status) { 1766 if (status) {
@@ -1712,7 +1768,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1712 status = nfserr_seq_misordered; 1768 status = nfserr_seq_misordered;
1713 goto out; 1769 goto out;
1714 } 1770 }
1715
1716 confirm_me = true; 1771 confirm_me = true;
1717 conf = unconf; 1772 conf = unconf;
1718 } else { 1773 } else {
@@ -1749,8 +1804,14 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1749 1804
1750 /* cache solo and embedded create sessions under the state lock */ 1805 /* cache solo and embedded create sessions under the state lock */
1751 nfsd4_cache_create_session(cr_ses, cs_slot, status); 1806 nfsd4_cache_create_session(cr_ses, cs_slot, status);
1752 if (confirm_me) 1807 if (confirm_me) {
1808 unsigned int hash = clientstr_hashval(unconf->cl_recdir);
1809 struct nfs4_client *old =
1810 find_confirmed_client_by_str(conf->cl_recdir, hash);
1811 if (old)
1812 expire_client(old);
1753 move_to_confirmed(conf); 1813 move_to_confirmed(conf);
1814 }
1754out: 1815out:
1755 nfs4_unlock_state(); 1816 nfs4_unlock_state();
1756 dprintk("%s returns %d\n", __func__, ntohl(status)); 1817 dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1818,7 +1879,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
1818 struct nfsd4_destroy_session *sessionid) 1879 struct nfsd4_destroy_session *sessionid)
1819{ 1880{
1820 struct nfsd4_session *ses; 1881 struct nfsd4_session *ses;
1821 u32 status = nfserr_badsession; 1882 __be32 status = nfserr_badsession;
1822 1883
1823 /* Notes: 1884 /* Notes:
1824 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid 1885 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
@@ -1914,7 +1975,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1914 struct nfsd4_session *session; 1975 struct nfsd4_session *session;
1915 struct nfsd4_slot *slot; 1976 struct nfsd4_slot *slot;
1916 struct nfsd4_conn *conn; 1977 struct nfsd4_conn *conn;
1917 int status; 1978 __be32 status;
1918 1979
1919 if (resp->opcnt != 1) 1980 if (resp->opcnt != 1)
1920 return nfserr_sequence_pos; 1981 return nfserr_sequence_pos;
@@ -2008,18 +2069,11 @@ out:
2008 return status; 2069 return status;
2009} 2070}
2010 2071
2011static inline bool has_resources(struct nfs4_client *clp)
2012{
2013 return !list_empty(&clp->cl_openowners)
2014 || !list_empty(&clp->cl_delegations)
2015 || !list_empty(&clp->cl_sessions);
2016}
2017
2018__be32 2072__be32
2019nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) 2073nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
2020{ 2074{
2021 struct nfs4_client *conf, *unconf, *clp; 2075 struct nfs4_client *conf, *unconf, *clp;
2022 int status = 0; 2076 __be32 status = 0;
2023 2077
2024 nfs4_lock_state(); 2078 nfs4_lock_state();
2025 unconf = find_unconfirmed_client(&dc->clientid); 2079 unconf = find_unconfirmed_client(&dc->clientid);
@@ -2028,7 +2082,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2028 if (conf) { 2082 if (conf) {
2029 clp = conf; 2083 clp = conf;
2030 2084
2031 if (!is_client_expired(conf) && has_resources(conf)) { 2085 if (!is_client_expired(conf) && client_has_state(conf)) {
2032 status = nfserr_clientid_busy; 2086 status = nfserr_clientid_busy;
2033 goto out; 2087 goto out;
2034 } 2088 }
@@ -2055,7 +2109,7 @@ out:
2055__be32 2109__be32
2056nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) 2110nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
2057{ 2111{
2058 int status = 0; 2112 __be32 status = 0;
2059 2113
2060 if (rc->rca_one_fs) { 2114 if (rc->rca_one_fs) {
2061 if (!cstate->current_fh.fh_dentry) 2115 if (!cstate->current_fh.fh_dentry)
@@ -2106,17 +2160,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2106 if (status) 2160 if (status)
2107 return status; 2161 return status;
2108 2162
2109 /*
2110 * XXX The Duplicate Request Cache (DRC) has been checked (??)
2111 * We get here on a DRC miss.
2112 */
2113
2114 strhashval = clientstr_hashval(dname); 2163 strhashval = clientstr_hashval(dname);
2115 2164
2165 /* Cases below refer to rfc 3530 section 14.2.33: */
2116 nfs4_lock_state(); 2166 nfs4_lock_state();
2117 conf = find_confirmed_client_by_str(dname, strhashval); 2167 conf = find_confirmed_client_by_str(dname, strhashval);
2118 if (conf) { 2168 if (conf) {
2119 /* RFC 3530 14.2.33 CASE 0: */ 2169 /* case 0: */
2120 status = nfserr_clid_inuse; 2170 status = nfserr_clid_inuse;
2121 if (clp_used_exchangeid(conf)) 2171 if (clp_used_exchangeid(conf))
2122 goto out; 2172 goto out;
@@ -2129,63 +2179,18 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2129 goto out; 2179 goto out;
2130 } 2180 }
2131 } 2181 }
2132 /*
2133 * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION")
2134 * has a description of SETCLIENTID request processing consisting
2135 * of 5 bullet points, labeled as CASE0 - CASE4 below.
2136 */
2137 unconf = find_unconfirmed_client_by_str(dname, strhashval); 2182 unconf = find_unconfirmed_client_by_str(dname, strhashval);
2183 if (unconf)
2184 expire_client(unconf);
2138 status = nfserr_jukebox; 2185 status = nfserr_jukebox;
2139 if (!conf) { 2186 new = create_client(clname, dname, rqstp, &clverifier);
2140 /* 2187 if (new == NULL)
2141 * RFC 3530 14.2.33 CASE 4: 2188 goto out;
2142 * placed first, because it is the normal case 2189 if (conf && same_verf(&conf->cl_verifier, &clverifier))
2143 */ 2190 /* case 1: probable callback update */
2144 if (unconf)
2145 expire_client(unconf);
2146 new = create_client(clname, dname, rqstp, &clverifier);
2147 if (new == NULL)
2148 goto out;
2149 gen_clid(new);
2150 } else if (same_verf(&conf->cl_verifier, &clverifier)) {
2151 /*
2152 * RFC 3530 14.2.33 CASE 1:
2153 * probable callback update
2154 */
2155 if (unconf) {
2156 /* Note this is removing unconfirmed {*x***},
2157 * which is stronger than RFC recommended {vxc**}.
2158 * This has the advantage that there is at most
2159 * one {*x***} in either list at any time.
2160 */
2161 expire_client(unconf);
2162 }
2163 new = create_client(clname, dname, rqstp, &clverifier);
2164 if (new == NULL)
2165 goto out;
2166 copy_clid(new, conf); 2191 copy_clid(new, conf);
2167 } else if (!unconf) { 2192 else /* case 4 (new client) or cases 2, 3 (client reboot): */
2168 /*
2169 * RFC 3530 14.2.33 CASE 2:
2170 * probable client reboot; state will be removed if
2171 * confirmed.
2172 */
2173 new = create_client(clname, dname, rqstp, &clverifier);
2174 if (new == NULL)
2175 goto out;
2176 gen_clid(new);
2177 } else {
2178 /*
2179 * RFC 3530 14.2.33 CASE 3:
2180 * probable client reboot; state will be removed if
2181 * confirmed.
2182 */
2183 expire_client(unconf);
2184 new = create_client(clname, dname, rqstp, &clverifier);
2185 if (new == NULL)
2186 goto out;
2187 gen_clid(new); 2193 gen_clid(new);
2188 }
2189 /* 2194 /*
2190 * XXX: we should probably set this at creation time, and check 2195 * XXX: we should probably set this at creation time, and check
2191 * for consistent minorversion use throughout: 2196 * for consistent minorversion use throughout:
@@ -2203,17 +2208,11 @@ out:
2203} 2208}
2204 2209
2205 2210
2206/*
2207 * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has
2208 * a description of SETCLIENTID_CONFIRM request processing consisting of 4
2209 * bullets, labeled as CASE1 - CASE4 below.
2210 */
2211__be32 2211__be32
2212nfsd4_setclientid_confirm(struct svc_rqst *rqstp, 2212nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2213 struct nfsd4_compound_state *cstate, 2213 struct nfsd4_compound_state *cstate,
2214 struct nfsd4_setclientid_confirm *setclientid_confirm) 2214 struct nfsd4_setclientid_confirm *setclientid_confirm)
2215{ 2215{
2216 struct sockaddr *sa = svc_addr(rqstp);
2217 struct nfs4_client *conf, *unconf; 2216 struct nfs4_client *conf, *unconf;
2218 nfs4_verifier confirm = setclientid_confirm->sc_confirm; 2217 nfs4_verifier confirm = setclientid_confirm->sc_confirm;
2219 clientid_t * clid = &setclientid_confirm->sc_clientid; 2218 clientid_t * clid = &setclientid_confirm->sc_clientid;
@@ -2221,84 +2220,44 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2221 2220
2222 if (STALE_CLIENTID(clid)) 2221 if (STALE_CLIENTID(clid))
2223 return nfserr_stale_clientid; 2222 return nfserr_stale_clientid;
2224 /*
2225 * XXX The Duplicate Request Cache (DRC) has been checked (??)
2226 * We get here on a DRC miss.
2227 */
2228
2229 nfs4_lock_state(); 2223 nfs4_lock_state();
2230 2224
2231 conf = find_confirmed_client(clid); 2225 conf = find_confirmed_client(clid);
2232 unconf = find_unconfirmed_client(clid); 2226 unconf = find_unconfirmed_client(clid);
2233
2234 status = nfserr_clid_inuse;
2235 if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa))
2236 goto out;
2237 if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa))
2238 goto out;
2239
2240 /* 2227 /*
2241 * section 14.2.34 of RFC 3530 has a description of 2228 * We try hard to give out unique clientid's, so if we get an
2242 * SETCLIENTID_CONFIRM request processing consisting 2229 * attempt to confirm the same clientid with a different cred,
2243 * of 4 bullet points, labeled as CASE1 - CASE4 below. 2230 * there's a bug somewhere. Let's charitably assume it's our
2231 * bug.
2244 */ 2232 */
2245 if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) { 2233 status = nfserr_serverfault;
2246 /* 2234 if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred))
2247 * RFC 3530 14.2.34 CASE 1: 2235 goto out;
2248 * callback update 2236 if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred))
2249 */ 2237 goto out;
2250 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 2238 /* cases below refer to rfc 3530 section 14.2.34: */
2251 status = nfserr_clid_inuse; 2239 if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
2252 else { 2240 if (conf && !unconf) /* case 2: probable retransmit */
2253 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
2254 nfsd4_probe_callback(conf);
2255 expire_client(unconf);
2256 status = nfs_ok; 2241 status = nfs_ok;
2242 else /* case 4: client hasn't noticed we rebooted yet? */
2243 status = nfserr_stale_clientid;
2244 goto out;
2245 }
2246 status = nfs_ok;
2247 if (conf) { /* case 1: callback update */
2248 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
2249 nfsd4_probe_callback(conf);
2250 expire_client(unconf);
2251 } else { /* case 3: normal case; new or rebooted client */
2252 unsigned int hash = clientstr_hashval(unconf->cl_recdir);
2257 2253
2254 conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
2255 if (conf) {
2256 nfsd4_client_record_remove(conf);
2257 expire_client(conf);
2258 } 2258 }
2259 } else if (conf && !unconf) { 2259 move_to_confirmed(unconf);
2260 /* 2260 nfsd4_probe_callback(unconf);
2261 * RFC 3530 14.2.34 CASE 2:
2262 * probable retransmitted request; play it safe and
2263 * do nothing.
2264 */
2265 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred))
2266 status = nfserr_clid_inuse;
2267 else
2268 status = nfs_ok;
2269 } else if (!conf && unconf
2270 && same_verf(&unconf->cl_confirm, &confirm)) {
2271 /*
2272 * RFC 3530 14.2.34 CASE 3:
2273 * Normal case; new or rebooted client:
2274 */
2275 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
2276 status = nfserr_clid_inuse;
2277 } else {
2278 unsigned int hash =
2279 clientstr_hashval(unconf->cl_recdir);
2280 conf = find_confirmed_client_by_str(unconf->cl_recdir,
2281 hash);
2282 if (conf) {
2283 nfsd4_client_record_remove(conf);
2284 expire_client(conf);
2285 }
2286 move_to_confirmed(unconf);
2287 conf = unconf;
2288 nfsd4_probe_callback(conf);
2289 status = nfs_ok;
2290 }
2291 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
2292 && (!unconf || (unconf && !same_verf(&unconf->cl_confirm,
2293 &confirm)))) {
2294 /*
2295 * RFC 3530 14.2.34 CASE 4:
2296 * Client probably hasn't noticed that we rebooted yet.
2297 */
2298 status = nfserr_stale_clientid;
2299 } else {
2300 /* check that we have hit one of the cases...*/
2301 status = nfserr_clid_inuse;
2302 } 2261 }
2303out: 2262out:
2304 nfs4_unlock_state(); 2263 nfs4_unlock_state();
@@ -2454,8 +2413,8 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
2454 stp->st_file = fp; 2413 stp->st_file = fp;
2455 stp->st_access_bmap = 0; 2414 stp->st_access_bmap = 0;
2456 stp->st_deny_bmap = 0; 2415 stp->st_deny_bmap = 0;
2457 __set_bit(open->op_share_access, &stp->st_access_bmap); 2416 set_access(open->op_share_access, stp);
2458 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 2417 set_deny(open->op_share_deny, stp);
2459 stp->st_openstp = NULL; 2418 stp->st_openstp = NULL;
2460} 2419}
2461 2420
@@ -2534,8 +2493,8 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
2534 ret = nfserr_locked; 2493 ret = nfserr_locked;
2535 /* Search for conflicting share reservations */ 2494 /* Search for conflicting share reservations */
2536 list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { 2495 list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
2537 if (test_bit(deny_type, &stp->st_deny_bmap) || 2496 if (test_deny(deny_type, stp) ||
2538 test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap)) 2497 test_deny(NFS4_SHARE_DENY_BOTH, stp))
2539 goto out; 2498 goto out;
2540 } 2499 }
2541 ret = nfs_ok; 2500 ret = nfs_ok;
@@ -2791,7 +2750,7 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
2791 bool new_access; 2750 bool new_access;
2792 __be32 status; 2751 __be32 status;
2793 2752
2794 new_access = !test_bit(op_share_access, &stp->st_access_bmap); 2753 new_access = !test_access(op_share_access, stp);
2795 if (new_access) { 2754 if (new_access) {
2796 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); 2755 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open);
2797 if (status) 2756 if (status)
@@ -2806,8 +2765,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
2806 return status; 2765 return status;
2807 } 2766 }
2808 /* remember the open */ 2767 /* remember the open */
2809 __set_bit(op_share_access, &stp->st_access_bmap); 2768 set_access(op_share_access, stp);
2810 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 2769 set_deny(open->op_share_deny, stp);
2811 2770
2812 return nfs_ok; 2771 return nfs_ok;
2813} 2772}
@@ -3282,18 +3241,18 @@ STALE_STATEID(stateid_t *stateid)
3282} 3241}
3283 3242
3284static inline int 3243static inline int
3285access_permit_read(unsigned long access_bmap) 3244access_permit_read(struct nfs4_ol_stateid *stp)
3286{ 3245{
3287 return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) || 3246 return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
3288 test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap) || 3247 test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
3289 test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap); 3248 test_access(NFS4_SHARE_ACCESS_WRITE, stp);
3290} 3249}
3291 3250
3292static inline int 3251static inline int
3293access_permit_write(unsigned long access_bmap) 3252access_permit_write(struct nfs4_ol_stateid *stp)
3294{ 3253{
3295 return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) || 3254 return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
3296 test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); 3255 test_access(NFS4_SHARE_ACCESS_BOTH, stp);
3297} 3256}
3298 3257
3299static 3258static
@@ -3304,9 +3263,9 @@ __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
3304 /* For lock stateid's, we test the parent open, not the lock: */ 3263 /* For lock stateid's, we test the parent open, not the lock: */
3305 if (stp->st_openstp) 3264 if (stp->st_openstp)
3306 stp = stp->st_openstp; 3265 stp = stp->st_openstp;
3307 if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) 3266 if ((flags & WR_STATE) && !access_permit_write(stp))
3308 goto out; 3267 goto out;
3309 if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap))) 3268 if ((flags & RD_STATE) && !access_permit_read(stp))
3310 goto out; 3269 goto out;
3311 status = nfs_ok; 3270 status = nfs_ok;
3312out: 3271out:
@@ -3346,7 +3305,7 @@ static bool stateid_generation_after(stateid_t *a, stateid_t *b)
3346 return (s32)a->si_generation - (s32)b->si_generation > 0; 3305 return (s32)a->si_generation - (s32)b->si_generation > 0;
3347} 3306}
3348 3307
3349static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) 3308static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
3350{ 3309{
3351 /* 3310 /*
3352 * When sessions are used the stateid generation number is ignored 3311 * When sessions are used the stateid generation number is ignored
@@ -3655,10 +3614,10 @@ out:
3655 3614
3656static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) 3615static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
3657{ 3616{
3658 if (!test_bit(access, &stp->st_access_bmap)) 3617 if (!test_access(access, stp))
3659 return; 3618 return;
3660 nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); 3619 nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
3661 __clear_bit(access, &stp->st_access_bmap); 3620 clear_access(access, stp);
3662} 3621}
3663 3622
3664static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) 3623static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
@@ -3680,12 +3639,12 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
3680} 3639}
3681 3640
3682static void 3641static void
3683reset_union_bmap_deny(unsigned long deny, unsigned long *bmap) 3642reset_union_bmap_deny(unsigned long deny, struct nfs4_ol_stateid *stp)
3684{ 3643{
3685 int i; 3644 int i;
3686 for (i = 0; i < 4; i++) { 3645 for (i = 0; i < 4; i++) {
3687 if ((i & deny) != i) 3646 if ((i & deny) != i)
3688 __clear_bit(i, bmap); 3647 clear_deny(i, stp);
3689 } 3648 }
3690} 3649}
3691 3650
@@ -3712,19 +3671,19 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
3712 if (status) 3671 if (status)
3713 goto out; 3672 goto out;
3714 status = nfserr_inval; 3673 status = nfserr_inval;
3715 if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { 3674 if (!test_access(od->od_share_access, stp)) {
3716 dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", 3675 dprintk("NFSD: access not a subset current bitmap: 0x%lx, input access=%08x\n",
3717 stp->st_access_bmap, od->od_share_access); 3676 stp->st_access_bmap, od->od_share_access);
3718 goto out; 3677 goto out;
3719 } 3678 }
3720 if (!test_bit(od->od_share_deny, &stp->st_deny_bmap)) { 3679 if (!test_deny(od->od_share_deny, stp)) {
3721 dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n", 3680 dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n",
3722 stp->st_deny_bmap, od->od_share_deny); 3681 stp->st_deny_bmap, od->od_share_deny);
3723 goto out; 3682 goto out;
3724 } 3683 }
3725 nfs4_stateid_downgrade(stp, od->od_share_access); 3684 nfs4_stateid_downgrade(stp, od->od_share_access);
3726 3685
3727 reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); 3686 reset_union_bmap_deny(od->od_share_deny, stp);
3728 3687
3729 update_stateid(&stp->st_stid.sc_stateid); 3688 update_stateid(&stp->st_stid.sc_stateid);
3730 memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 3689 memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
@@ -4014,13 +3973,13 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
4014 struct nfs4_file *fp = lock_stp->st_file; 3973 struct nfs4_file *fp = lock_stp->st_file;
4015 int oflag = nfs4_access_to_omode(access); 3974 int oflag = nfs4_access_to_omode(access);
4016 3975
4017 if (test_bit(access, &lock_stp->st_access_bmap)) 3976 if (test_access(access, lock_stp))
4018 return; 3977 return;
4019 nfs4_file_get_access(fp, oflag); 3978 nfs4_file_get_access(fp, oflag);
4020 __set_bit(access, &lock_stp->st_access_bmap); 3979 set_access(access, lock_stp);
4021} 3980}
4022 3981
4023__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new) 3982static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
4024{ 3983{
4025 struct nfs4_file *fi = ost->st_file; 3984 struct nfs4_file *fi = ost->st_file;
4026 struct nfs4_openowner *oo = openowner(ost->st_stateowner); 3985 struct nfs4_openowner *oo = openowner(ost->st_stateowner);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 74c00bc92b9a..4949667c84ea 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1674,12 +1674,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1674 1674
1675static void write32(__be32 **p, u32 n) 1675static void write32(__be32 **p, u32 n)
1676{ 1676{
1677 *(*p)++ = n; 1677 *(*p)++ = htonl(n);
1678} 1678}
1679 1679
1680static void write64(__be32 **p, u64 n) 1680static void write64(__be32 **p, u64 n)
1681{ 1681{
1682 write32(p, (u32)(n >> 32)); 1682 write32(p, (n >> 32));
1683 write32(p, (u32)n); 1683 write32(p, (u32)n);
1684} 1684}
1685 1685
@@ -1744,15 +1744,16 @@ static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, _
1744} 1744}
1745 1745
1746/* Encode as an array of strings the string given with components 1746/* Encode as an array of strings the string given with components
1747 * separated @sep. 1747 * separated @sep, escaped with esc_enter and esc_exit.
1748 */ 1748 */
1749static __be32 nfsd4_encode_components(char sep, char *components, 1749static __be32 nfsd4_encode_components_esc(char sep, char *components,
1750 __be32 **pp, int *buflen) 1750 __be32 **pp, int *buflen,
1751 char esc_enter, char esc_exit)
1751{ 1752{
1752 __be32 *p = *pp; 1753 __be32 *p = *pp;
1753 __be32 *countp = p; 1754 __be32 *countp = p;
1754 int strlen, count=0; 1755 int strlen, count=0;
1755 char *str, *end; 1756 char *str, *end, *next;
1756 1757
1757 dprintk("nfsd4_encode_components(%s)\n", components); 1758 dprintk("nfsd4_encode_components(%s)\n", components);
1758 if ((*buflen -= 4) < 0) 1759 if ((*buflen -= 4) < 0)
@@ -1760,8 +1761,23 @@ static __be32 nfsd4_encode_components(char sep, char *components,
1760 WRITE32(0); /* We will fill this in with @count later */ 1761 WRITE32(0); /* We will fill this in with @count later */
1761 end = str = components; 1762 end = str = components;
1762 while (*end) { 1763 while (*end) {
1763 for (; *end && (*end != sep); end++) 1764 bool found_esc = false;
1764 ; /* Point to end of component */ 1765
1766 /* try to parse as esc_start, ..., esc_end, sep */
1767 if (*str == esc_enter) {
1768 for (; *end && (*end != esc_exit); end++)
1769 /* find esc_exit or end of string */;
1770 next = end + 1;
1771 if (*end && (!*next || *next == sep)) {
1772 str++;
1773 found_esc = true;
1774 }
1775 }
1776
1777 if (!found_esc)
1778 for (; *end && (*end != sep); end++)
1779 /* find sep or end of string */;
1780
1765 strlen = end - str; 1781 strlen = end - str;
1766 if (strlen) { 1782 if (strlen) {
1767 if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0) 1783 if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0)
@@ -1780,6 +1796,15 @@ static __be32 nfsd4_encode_components(char sep, char *components,
1780 return 0; 1796 return 0;
1781} 1797}
1782 1798
1799/* Encode as an array of strings the string given with components
1800 * separated @sep.
1801 */
1802static __be32 nfsd4_encode_components(char sep, char *components,
1803 __be32 **pp, int *buflen)
1804{
1805 return nfsd4_encode_components_esc(sep, components, pp, buflen, 0, 0);
1806}
1807
1783/* 1808/*
1784 * encode a location element of a fs_locations structure 1809 * encode a location element of a fs_locations structure
1785 */ 1810 */
@@ -1789,7 +1814,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
1789 __be32 status; 1814 __be32 status;
1790 __be32 *p = *pp; 1815 __be32 *p = *pp;
1791 1816
1792 status = nfsd4_encode_components(':', location->hosts, &p, buflen); 1817 status = nfsd4_encode_components_esc(':', location->hosts, &p, buflen,
1818 '[', ']');
1793 if (status) 1819 if (status)
1794 return status; 1820 return status;
1795 status = nfsd4_encode_components('/', location->path, &p, buflen); 1821 status = nfsd4_encode_components('/', location->path, &p, buflen);
@@ -3251,7 +3277,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
3251} 3277}
3252 3278
3253static __be32 3279static __be32
3254nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, 3280nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
3255 struct nfsd4_exchange_id *exid) 3281 struct nfsd4_exchange_id *exid)
3256{ 3282{
3257 __be32 *p; 3283 __be32 *p;
@@ -3306,7 +3332,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
3306} 3332}
3307 3333
3308static __be32 3334static __be32
3309nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, 3335nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
3310 struct nfsd4_create_session *sess) 3336 struct nfsd4_create_session *sess)
3311{ 3337{
3312 __be32 *p; 3338 __be32 *p;
@@ -3355,14 +3381,14 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3355} 3381}
3356 3382
3357static __be32 3383static __be32
3358nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, 3384nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr,
3359 struct nfsd4_destroy_session *destroy_session) 3385 struct nfsd4_destroy_session *destroy_session)
3360{ 3386{
3361 return nfserr; 3387 return nfserr;
3362} 3388}
3363 3389
3364static __be32 3390static __be32
3365nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr, 3391nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3366 struct nfsd4_free_stateid *free_stateid) 3392 struct nfsd4_free_stateid *free_stateid)
3367{ 3393{
3368 __be32 *p; 3394 __be32 *p;
@@ -3371,13 +3397,13 @@ nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr,
3371 return nfserr; 3397 return nfserr;
3372 3398
3373 RESERVE_SPACE(4); 3399 RESERVE_SPACE(4);
3374 WRITE32(nfserr); 3400 *p++ = nfserr;
3375 ADJUST_ARGS(); 3401 ADJUST_ARGS();
3376 return nfserr; 3402 return nfserr;
3377} 3403}
3378 3404
3379static __be32 3405static __be32
3380nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, 3406nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
3381 struct nfsd4_sequence *seq) 3407 struct nfsd4_sequence *seq)
3382{ 3408{
3383 __be32 *p; 3409 __be32 *p;
@@ -3399,8 +3425,8 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3399 return 0; 3425 return 0;
3400} 3426}
3401 3427
3402__be32 3428static __be32
3403nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, 3429nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3404 struct nfsd4_test_stateid *test_stateid) 3430 struct nfsd4_test_stateid *test_stateid)
3405{ 3431{
3406 struct nfsd4_test_stateid_id *stateid, *next; 3432 struct nfsd4_test_stateid_id *stateid, *next;
@@ -3503,7 +3529,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3503 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so 3529 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3504 * will be at least a page and will therefore hold the xdr_buf head. 3530 * will be at least a page and will therefore hold the xdr_buf head.
3505 */ 3531 */
3506int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) 3532__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
3507{ 3533{
3508 struct xdr_buf *xb = &resp->rqstp->rq_res; 3534 struct xdr_buf *xb = &resp->rqstp->rq_res;
3509 struct nfsd4_session *session = NULL; 3535 struct nfsd4_session *session = NULL;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 72699885ac48..c55298ed5772 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -661,6 +661,7 @@ static ssize_t __write_ports_addfd(char *buf)
661{ 661{
662 char *mesg = buf; 662 char *mesg = buf;
663 int fd, err; 663 int fd, err;
664 struct net *net = &init_net;
664 665
665 err = get_int(&mesg, &fd); 666 err = get_int(&mesg, &fd);
666 if (err != 0 || fd < 0) 667 if (err != 0 || fd < 0)
@@ -672,6 +673,8 @@ static ssize_t __write_ports_addfd(char *buf)
672 673
673 err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); 674 err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
674 if (err < 0) { 675 if (err < 0) {
676 if (nfsd_serv->sv_nrthreads == 1)
677 svc_shutdown_net(nfsd_serv, net);
675 svc_destroy(nfsd_serv); 678 svc_destroy(nfsd_serv);
676 return err; 679 return err;
677 } 680 }
@@ -709,6 +712,7 @@ static ssize_t __write_ports_addxprt(char *buf)
709 char transport[16]; 712 char transport[16];
710 struct svc_xprt *xprt; 713 struct svc_xprt *xprt;
711 int port, err; 714 int port, err;
715 struct net *net = &init_net;
712 716
713 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 717 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
714 return -EINVAL; 718 return -EINVAL;
@@ -720,12 +724,12 @@ static ssize_t __write_ports_addxprt(char *buf)
720 if (err != 0) 724 if (err != 0)
721 return err; 725 return err;
722 726
723 err = svc_create_xprt(nfsd_serv, transport, &init_net, 727 err = svc_create_xprt(nfsd_serv, transport, net,
724 PF_INET, port, SVC_SOCK_ANONYMOUS); 728 PF_INET, port, SVC_SOCK_ANONYMOUS);
725 if (err < 0) 729 if (err < 0)
726 goto out_err; 730 goto out_err;
727 731
728 err = svc_create_xprt(nfsd_serv, transport, &init_net, 732 err = svc_create_xprt(nfsd_serv, transport, net,
729 PF_INET6, port, SVC_SOCK_ANONYMOUS); 733 PF_INET6, port, SVC_SOCK_ANONYMOUS);
730 if (err < 0 && err != -EAFNOSUPPORT) 734 if (err < 0 && err != -EAFNOSUPPORT)
731 goto out_close; 735 goto out_close;
@@ -734,12 +738,14 @@ static ssize_t __write_ports_addxprt(char *buf)
734 nfsd_serv->sv_nrthreads--; 738 nfsd_serv->sv_nrthreads--;
735 return 0; 739 return 0;
736out_close: 740out_close:
737 xprt = svc_find_xprt(nfsd_serv, transport, &init_net, PF_INET, port); 741 xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port);
738 if (xprt != NULL) { 742 if (xprt != NULL) {
739 svc_close_xprt(xprt); 743 svc_close_xprt(xprt);
740 svc_xprt_put(xprt); 744 svc_xprt_put(xprt);
741 } 745 }
742out_err: 746out_err:
747 if (nfsd_serv->sv_nrthreads == 1)
748 svc_shutdown_net(nfsd_serv, net);
743 svc_destroy(nfsd_serv); 749 svc_destroy(nfsd_serv);
744 return err; 750 return err;
745} 751}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cb4d51d8cbdb..ee709fc8f58b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/fs_struct.h> 12#include <linux/fs_struct.h>
13#include <linux/swap.h> 13#include <linux/swap.h>
14#include <linux/nsproxy.h>
14 15
15#include <linux/sunrpc/stats.h> 16#include <linux/sunrpc/stats.h>
16#include <linux/sunrpc/svcsock.h> 17#include <linux/sunrpc/svcsock.h>
@@ -330,6 +331,8 @@ static int nfsd_get_default_max_blksize(void)
330 331
331int nfsd_create_serv(void) 332int nfsd_create_serv(void)
332{ 333{
334 int error;
335
333 WARN_ON(!mutex_is_locked(&nfsd_mutex)); 336 WARN_ON(!mutex_is_locked(&nfsd_mutex));
334 if (nfsd_serv) { 337 if (nfsd_serv) {
335 svc_get(nfsd_serv); 338 svc_get(nfsd_serv);
@@ -343,6 +346,12 @@ int nfsd_create_serv(void)
343 if (nfsd_serv == NULL) 346 if (nfsd_serv == NULL)
344 return -ENOMEM; 347 return -ENOMEM;
345 348
349 error = svc_bind(nfsd_serv, current->nsproxy->net_ns);
350 if (error < 0) {
351 svc_destroy(nfsd_serv);
352 return error;
353 }
354
346 set_max_drc(); 355 set_max_drc();
347 do_gettimeofday(&nfssvc_boot); /* record boot time */ 356 do_gettimeofday(&nfssvc_boot); /* record boot time */
348 return 0; 357 return 0;
@@ -373,6 +382,7 @@ int nfsd_set_nrthreads(int n, int *nthreads)
373 int i = 0; 382 int i = 0;
374 int tot = 0; 383 int tot = 0;
375 int err = 0; 384 int err = 0;
385 struct net *net = &init_net;
376 386
377 WARN_ON(!mutex_is_locked(&nfsd_mutex)); 387 WARN_ON(!mutex_is_locked(&nfsd_mutex));
378 388
@@ -417,6 +427,9 @@ int nfsd_set_nrthreads(int n, int *nthreads)
417 if (err) 427 if (err)
418 break; 428 break;
419 } 429 }
430
431 if (nfsd_serv->sv_nrthreads == 1)
432 svc_shutdown_net(nfsd_serv, net);
420 svc_destroy(nfsd_serv); 433 svc_destroy(nfsd_serv);
421 434
422 return err; 435 return err;
@@ -432,6 +445,7 @@ nfsd_svc(unsigned short port, int nrservs)
432{ 445{
433 int error; 446 int error;
434 bool nfsd_up_before; 447 bool nfsd_up_before;
448 struct net *net = &init_net;
435 449
436 mutex_lock(&nfsd_mutex); 450 mutex_lock(&nfsd_mutex);
437 dprintk("nfsd: creating service\n"); 451 dprintk("nfsd: creating service\n");
@@ -464,6 +478,8 @@ out_shutdown:
464 if (error < 0 && !nfsd_up_before) 478 if (error < 0 && !nfsd_up_before)
465 nfsd_shutdown(); 479 nfsd_shutdown();
466out_destroy: 480out_destroy:
481 if (nfsd_serv->sv_nrthreads == 1)
482 svc_shutdown_net(nfsd_serv, net);
467 svc_destroy(nfsd_serv); /* Release server */ 483 svc_destroy(nfsd_serv); /* Release server */
468out: 484out:
469 mutex_unlock(&nfsd_mutex); 485 mutex_unlock(&nfsd_mutex);
@@ -547,6 +563,9 @@ nfsd(void *vrqstp)
547 nfsdstats.th_cnt --; 563 nfsdstats.th_cnt --;
548 564
549out: 565out:
566 if (rqstp->rq_server->sv_nrthreads == 1)
567 svc_shutdown_net(rqstp->rq_server, &init_net);
568
550 /* Release the thread */ 569 /* Release the thread */
551 svc_exit_thread(rqstp); 570 svc_exit_thread(rqstp);
552 571
@@ -659,8 +678,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
659int nfsd_pool_stats_release(struct inode *inode, struct file *file) 678int nfsd_pool_stats_release(struct inode *inode, struct file *file)
660{ 679{
661 int ret = seq_release(inode, file); 680 int ret = seq_release(inode, file);
681 struct net *net = &init_net;
682
662 mutex_lock(&nfsd_mutex); 683 mutex_lock(&nfsd_mutex);
663 /* this function really, really should have been called svc_put() */ 684 /* this function really, really should have been called svc_put() */
685 if (nfsd_serv->sv_nrthreads == 1)
686 svc_shutdown_net(nfsd_serv, net);
664 svc_destroy(nfsd_serv); 687 svc_destroy(nfsd_serv);
665 mutex_unlock(&nfsd_mutex); 688 mutex_unlock(&nfsd_mutex);
666 return ret; 689 return ret;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 89ab137d379a..849091e16ea6 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -232,7 +232,6 @@ struct nfs4_client {
232 time_t cl_time; /* time of last lease renewal */ 232 time_t cl_time; /* time of last lease renewal */
233 struct sockaddr_storage cl_addr; /* client ipaddress */ 233 struct sockaddr_storage cl_addr; /* client ipaddress */
234 u32 cl_flavor; /* setclientid pseudoflavor */ 234 u32 cl_flavor; /* setclientid pseudoflavor */
235 char *cl_principal; /* setclientid principal name */
236 struct svc_cred cl_cred; /* setclientid principal */ 235 struct svc_cred cl_cred; /* setclientid principal */
237 clientid_t cl_clientid; /* generated by server */ 236 clientid_t cl_clientid; /* generated by server */
238 nfs4_verifier cl_confirm; /* generated by server */ 237 nfs4_verifier cl_confirm; /* generated by server */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 1b3501598ab5..acd127d4ee82 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -60,7 +60,7 @@ struct nfsd4_compound_state {
60 __be32 *datap; 60 __be32 *datap;
61 size_t iovlen; 61 size_t iovlen;
62 u32 minorversion; 62 u32 minorversion;
63 u32 status; 63 __be32 status;
64 stateid_t current_stateid; 64 stateid_t current_stateid;
65 stateid_t save_stateid; 65 stateid_t save_stateid;
66 /* to indicate current and saved state id presents */ 66 /* to indicate current and saved state id presents */
@@ -364,7 +364,7 @@ struct nfsd4_test_stateid_id {
364}; 364};
365 365
366struct nfsd4_test_stateid { 366struct nfsd4_test_stateid {
367 __be32 ts_num_ids; 367 u32 ts_num_ids;
368 struct list_head ts_stateid_list; 368 struct list_head ts_stateid_list;
369}; 369};
370 370
@@ -549,7 +549,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
549 struct nfsd4_compoundargs *); 549 struct nfsd4_compoundargs *);
550int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, 550int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
551 struct nfsd4_compoundres *); 551 struct nfsd4_compoundres *);
552int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); 552__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
553void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); 553void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
554void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); 554void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
555__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, 555__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 0bb2c2010b95..b72847988b78 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -508,31 +508,29 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
508 return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen); 508 return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
509} 509}
510 510
511static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp, 511static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
512 int connectable) 512 struct inode *parent)
513{ 513{
514 struct nilfs_fid *fid = (struct nilfs_fid *)fh; 514 struct nilfs_fid *fid = (struct nilfs_fid *)fh;
515 struct inode *inode = dentry->d_inode;
516 struct nilfs_root *root = NILFS_I(inode)->i_root; 515 struct nilfs_root *root = NILFS_I(inode)->i_root;
517 int type; 516 int type;
518 517
519 if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE || 518 if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) {
520 (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE)) 519 *lenp = NILFS_FID_SIZE_CONNECTABLE;
520 return 255;
521 }
522 if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) {
523 *lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
521 return 255; 524 return 255;
525 }
522 526
523 fid->cno = root->cno; 527 fid->cno = root->cno;
524 fid->ino = inode->i_ino; 528 fid->ino = inode->i_ino;
525 fid->gen = inode->i_generation; 529 fid->gen = inode->i_generation;
526 530
527 if (connectable && !S_ISDIR(inode->i_mode)) { 531 if (parent) {
528 struct inode *parent;
529
530 spin_lock(&dentry->d_lock);
531 parent = dentry->d_parent->d_inode;
532 fid->parent_ino = parent->i_ino; 532 fid->parent_ino = parent->i_ino;
533 fid->parent_gen = parent->i_generation; 533 fid->parent_gen = parent->i_generation;
534 spin_unlock(&dentry->d_lock);
535
536 type = FILEID_NILFS_WITH_PARENT; 534 type = FILEID_NILFS_WITH_PARENT;
537 *lenp = NILFS_FID_SIZE_CONNECTABLE; 535 *lenp = NILFS_FID_SIZE_CONNECTABLE;
538 } else { 536 } else {
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ccb14d3fc0de..b39c5c161adb 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -123,7 +123,7 @@ int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
123} 123}
124EXPORT_SYMBOL_GPL(__fsnotify_parent); 124EXPORT_SYMBOL_GPL(__fsnotify_parent);
125 125
126static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, 126static int send_to_group(struct inode *to_tell,
127 struct fsnotify_mark *inode_mark, 127 struct fsnotify_mark *inode_mark,
128 struct fsnotify_mark *vfsmount_mark, 128 struct fsnotify_mark *vfsmount_mark,
129 __u32 mask, void *data, 129 __u32 mask, void *data,
@@ -168,10 +168,10 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
168 vfsmount_test_mask &= ~inode_mark->ignored_mask; 168 vfsmount_test_mask &= ~inode_mark->ignored_mask;
169 } 169 }
170 170
171 pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p" 171 pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
172 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" 172 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
173 " data=%p data_is=%d cookie=%d event=%p\n", 173 " data=%p data_is=%d cookie=%d event=%p\n",
174 __func__, group, to_tell, mnt, mask, inode_mark, 174 __func__, group, to_tell, mask, inode_mark,
175 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, 175 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
176 data_is, cookie, *event); 176 data_is, cookie, *event);
177 177
@@ -258,16 +258,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
258 258
259 if (inode_group > vfsmount_group) { 259 if (inode_group > vfsmount_group) {
260 /* handle inode */ 260 /* handle inode */
261 ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, 261 ret = send_to_group(to_tell, inode_mark, NULL, mask, data,
262 data_is, cookie, file_name, &event); 262 data_is, cookie, file_name, &event);
263 /* we didn't use the vfsmount_mark */ 263 /* we didn't use the vfsmount_mark */
264 vfsmount_group = NULL; 264 vfsmount_group = NULL;
265 } else if (vfsmount_group > inode_group) { 265 } else if (vfsmount_group > inode_group) {
266 ret = send_to_group(to_tell, &mnt->mnt, NULL, vfsmount_mark, mask, data, 266 ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data,
267 data_is, cookie, file_name, &event); 267 data_is, cookie, file_name, &event);
268 inode_group = NULL; 268 inode_group = NULL;
269 } else { 269 } else {
270 ret = send_to_group(to_tell, &mnt->mnt, inode_mark, vfsmount_mark, 270 ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
271 mask, data, data_is, cookie, file_name, 271 mask, data, data_is, cookie, file_name,
272 &event); 272 &event);
273 } 273 }
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8639169221c7..7389d2d5e51d 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2096,7 +2096,9 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2096 err = file_remove_suid(file); 2096 err = file_remove_suid(file);
2097 if (err) 2097 if (err)
2098 goto out; 2098 goto out;
2099 file_update_time(file); 2099 err = file_update_time(file);
2100 if (err)
2101 goto out;
2100 written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos, 2102 written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
2101 count); 2103 count);
2102out: 2104out:
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index c7ee03c22226..0725e6054650 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -422,45 +422,46 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
422 struct ocfs2_blockcheck_stats *stats) 422 struct ocfs2_blockcheck_stats *stats)
423{ 423{
424 int rc = 0; 424 int rc = 0;
425 struct ocfs2_block_check check; 425 u32 bc_crc32e;
426 u16 bc_ecc;
426 u32 crc, ecc; 427 u32 crc, ecc;
427 428
428 ocfs2_blockcheck_inc_check(stats); 429 ocfs2_blockcheck_inc_check(stats);
429 430
430 check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); 431 bc_crc32e = le32_to_cpu(bc->bc_crc32e);
431 check.bc_ecc = le16_to_cpu(bc->bc_ecc); 432 bc_ecc = le16_to_cpu(bc->bc_ecc);
432 433
433 memset(bc, 0, sizeof(struct ocfs2_block_check)); 434 memset(bc, 0, sizeof(struct ocfs2_block_check));
434 435
435 /* Fast path - if the crc32 validates, we're good to go */ 436 /* Fast path - if the crc32 validates, we're good to go */
436 crc = crc32_le(~0, data, blocksize); 437 crc = crc32_le(~0, data, blocksize);
437 if (crc == check.bc_crc32e) 438 if (crc == bc_crc32e)
438 goto out; 439 goto out;
439 440
440 ocfs2_blockcheck_inc_failure(stats); 441 ocfs2_blockcheck_inc_failure(stats);
441 mlog(ML_ERROR, 442 mlog(ML_ERROR,
442 "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", 443 "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
443 (unsigned int)check.bc_crc32e, (unsigned int)crc); 444 (unsigned int)bc_crc32e, (unsigned int)crc);
444 445
445 /* Ok, try ECC fixups */ 446 /* Ok, try ECC fixups */
446 ecc = ocfs2_hamming_encode_block(data, blocksize); 447 ecc = ocfs2_hamming_encode_block(data, blocksize);
447 ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc); 448 ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc);
448 449
449 /* And check the crc32 again */ 450 /* And check the crc32 again */
450 crc = crc32_le(~0, data, blocksize); 451 crc = crc32_le(~0, data, blocksize);
451 if (crc == check.bc_crc32e) { 452 if (crc == bc_crc32e) {
452 ocfs2_blockcheck_inc_recover(stats); 453 ocfs2_blockcheck_inc_recover(stats);
453 goto out; 454 goto out;
454 } 455 }
455 456
456 mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", 457 mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
457 (unsigned int)check.bc_crc32e, (unsigned int)crc); 458 (unsigned int)bc_crc32e, (unsigned int)crc);
458 459
459 rc = -EIO; 460 rc = -EIO;
460 461
461out: 462out:
462 bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); 463 bc->bc_crc32e = cpu_to_le32(bc_crc32e);
463 bc->bc_ecc = cpu_to_le16(check.bc_ecc); 464 bc->bc_ecc = cpu_to_le16(bc_ecc);
464 465
465 return rc; 466 return rc;
466} 467}
@@ -528,7 +529,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
528 struct ocfs2_blockcheck_stats *stats) 529 struct ocfs2_blockcheck_stats *stats)
529{ 530{
530 int i, rc = 0; 531 int i, rc = 0;
531 struct ocfs2_block_check check; 532 u32 bc_crc32e;
533 u16 bc_ecc;
532 u32 crc, ecc, fix; 534 u32 crc, ecc, fix;
533 535
534 BUG_ON(nr < 0); 536 BUG_ON(nr < 0);
@@ -538,21 +540,21 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
538 540
539 ocfs2_blockcheck_inc_check(stats); 541 ocfs2_blockcheck_inc_check(stats);
540 542
541 check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); 543 bc_crc32e = le32_to_cpu(bc->bc_crc32e);
542 check.bc_ecc = le16_to_cpu(bc->bc_ecc); 544 bc_ecc = le16_to_cpu(bc->bc_ecc);
543 545
544 memset(bc, 0, sizeof(struct ocfs2_block_check)); 546 memset(bc, 0, sizeof(struct ocfs2_block_check));
545 547
546 /* Fast path - if the crc32 validates, we're good to go */ 548 /* Fast path - if the crc32 validates, we're good to go */
547 for (i = 0, crc = ~0; i < nr; i++) 549 for (i = 0, crc = ~0; i < nr; i++)
548 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); 550 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
549 if (crc == check.bc_crc32e) 551 if (crc == bc_crc32e)
550 goto out; 552 goto out;
551 553
552 ocfs2_blockcheck_inc_failure(stats); 554 ocfs2_blockcheck_inc_failure(stats);
553 mlog(ML_ERROR, 555 mlog(ML_ERROR,
554 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", 556 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
555 (unsigned int)check.bc_crc32e, (unsigned int)crc); 557 (unsigned int)bc_crc32e, (unsigned int)crc);
556 558
557 /* Ok, try ECC fixups */ 559 /* Ok, try ECC fixups */
558 for (i = 0, ecc = 0; i < nr; i++) { 560 for (i = 0, ecc = 0; i < nr; i++) {
@@ -565,7 +567,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
565 bhs[i]->b_size * 8, 567 bhs[i]->b_size * 8,
566 bhs[i]->b_size * 8 * i); 568 bhs[i]->b_size * 8 * i);
567 } 569 }
568 fix = ecc ^ check.bc_ecc; 570 fix = ecc ^ bc_ecc;
569 for (i = 0; i < nr; i++) { 571 for (i = 0; i < nr; i++) {
570 /* 572 /*
571 * Try the fix against each buffer. It will only affect 573 * Try the fix against each buffer. It will only affect
@@ -578,19 +580,19 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
578 /* And check the crc32 again */ 580 /* And check the crc32 again */
579 for (i = 0, crc = ~0; i < nr; i++) 581 for (i = 0, crc = ~0; i < nr; i++)
580 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); 582 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
581 if (crc == check.bc_crc32e) { 583 if (crc == bc_crc32e) {
582 ocfs2_blockcheck_inc_recover(stats); 584 ocfs2_blockcheck_inc_recover(stats);
583 goto out; 585 goto out;
584 } 586 }
585 587
586 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", 588 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
587 (unsigned int)check.bc_crc32e, (unsigned int)crc); 589 (unsigned int)bc_crc32e, (unsigned int)crc);
588 590
589 rc = -EIO; 591 rc = -EIO;
590 592
591out: 593out:
592 bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); 594 bc->bc_crc32e = cpu_to_le32(bc_crc32e);
593 bc->bc_ecc = cpu_to_le16(check.bc_ecc); 595 bc->bc_ecc = cpu_to_le16(bc_ecc);
594 596
595 return rc; 597 return rc;
596} 598}
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 3a3ed4bb794b..fbec0be62326 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -293,7 +293,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
293 struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; 293 struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
294 char *name; 294 char *name;
295 struct list_head *iter, *head=NULL; 295 struct list_head *iter, *head=NULL;
296 u64 cookie; 296 __be64 cookie;
297 u32 flags; 297 u32 flags;
298 u8 node; 298 u8 node;
299 299
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index a5952ceecba5..de854cca12a2 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -679,7 +679,7 @@ struct dlm_query_join_packet {
679}; 679};
680 680
681union dlm_query_join_response { 681union dlm_query_join_response {
682 u32 intval; 682 __be32 intval;
683 struct dlm_query_join_packet packet; 683 struct dlm_query_join_packet packet;
684}; 684};
685 685
@@ -755,8 +755,8 @@ struct dlm_query_region {
755struct dlm_node_info { 755struct dlm_node_info {
756 u8 ni_nodenum; 756 u8 ni_nodenum;
757 u8 pad1; 757 u8 pad1;
758 u16 ni_ipv4_port; 758 __be16 ni_ipv4_port;
759 u32 ni_ipv4_address; 759 __be32 ni_ipv4_address;
760}; 760};
761 761
762struct dlm_query_nodeinfo { 762struct dlm_query_nodeinfo {
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 92f2ead0fab6..9e89d70df337 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -818,7 +818,7 @@ static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet,
818 union dlm_query_join_response response; 818 union dlm_query_join_response response;
819 819
820 response.packet = *packet; 820 response.packet = *packet;
821 *wire = cpu_to_be32(response.intval); 821 *wire = be32_to_cpu(response.intval);
822} 822}
823 823
824static void dlm_query_join_wire_to_packet(u32 wire, 824static void dlm_query_join_wire_to_packet(u32 wire,
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 745db42528d5..322216a5f0dd 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -177,21 +177,23 @@ bail:
177 return parent; 177 return parent;
178} 178}
179 179
180static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, 180static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
181 int connectable) 181 struct inode *parent)
182{ 182{
183 struct inode *inode = dentry->d_inode;
184 int len = *max_len; 183 int len = *max_len;
185 int type = 1; 184 int type = 1;
186 u64 blkno; 185 u64 blkno;
187 u32 generation; 186 u32 generation;
188 __le32 *fh = (__force __le32 *) fh_in; 187 __le32 *fh = (__force __le32 *) fh_in;
189 188
189#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION
190#error "You go ahead and fix that mess, then. Somehow"
190 trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len, 191 trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
191 dentry->d_name.name, 192 dentry->d_name.name,
192 fh, len, connectable); 193 fh, len, connectable);
194#endif
193 195
194 if (connectable && (len < 6)) { 196 if (parent && (len < 6)) {
195 *max_len = 6; 197 *max_len = 6;
196 type = 255; 198 type = 255;
197 goto bail; 199 goto bail;
@@ -211,12 +213,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
211 fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff)); 213 fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
212 fh[2] = cpu_to_le32(generation); 214 fh[2] = cpu_to_le32(generation);
213 215
214 if (connectable && !S_ISDIR(inode->i_mode)) { 216 if (parent) {
215 struct inode *parent;
216
217 spin_lock(&dentry->d_lock);
218
219 parent = dentry->d_parent->d_inode;
220 blkno = OCFS2_I(parent)->ip_blkno; 217 blkno = OCFS2_I(parent)->ip_blkno;
221 generation = parent->i_generation; 218 generation = parent->i_generation;
222 219
@@ -224,8 +221,6 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
224 fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff)); 221 fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
225 fh[5] = cpu_to_le32(generation); 222 fh[5] = cpu_to_le32(generation);
226 223
227 spin_unlock(&dentry->d_lock);
228
229 len = 6; 224 len = 6;
230 type = 2; 225 type = 2;
231 226
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 735514ca400f..d89e08a81eda 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -273,11 +273,13 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
273 inode->i_gid = le32_to_cpu(fe->i_gid); 273 inode->i_gid = le32_to_cpu(fe->i_gid);
274 274
275 /* Fast symlinks will have i_size but no allocated clusters. */ 275 /* Fast symlinks will have i_size but no allocated clusters. */
276 if (S_ISLNK(inode->i_mode) && !fe->i_clusters) 276 if (S_ISLNK(inode->i_mode) && !fe->i_clusters) {
277 inode->i_blocks = 0; 277 inode->i_blocks = 0;
278 else 278 inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
279 } else {
279 inode->i_blocks = ocfs2_inode_sector_count(inode); 280 inode->i_blocks = ocfs2_inode_sector_count(inode);
280 inode->i_mapping->a_ops = &ocfs2_aops; 281 inode->i_mapping->a_ops = &ocfs2_aops;
282 }
281 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 283 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
282 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 284 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
283 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); 285 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
@@ -331,10 +333,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
331 OCFS2_I(inode)->ip_dir_lock_gen = 1; 333 OCFS2_I(inode)->ip_dir_lock_gen = 1;
332 break; 334 break;
333 case S_IFLNK: 335 case S_IFLNK:
334 if (ocfs2_inode_is_fast_symlink(inode)) 336 inode->i_op = &ocfs2_symlink_inode_operations;
335 inode->i_op = &ocfs2_fast_symlink_inode_operations;
336 else
337 inode->i_op = &ocfs2_symlink_inode_operations;
338 i_size_write(inode, le64_to_cpu(fe->i_size)); 337 i_size_write(inode, le64_to_cpu(fe->i_size));
339 break; 338 break;
340 default: 339 default:
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index a1a1bfd652c9..d96f7f81d8dd 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -864,7 +864,7 @@ int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
864 if (status) 864 if (status)
865 break; 865 break;
866 866
867 reqp = (struct ocfs2_info_request *)(unsigned long)req_addr; 867 reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
868 if (!reqp) { 868 if (!reqp) {
869 status = -EINVAL; 869 status = -EINVAL;
870 goto bail; 870 goto bail;
@@ -888,9 +888,11 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
888 struct ocfs2_space_resv sr; 888 struct ocfs2_space_resv sr;
889 struct ocfs2_new_group_input input; 889 struct ocfs2_new_group_input input;
890 struct reflink_arguments args; 890 struct reflink_arguments args;
891 const char *old_path, *new_path; 891 const char __user *old_path;
892 const char __user *new_path;
892 bool preserve; 893 bool preserve;
893 struct ocfs2_info info; 894 struct ocfs2_info info;
895 void __user *argp = (void __user *)arg;
894 896
895 switch (cmd) { 897 switch (cmd) {
896 case OCFS2_IOC_GETFLAGS: 898 case OCFS2_IOC_GETFLAGS:
@@ -937,17 +939,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
937 939
938 return ocfs2_group_add(inode, &input); 940 return ocfs2_group_add(inode, &input);
939 case OCFS2_IOC_REFLINK: 941 case OCFS2_IOC_REFLINK:
940 if (copy_from_user(&args, (struct reflink_arguments *)arg, 942 if (copy_from_user(&args, argp, sizeof(args)))
941 sizeof(args)))
942 return -EFAULT; 943 return -EFAULT;
943 old_path = (const char *)(unsigned long)args.old_path; 944 old_path = (const char __user *)(unsigned long)args.old_path;
944 new_path = (const char *)(unsigned long)args.new_path; 945 new_path = (const char __user *)(unsigned long)args.new_path;
945 preserve = (args.preserve != 0); 946 preserve = (args.preserve != 0);
946 947
947 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); 948 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
948 case OCFS2_IOC_INFO: 949 case OCFS2_IOC_INFO:
949 if (copy_from_user(&info, (struct ocfs2_info __user *)arg, 950 if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
950 sizeof(struct ocfs2_info)))
951 return -EFAULT; 951 return -EFAULT;
952 952
953 return ocfs2_info_handle(inode, &info, 0); 953 return ocfs2_info_handle(inode, &info, 0);
@@ -960,22 +960,20 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
960 if (!capable(CAP_SYS_ADMIN)) 960 if (!capable(CAP_SYS_ADMIN))
961 return -EPERM; 961 return -EPERM;
962 962
963 if (copy_from_user(&range, (struct fstrim_range *)arg, 963 if (copy_from_user(&range, argp, sizeof(range)))
964 sizeof(range)))
965 return -EFAULT; 964 return -EFAULT;
966 965
967 ret = ocfs2_trim_fs(sb, &range); 966 ret = ocfs2_trim_fs(sb, &range);
968 if (ret < 0) 967 if (ret < 0)
969 return ret; 968 return ret;
970 969
971 if (copy_to_user((struct fstrim_range *)arg, &range, 970 if (copy_to_user(argp, &range, sizeof(range)))
972 sizeof(range)))
973 return -EFAULT; 971 return -EFAULT;
974 972
975 return 0; 973 return 0;
976 } 974 }
977 case OCFS2_IOC_MOVE_EXT: 975 case OCFS2_IOC_MOVE_EXT:
978 return ocfs2_ioctl_move_extents(filp, (void __user *)arg); 976 return ocfs2_ioctl_move_extents(filp, argp);
979 default: 977 default:
980 return -ENOTTY; 978 return -ENOTTY;
981 } 979 }
@@ -988,6 +986,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
988 struct reflink_arguments args; 986 struct reflink_arguments args;
989 struct inode *inode = file->f_path.dentry->d_inode; 987 struct inode *inode = file->f_path.dentry->d_inode;
990 struct ocfs2_info info; 988 struct ocfs2_info info;
989 void __user *argp = (void __user *)arg;
991 990
992 switch (cmd) { 991 switch (cmd) {
993 case OCFS2_IOC32_GETFLAGS: 992 case OCFS2_IOC32_GETFLAGS:
@@ -1006,16 +1005,14 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1006 case FITRIM: 1005 case FITRIM:
1007 break; 1006 break;
1008 case OCFS2_IOC_REFLINK: 1007 case OCFS2_IOC_REFLINK:
1009 if (copy_from_user(&args, (struct reflink_arguments *)arg, 1008 if (copy_from_user(&args, argp, sizeof(args)))
1010 sizeof(args)))
1011 return -EFAULT; 1009 return -EFAULT;
1012 preserve = (args.preserve != 0); 1010 preserve = (args.preserve != 0);
1013 1011
1014 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), 1012 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
1015 compat_ptr(args.new_path), preserve); 1013 compat_ptr(args.new_path), preserve);
1016 case OCFS2_IOC_INFO: 1014 case OCFS2_IOC_INFO:
1017 if (copy_from_user(&info, (struct ocfs2_info __user *)arg, 1015 if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
1018 sizeof(struct ocfs2_info)))
1019 return -EFAULT; 1016 return -EFAULT;
1020 1017
1021 return ocfs2_info_handle(inode, &info, 1); 1018 return ocfs2_info_handle(inode, &info, 1);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index b1e3fce72ea4..6083432f667e 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1082,8 +1082,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1082 context->file = filp; 1082 context->file = filp;
1083 1083
1084 if (argp) { 1084 if (argp) {
1085 if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, 1085 if (copy_from_user(&range, argp, sizeof(range))) {
1086 sizeof(range))) {
1087 status = -EFAULT; 1086 status = -EFAULT;
1088 goto out; 1087 goto out;
1089 } 1088 }
@@ -1138,8 +1137,7 @@ out:
1138 * length and new_offset even if failure happens somewhere. 1137 * length and new_offset even if failure happens somewhere.
1139 */ 1138 */
1140 if (argp) { 1139 if (argp) {
1141 if (copy_to_user((struct ocfs2_move_extents *)argp, &range, 1140 if (copy_to_user(argp, &range, sizeof(range)))
1142 sizeof(range)))
1143 status = -EFAULT; 1141 status = -EFAULT;
1144 } 1142 }
1145 1143
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a9856e3eaaf0..9f39c640cddf 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1724,15 +1724,16 @@ static int ocfs2_symlink(struct inode *dir,
1724 fe = (struct ocfs2_dinode *) new_fe_bh->b_data; 1724 fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
1725 inode->i_rdev = 0; 1725 inode->i_rdev = 0;
1726 newsize = l - 1; 1726 newsize = l - 1;
1727 inode->i_op = &ocfs2_symlink_inode_operations;
1727 if (l > ocfs2_fast_symlink_chars(sb)) { 1728 if (l > ocfs2_fast_symlink_chars(sb)) {
1728 u32 offset = 0; 1729 u32 offset = 0;
1729 1730
1730 inode->i_op = &ocfs2_symlink_inode_operations;
1731 status = dquot_alloc_space_nodirty(inode, 1731 status = dquot_alloc_space_nodirty(inode,
1732 ocfs2_clusters_to_bytes(osb->sb, 1)); 1732 ocfs2_clusters_to_bytes(osb->sb, 1));
1733 if (status) 1733 if (status)
1734 goto bail; 1734 goto bail;
1735 did_quota = 1; 1735 did_quota = 1;
1736 inode->i_mapping->a_ops = &ocfs2_aops;
1736 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, 1737 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1737 new_fe_bh, 1738 new_fe_bh,
1738 handle, data_ac, NULL, 1739 handle, data_ac, NULL,
@@ -1750,7 +1751,7 @@ static int ocfs2_symlink(struct inode *dir,
1750 i_size_write(inode, newsize); 1751 i_size_write(inode, newsize);
1751 inode->i_blocks = ocfs2_inode_sector_count(inode); 1752 inode->i_blocks = ocfs2_inode_sector_count(inode);
1752 } else { 1753 } else {
1753 inode->i_op = &ocfs2_fast_symlink_inode_operations; 1754 inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
1754 memcpy((char *) fe->id2.i_symlink, symname, l); 1755 memcpy((char *) fe->id2.i_symlink, symname, l);
1755 i_size_write(inode, newsize); 1756 i_size_write(inode, newsize);
1756 inode->i_blocks = 0; 1757 inode->i_blocks = 0;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 5d22872e2bb3..f1fbb4b552ad 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -54,101 +54,40 @@
54#include "buffer_head_io.h" 54#include "buffer_head_io.h"
55 55
56 56
57static char *ocfs2_fast_symlink_getlink(struct inode *inode, 57static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
58 struct buffer_head **bh)
59{ 58{
60 int status; 59 struct inode *inode = page->mapping->host;
61 char *link = NULL; 60 struct buffer_head *bh;
61 int status = ocfs2_read_inode_block(inode, &bh);
62 struct ocfs2_dinode *fe; 62 struct ocfs2_dinode *fe;
63 const char *link;
64 void *kaddr;
65 size_t len;
63 66
64 status = ocfs2_read_inode_block(inode, bh);
65 if (status < 0) { 67 if (status < 0) {
66 mlog_errno(status); 68 mlog_errno(status);
67 link = ERR_PTR(status); 69 return status;
68 goto bail;
69 } 70 }
70 71
71 fe = (struct ocfs2_dinode *) (*bh)->b_data; 72 fe = (struct ocfs2_dinode *) bh->b_data;
72 link = (char *) fe->id2.i_symlink; 73 link = (char *) fe->id2.i_symlink;
73bail: 74 /* will be less than a page size */
74 75 len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb));
75 return link; 76 kaddr = kmap_atomic(page);
76} 77 memcpy(kaddr, link, len + 1);
77 78 kunmap_atomic(kaddr);
78static int ocfs2_readlink(struct dentry *dentry, 79 SetPageUptodate(page);
79 char __user *buffer, 80 unlock_page(page);
80 int buflen)
81{
82 int ret;
83 char *link;
84 struct buffer_head *bh = NULL;
85 struct inode *inode = dentry->d_inode;
86
87 link = ocfs2_fast_symlink_getlink(inode, &bh);
88 if (IS_ERR(link)) {
89 ret = PTR_ERR(link);
90 goto out;
91 }
92
93 /*
94 * Without vfsmount we can't update atime now,
95 * but we will update atime here ultimately.
96 */
97 ret = vfs_readlink(dentry, buffer, buflen, link);
98
99 brelse(bh); 81 brelse(bh);
100out: 82 return 0;
101 if (ret < 0)
102 mlog_errno(ret);
103 return ret;
104} 83}
105 84
106static void *ocfs2_fast_follow_link(struct dentry *dentry, 85const struct address_space_operations ocfs2_fast_symlink_aops = {
107 struct nameidata *nd) 86 .readpage = ocfs2_fast_symlink_readpage,
108{ 87};
109 int status = 0;
110 int len;
111 char *target, *link = ERR_PTR(-ENOMEM);
112 struct inode *inode = dentry->d_inode;
113 struct buffer_head *bh = NULL;
114
115 BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
116 target = ocfs2_fast_symlink_getlink(inode, &bh);
117 if (IS_ERR(target)) {
118 status = PTR_ERR(target);
119 mlog_errno(status);
120 goto bail;
121 }
122
123 /* Fast symlinks can't be large */
124 len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
125 link = kzalloc(len + 1, GFP_NOFS);
126 if (!link) {
127 status = -ENOMEM;
128 mlog_errno(status);
129 goto bail;
130 }
131
132 memcpy(link, target, len);
133
134bail:
135 nd_set_link(nd, status ? ERR_PTR(status) : link);
136 brelse(bh);
137
138 if (status)
139 mlog_errno(status);
140 return NULL;
141}
142
143static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
144{
145 char *link = nd_get_link(nd);
146 if (!IS_ERR(link))
147 kfree(link);
148}
149 88
150const struct inode_operations ocfs2_symlink_inode_operations = { 89const struct inode_operations ocfs2_symlink_inode_operations = {
151 .readlink = page_readlink, 90 .readlink = generic_readlink,
152 .follow_link = page_follow_link_light, 91 .follow_link = page_follow_link_light,
153 .put_link = page_put_link, 92 .put_link = page_put_link,
154 .getattr = ocfs2_getattr, 93 .getattr = ocfs2_getattr,
@@ -159,15 +98,3 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
159 .removexattr = generic_removexattr, 98 .removexattr = generic_removexattr,
160 .fiemap = ocfs2_fiemap, 99 .fiemap = ocfs2_fiemap,
161}; 100};
162const struct inode_operations ocfs2_fast_symlink_inode_operations = {
163 .readlink = ocfs2_readlink,
164 .follow_link = ocfs2_fast_follow_link,
165 .put_link = ocfs2_fast_put_link,
166 .getattr = ocfs2_getattr,
167 .setattr = ocfs2_setattr,
168 .setxattr = generic_setxattr,
169 .getxattr = generic_getxattr,
170 .listxattr = ocfs2_listxattr,
171 .removexattr = generic_removexattr,
172 .fiemap = ocfs2_fiemap,
173};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
index 65a6c9c6ad51..71ee4245e919 100644
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
@@ -27,7 +27,7 @@
27#define OCFS2_SYMLINK_H 27#define OCFS2_SYMLINK_H
28 28
29extern const struct inode_operations ocfs2_symlink_inode_operations; 29extern const struct inode_operations ocfs2_symlink_inode_operations;
30extern const struct inode_operations ocfs2_fast_symlink_inode_operations; 30extern const struct address_space_operations ocfs2_fast_symlink_aops;
31 31
32/* 32/*
33 * Test whether an inode is a fast symlink. 33 * Test whether an inode is a fast symlink.
diff --git a/fs/open.c b/fs/open.c
index d54301219d04..d6c79a0dffc7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -654,10 +654,23 @@ static inline int __get_file_write_access(struct inode *inode,
654 return error; 654 return error;
655} 655}
656 656
657static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, 657int open_check_o_direct(struct file *f)
658 struct file *f, 658{
659 int (*open)(struct inode *, struct file *), 659 /* NB: we're sure to have correct a_ops only after f_op->open */
660 const struct cred *cred) 660 if (f->f_flags & O_DIRECT) {
661 if (!f->f_mapping->a_ops ||
662 ((!f->f_mapping->a_ops->direct_IO) &&
663 (!f->f_mapping->a_ops->get_xip_mem))) {
664 return -EINVAL;
665 }
666 }
667 return 0;
668}
669
670static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt,
671 struct file *f,
672 int (*open)(struct inode *, struct file *),
673 const struct cred *cred)
661{ 674{
662 static const struct file_operations empty_fops = {}; 675 static const struct file_operations empty_fops = {};
663 struct inode *inode; 676 struct inode *inode;
@@ -713,16 +726,6 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
713 726
714 file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); 727 file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
715 728
716 /* NB: we're sure to have correct a_ops only after f_op->open */
717 if (f->f_flags & O_DIRECT) {
718 if (!f->f_mapping->a_ops ||
719 ((!f->f_mapping->a_ops->direct_IO) &&
720 (!f->f_mapping->a_ops->get_xip_mem))) {
721 fput(f);
722 f = ERR_PTR(-EINVAL);
723 }
724 }
725
726 return f; 729 return f;
727 730
728cleanup_all: 731cleanup_all:
@@ -744,12 +747,29 @@ cleanup_all:
744 f->f_path.dentry = NULL; 747 f->f_path.dentry = NULL;
745 f->f_path.mnt = NULL; 748 f->f_path.mnt = NULL;
746cleanup_file: 749cleanup_file:
747 put_filp(f);
748 dput(dentry); 750 dput(dentry);
749 mntput(mnt); 751 mntput(mnt);
750 return ERR_PTR(error); 752 return ERR_PTR(error);
751} 753}
752 754
755static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
756 struct file *f,
757 int (*open)(struct inode *, struct file *),
758 const struct cred *cred)
759{
760 struct file *res = do_dentry_open(dentry, mnt, f, open, cred);
761 if (!IS_ERR(res)) {
762 int error = open_check_o_direct(f);
763 if (error) {
764 fput(res);
765 res = ERR_PTR(error);
766 }
767 } else {
768 put_filp(f);
769 }
770 return res;
771}
772
753/** 773/**
754 * lookup_instantiate_filp - instantiates the open intent filp 774 * lookup_instantiate_filp - instantiates the open intent filp
755 * @nd: pointer to nameidata 775 * @nd: pointer to nameidata
@@ -804,13 +824,31 @@ struct file *nameidata_to_filp(struct nameidata *nd)
804 824
805 /* Pick up the filp from the open intent */ 825 /* Pick up the filp from the open intent */
806 filp = nd->intent.open.file; 826 filp = nd->intent.open.file;
807 nd->intent.open.file = NULL;
808 827
809 /* Has the filesystem initialised the file for us? */ 828 /* Has the filesystem initialised the file for us? */
810 if (filp->f_path.dentry == NULL) { 829 if (filp->f_path.dentry != NULL) {
830 nd->intent.open.file = NULL;
831 } else {
832 struct file *res;
833
811 path_get(&nd->path); 834 path_get(&nd->path);
812 filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, 835 res = do_dentry_open(nd->path.dentry, nd->path.mnt,
813 NULL, cred); 836 filp, NULL, cred);
837 if (!IS_ERR(res)) {
838 int error;
839
840 nd->intent.open.file = NULL;
841 BUG_ON(res != filp);
842
843 error = open_check_o_direct(filp);
844 if (error) {
845 fput(filp);
846 filp = ERR_PTR(error);
847 }
848 } else {
849 /* Allow nd->intent.open.file to be recycled */
850 filp = res;
851 }
814 } 852 }
815 return filp; 853 return filp;
816} 854}
diff --git a/fs/pipe.c b/fs/pipe.c
index 95ebb56de494..49c1065256fd 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -654,8 +654,11 @@ out:
654 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); 654 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
655 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 655 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
656 } 656 }
657 if (ret > 0) 657 if (ret > 0) {
658 file_update_time(filp); 658 int err = file_update_time(filp);
659 if (err)
660 ret = err;
661 }
659 return ret; 662 return ret;
660} 663}
661 664
diff --git a/fs/pnode.c b/fs/pnode.c
index ab5fa9e1a79a..bed378db0758 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -257,12 +257,12 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
257 prev_src_mnt = child; 257 prev_src_mnt = child;
258 } 258 }
259out: 259out:
260 br_write_lock(vfsmount_lock); 260 br_write_lock(&vfsmount_lock);
261 while (!list_empty(&tmp_list)) { 261 while (!list_empty(&tmp_list)) {
262 child = list_first_entry(&tmp_list, struct mount, mnt_hash); 262 child = list_first_entry(&tmp_list, struct mount, mnt_hash);
263 umount_tree(child, 0, &umount_list); 263 umount_tree(child, 0, &umount_list);
264 } 264 }
265 br_write_unlock(vfsmount_lock); 265 br_write_unlock(&vfsmount_lock);
266 release_mounts(&umount_list); 266 release_mounts(&umount_list);
267 return ret; 267 return ret;
268} 268}
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 12412852d88a..5e289a7cbad1 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -23,12 +23,12 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
23 23
24 poll_wait(file, &p->ns->poll, wait); 24 poll_wait(file, &p->ns->poll, wait);
25 25
26 br_read_lock(vfsmount_lock); 26 br_read_lock(&vfsmount_lock);
27 if (p->m.poll_event != ns->event) { 27 if (p->m.poll_event != ns->event) {
28 p->m.poll_event = ns->event; 28 p->m.poll_event = ns->event;
29 res |= POLLERR | POLLPRI; 29 res |= POLLERR | POLLPRI;
30 } 30 }
31 br_read_unlock(vfsmount_lock); 31 br_read_unlock(&vfsmount_lock);
32 32
33 return res; 33 return res;
34} 34}
diff --git a/fs/readdir.c b/fs/readdir.c
index cc0a8227cddf..39e3370d79cf 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -108,11 +108,11 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
108 int error; 108 int error;
109 struct file * file; 109 struct file * file;
110 struct readdir_callback buf; 110 struct readdir_callback buf;
111 int fput_needed;
111 112
112 error = -EBADF; 113 file = fget_light(fd, &fput_needed);
113 file = fget(fd);
114 if (!file) 114 if (!file)
115 goto out; 115 return -EBADF;
116 116
117 buf.result = 0; 117 buf.result = 0;
118 buf.dirent = dirent; 118 buf.dirent = dirent;
@@ -121,8 +121,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
121 if (buf.result) 121 if (buf.result)
122 error = buf.result; 122 error = buf.result;
123 123
124 fput(file); 124 fput_light(file, fput_needed);
125out:
126 return error; 125 return error;
127} 126}
128 127
@@ -195,16 +194,15 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
195 struct file * file; 194 struct file * file;
196 struct linux_dirent __user * lastdirent; 195 struct linux_dirent __user * lastdirent;
197 struct getdents_callback buf; 196 struct getdents_callback buf;
197 int fput_needed;
198 int error; 198 int error;
199 199
200 error = -EFAULT;
201 if (!access_ok(VERIFY_WRITE, dirent, count)) 200 if (!access_ok(VERIFY_WRITE, dirent, count))
202 goto out; 201 return -EFAULT;
203 202
204 error = -EBADF; 203 file = fget_light(fd, &fput_needed);
205 file = fget(fd);
206 if (!file) 204 if (!file)
207 goto out; 205 return -EBADF;
208 206
209 buf.current_dir = dirent; 207 buf.current_dir = dirent;
210 buf.previous = NULL; 208 buf.previous = NULL;
@@ -221,8 +219,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
221 else 219 else
222 error = count - buf.count; 220 error = count - buf.count;
223 } 221 }
224 fput(file); 222 fput_light(file, fput_needed);
225out:
226 return error; 223 return error;
227} 224}
228 225
@@ -278,16 +275,15 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
278 struct file * file; 275 struct file * file;
279 struct linux_dirent64 __user * lastdirent; 276 struct linux_dirent64 __user * lastdirent;
280 struct getdents_callback64 buf; 277 struct getdents_callback64 buf;
278 int fput_needed;
281 int error; 279 int error;
282 280
283 error = -EFAULT;
284 if (!access_ok(VERIFY_WRITE, dirent, count)) 281 if (!access_ok(VERIFY_WRITE, dirent, count))
285 goto out; 282 return -EFAULT;
286 283
287 error = -EBADF; 284 file = fget_light(fd, &fput_needed);
288 file = fget(fd);
289 if (!file) 285 if (!file)
290 goto out; 286 return -EBADF;
291 287
292 buf.current_dir = dirent; 288 buf.current_dir = dirent;
293 buf.previous = NULL; 289 buf.previous = NULL;
@@ -305,7 +301,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
305 else 301 else
306 error = count - buf.count; 302 error = count - buf.count;
307 } 303 }
308 fput(file); 304 fput_light(file, fput_needed);
309out:
310 return error; 305 return error;
311} 306}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 59d06871a850..a6d4268fb6c1 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1592,13 +1592,12 @@ struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
1592 (fh_type == 6) ? fid->raw[5] : 0); 1592 (fh_type == 6) ? fid->raw[5] : 0);
1593} 1593}
1594 1594
1595int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, 1595int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
1596 int need_parent) 1596 struct inode *parent)
1597{ 1597{
1598 struct inode *inode = dentry->d_inode;
1599 int maxlen = *lenp; 1598 int maxlen = *lenp;
1600 1599
1601 if (need_parent && (maxlen < 5)) { 1600 if (parent && (maxlen < 5)) {
1602 *lenp = 5; 1601 *lenp = 5;
1603 return 255; 1602 return 255;
1604 } else if (maxlen < 3) { 1603 } else if (maxlen < 3) {
@@ -1610,20 +1609,15 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1610 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1609 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
1611 data[2] = inode->i_generation; 1610 data[2] = inode->i_generation;
1612 *lenp = 3; 1611 *lenp = 3;
1613 /* no room for directory info? return what we've stored so far */ 1612 if (parent) {
1614 if (maxlen < 5 || !need_parent) 1613 data[3] = parent->i_ino;
1615 return 3; 1614 data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
1616 1615 *lenp = 5;
1617 spin_lock(&dentry->d_lock); 1616 if (maxlen >= 6) {
1618 inode = dentry->d_parent->d_inode; 1617 data[5] = parent->i_generation;
1619 data[3] = inode->i_ino; 1618 *lenp = 6;
1620 data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1619 }
1621 *lenp = 5; 1620 }
1622 if (maxlen >= 6) {
1623 data[5] = inode->i_generation;
1624 *lenp = 6;
1625 }
1626 spin_unlock(&dentry->d_lock);
1627 return *lenp; 1621 return *lenp;
1628} 1622}
1629 1623
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index b1a08573fe14..afcadcc03e8a 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1923,6 +1923,8 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1923 * the workqueue job (flush_async_commit) needs this lock 1923 * the workqueue job (flush_async_commit) needs this lock
1924 */ 1924 */
1925 reiserfs_write_unlock(sb); 1925 reiserfs_write_unlock(sb);
1926
1927 cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
1926 flush_workqueue(commit_wq); 1928 flush_workqueue(commit_wq);
1927 1929
1928 if (!reiserfs_mounted_fs_count) { 1930 if (!reiserfs_mounted_fs_count) {
@@ -3231,8 +3233,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3231 th->t_trans_id, journal->j_trans_id); 3233 th->t_trans_id, journal->j_trans_id);
3232 } 3234 }
3233 3235
3234 sb->s_dirt = 1;
3235
3236 prepared = test_clear_buffer_journal_prepared(bh); 3236 prepared = test_clear_buffer_journal_prepared(bh);
3237 clear_buffer_journal_restore_dirty(bh); 3237 clear_buffer_journal_restore_dirty(bh);
3238 /* already in this transaction, we are done */ 3238 /* already in this transaction, we are done */
@@ -3316,6 +3316,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
3316 journal->j_first = cn; 3316 journal->j_first = cn;
3317 journal->j_last = cn; 3317 journal->j_last = cn;
3318 } 3318 }
3319 reiserfs_schedule_old_flush(sb);
3319 return 0; 3320 return 0;
3320} 3321}
3321 3322
@@ -3492,7 +3493,7 @@ static void flush_async_commits(struct work_struct *work)
3492** flushes any old transactions to disk 3493** flushes any old transactions to disk
3493** ends the current transaction if it is too old 3494** ends the current transaction if it is too old
3494*/ 3495*/
3495int reiserfs_flush_old_commits(struct super_block *sb) 3496void reiserfs_flush_old_commits(struct super_block *sb)
3496{ 3497{
3497 time_t now; 3498 time_t now;
3498 struct reiserfs_transaction_handle th; 3499 struct reiserfs_transaction_handle th;
@@ -3502,9 +3503,8 @@ int reiserfs_flush_old_commits(struct super_block *sb)
3502 /* safety check so we don't flush while we are replaying the log during 3503 /* safety check so we don't flush while we are replaying the log during
3503 * mount 3504 * mount
3504 */ 3505 */
3505 if (list_empty(&journal->j_journal_list)) { 3506 if (list_empty(&journal->j_journal_list))
3506 return 0; 3507 return;
3507 }
3508 3508
3509 /* check the current transaction. If there are no writers, and it is 3509 /* check the current transaction. If there are no writers, and it is
3510 * too old, finish it, and force the commit blocks to disk 3510 * too old, finish it, and force the commit blocks to disk
@@ -3526,7 +3526,6 @@ int reiserfs_flush_old_commits(struct super_block *sb)
3526 do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT); 3526 do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT);
3527 } 3527 }
3528 } 3528 }
3529 return sb->s_dirt;
3530} 3529}
3531 3530
3532/* 3531/*
@@ -3955,7 +3954,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
3955 ** it tells us if we should continue with the journal_end, or just return 3954 ** it tells us if we should continue with the journal_end, or just return
3956 */ 3955 */
3957 if (!check_journal_end(th, sb, nblocks, flags)) { 3956 if (!check_journal_end(th, sb, nblocks, flags)) {
3958 sb->s_dirt = 1; 3957 reiserfs_schedule_old_flush(sb);
3959 wake_queued_writers(sb); 3958 wake_queued_writers(sb);
3960 reiserfs_async_progress_wait(sb); 3959 reiserfs_async_progress_wait(sb);
3961 goto out; 3960 goto out;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index a59d27126338..33215f57ea06 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -480,6 +480,11 @@ struct reiserfs_sb_info {
480 struct dentry *priv_root; /* root of /.reiserfs_priv */ 480 struct dentry *priv_root; /* root of /.reiserfs_priv */
481 struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */ 481 struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */
482 int j_errno; 482 int j_errno;
483
484 int work_queued; /* non-zero delayed work is queued */
485 struct delayed_work old_work; /* old transactions flush delayed work */
486 spinlock_t old_work_lock; /* protects old_work and work_queued */
487
483#ifdef CONFIG_QUOTA 488#ifdef CONFIG_QUOTA
484 char *s_qf_names[MAXQUOTAS]; 489 char *s_qf_names[MAXQUOTAS];
485 int s_jquota_fmt; 490 int s_jquota_fmt;
@@ -2452,7 +2457,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
2452int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *); 2457int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
2453int reiserfs_commit_page(struct inode *inode, struct page *page, 2458int reiserfs_commit_page(struct inode *inode, struct page *page,
2454 unsigned from, unsigned to); 2459 unsigned from, unsigned to);
2455int reiserfs_flush_old_commits(struct super_block *); 2460void reiserfs_flush_old_commits(struct super_block *);
2456int reiserfs_commit_for_inode(struct inode *); 2461int reiserfs_commit_for_inode(struct inode *);
2457int reiserfs_inode_needs_commit(struct inode *); 2462int reiserfs_inode_needs_commit(struct inode *);
2458void reiserfs_update_inode_transaction(struct inode *); 2463void reiserfs_update_inode_transaction(struct inode *);
@@ -2487,6 +2492,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
2487int reiserfs_allocate_list_bitmaps(struct super_block *s, 2492int reiserfs_allocate_list_bitmaps(struct super_block *s,
2488 struct reiserfs_list_bitmap *, unsigned int); 2493 struct reiserfs_list_bitmap *, unsigned int);
2489 2494
2495void reiserfs_schedule_old_flush(struct super_block *s);
2490void add_save_link(struct reiserfs_transaction_handle *th, 2496void add_save_link(struct reiserfs_transaction_handle *th,
2491 struct inode *inode, int truncate); 2497 struct inode *inode, int truncate);
2492int remove_save_link(struct inode *inode, int truncate); 2498int remove_save_link(struct inode *inode, int truncate);
@@ -2611,8 +2617,8 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
2611 int fh_len, int fh_type); 2617 int fh_len, int fh_type);
2612struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, 2618struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
2613 int fh_len, int fh_type); 2619 int fh_len, int fh_type);
2614int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, 2620int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
2615 int connectable); 2621 struct inode *parent);
2616 2622
2617int reiserfs_truncate_file(struct inode *, int update_timestamps); 2623int reiserfs_truncate_file(struct inode *, int update_timestamps);
2618void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset, 2624void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 9a17f63c3fd7..3ce02cff5e90 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -200,7 +200,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
200 (bmap_nr_new - bmap_nr))); 200 (bmap_nr_new - bmap_nr)));
201 PUT_SB_BLOCK_COUNT(s, block_count_new); 201 PUT_SB_BLOCK_COUNT(s, block_count_new);
202 PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new); 202 PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
203 s->s_dirt = 1;
204 203
205 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); 204 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
206 205
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c07b7d709447..651ce767b55d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -72,20 +72,58 @@ static int reiserfs_sync_fs(struct super_block *s, int wait)
72 if (!journal_begin(&th, s, 1)) 72 if (!journal_begin(&th, s, 1))
73 if (!journal_end_sync(&th, s, 1)) 73 if (!journal_end_sync(&th, s, 1))
74 reiserfs_flush_old_commits(s); 74 reiserfs_flush_old_commits(s);
75 s->s_dirt = 0; /* Even if it's not true.
76 * We'll loop forever in sync_supers otherwise */
77 reiserfs_write_unlock(s); 75 reiserfs_write_unlock(s);
78 return 0; 76 return 0;
79} 77}
80 78
81static void reiserfs_write_super(struct super_block *s) 79static void flush_old_commits(struct work_struct *work)
82{ 80{
81 struct reiserfs_sb_info *sbi;
82 struct super_block *s;
83
84 sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
85 s = sbi->s_journal->j_work_sb;
86
87 spin_lock(&sbi->old_work_lock);
88 sbi->work_queued = 0;
89 spin_unlock(&sbi->old_work_lock);
90
83 reiserfs_sync_fs(s, 1); 91 reiserfs_sync_fs(s, 1);
84} 92}
85 93
94void reiserfs_schedule_old_flush(struct super_block *s)
95{
96 struct reiserfs_sb_info *sbi = REISERFS_SB(s);
97 unsigned long delay;
98
99 if (s->s_flags & MS_RDONLY)
100 return;
101
102 spin_lock(&sbi->old_work_lock);
103 if (!sbi->work_queued) {
104 delay = msecs_to_jiffies(dirty_writeback_interval * 10);
105 queue_delayed_work(system_long_wq, &sbi->old_work, delay);
106 sbi->work_queued = 1;
107 }
108 spin_unlock(&sbi->old_work_lock);
109}
110
111static void cancel_old_flush(struct super_block *s)
112{
113 struct reiserfs_sb_info *sbi = REISERFS_SB(s);
114
115 cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
116 spin_lock(&sbi->old_work_lock);
117 sbi->work_queued = 0;
118 spin_unlock(&sbi->old_work_lock);
119}
120
86static int reiserfs_freeze(struct super_block *s) 121static int reiserfs_freeze(struct super_block *s)
87{ 122{
88 struct reiserfs_transaction_handle th; 123 struct reiserfs_transaction_handle th;
124
125 cancel_old_flush(s);
126
89 reiserfs_write_lock(s); 127 reiserfs_write_lock(s);
90 if (!(s->s_flags & MS_RDONLY)) { 128 if (!(s->s_flags & MS_RDONLY)) {
91 int err = journal_begin(&th, s, 1); 129 int err = journal_begin(&th, s, 1);
@@ -99,7 +137,6 @@ static int reiserfs_freeze(struct super_block *s)
99 journal_end_sync(&th, s, 1); 137 journal_end_sync(&th, s, 1);
100 } 138 }
101 } 139 }
102 s->s_dirt = 0;
103 reiserfs_write_unlock(s); 140 reiserfs_write_unlock(s);
104 return 0; 141 return 0;
105} 142}
@@ -483,9 +520,6 @@ static void reiserfs_put_super(struct super_block *s)
483 520
484 reiserfs_write_lock(s); 521 reiserfs_write_lock(s);
485 522
486 if (s->s_dirt)
487 reiserfs_write_super(s);
488
489 /* change file system state to current state if it was mounted with read-write permissions */ 523 /* change file system state to current state if it was mounted with read-write permissions */
490 if (!(s->s_flags & MS_RDONLY)) { 524 if (!(s->s_flags & MS_RDONLY)) {
491 if (!journal_begin(&th, s, 10)) { 525 if (!journal_begin(&th, s, 10)) {
@@ -692,7 +726,6 @@ static const struct super_operations reiserfs_sops = {
692 .dirty_inode = reiserfs_dirty_inode, 726 .dirty_inode = reiserfs_dirty_inode,
693 .evict_inode = reiserfs_evict_inode, 727 .evict_inode = reiserfs_evict_inode,
694 .put_super = reiserfs_put_super, 728 .put_super = reiserfs_put_super,
695 .write_super = reiserfs_write_super,
696 .sync_fs = reiserfs_sync_fs, 729 .sync_fs = reiserfs_sync_fs,
697 .freeze_fs = reiserfs_freeze, 730 .freeze_fs = reiserfs_freeze,
698 .unfreeze_fs = reiserfs_unfreeze, 731 .unfreeze_fs = reiserfs_unfreeze,
@@ -1400,7 +1433,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1400 err = journal_end(&th, s, 10); 1433 err = journal_end(&th, s, 10);
1401 if (err) 1434 if (err)
1402 goto out_err; 1435 goto out_err;
1403 s->s_dirt = 0;
1404 1436
1405 if (!(*mount_flags & MS_RDONLY)) { 1437 if (!(*mount_flags & MS_RDONLY)) {
1406 dquot_resume(s, -1); 1438 dquot_resume(s, -1);
@@ -1730,19 +1762,21 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1730 return -ENOMEM; 1762 return -ENOMEM;
1731 s->s_fs_info = sbi; 1763 s->s_fs_info = sbi;
1732 /* Set default values for options: non-aggressive tails, RO on errors */ 1764 /* Set default values for options: non-aggressive tails, RO on errors */
1733 REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); 1765 sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
1734 REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO); 1766 sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
1735 REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH); 1767 sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
1736 /* no preallocation minimum, be smart in 1768 /* no preallocation minimum, be smart in
1737 reiserfs_file_write instead */ 1769 reiserfs_file_write instead */
1738 REISERFS_SB(s)->s_alloc_options.preallocmin = 0; 1770 sbi->s_alloc_options.preallocmin = 0;
1739 /* Preallocate by 16 blocks (17-1) at once */ 1771 /* Preallocate by 16 blocks (17-1) at once */
1740 REISERFS_SB(s)->s_alloc_options.preallocsize = 17; 1772 sbi->s_alloc_options.preallocsize = 17;
1741 /* setup default block allocator options */ 1773 /* setup default block allocator options */
1742 reiserfs_init_alloc_options(s); 1774 reiserfs_init_alloc_options(s);
1743 1775
1744 mutex_init(&REISERFS_SB(s)->lock); 1776 spin_lock_init(&sbi->old_work_lock);
1745 REISERFS_SB(s)->lock_depth = -1; 1777 INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits);
1778 mutex_init(&sbi->lock);
1779 sbi->lock_depth = -1;
1746 1780
1747 jdev_name = NULL; 1781 jdev_name = NULL;
1748 if (reiserfs_parse_options 1782 if (reiserfs_parse_options
@@ -1751,8 +1785,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1751 goto error_unlocked; 1785 goto error_unlocked;
1752 } 1786 }
1753 if (jdev_name && jdev_name[0]) { 1787 if (jdev_name && jdev_name[0]) {
1754 REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL); 1788 sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
1755 if (!REISERFS_SB(s)->s_jdev) { 1789 if (!sbi->s_jdev) {
1756 SWARN(silent, s, "", "Cannot allocate memory for " 1790 SWARN(silent, s, "", "Cannot allocate memory for "
1757 "journal device name"); 1791 "journal device name");
1758 goto error; 1792 goto error;
@@ -1810,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1810 /* make data=ordered the default */ 1844 /* make data=ordered the default */
1811 if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && 1845 if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
1812 !reiserfs_data_writeback(s)) { 1846 !reiserfs_data_writeback(s)) {
1813 REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); 1847 sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
1814 } 1848 }
1815 1849
1816 if (reiserfs_data_log(s)) { 1850 if (reiserfs_data_log(s)) {
@@ -2003,6 +2037,8 @@ error_unlocked:
2003 reiserfs_write_unlock(s); 2037 reiserfs_write_unlock(s);
2004 } 2038 }
2005 2039
2040 cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
2041
2006 reiserfs_free_bitmap_cache(s); 2042 reiserfs_free_bitmap_cache(s);
2007 if (SB_BUFFER_WITH_SB(s)) 2043 if (SB_BUFFER_WITH_SB(s))
2008 brelse(SB_BUFFER_WITH_SB(s)); 2044 brelse(SB_BUFFER_WITH_SB(s));
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 7ae2a574cb25..9f35a37173de 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -269,12 +269,13 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
269 if (ufd < 0) 269 if (ufd < 0)
270 kfree(ctx); 270 kfree(ctx);
271 } else { 271 } else {
272 struct file *file = fget(ufd); 272 int fput_needed;
273 struct file *file = fget_light(ufd, &fput_needed);
273 if (!file) 274 if (!file)
274 return -EBADF; 275 return -EBADF;
275 ctx = file->private_data; 276 ctx = file->private_data;
276 if (file->f_op != &signalfd_fops) { 277 if (file->f_op != &signalfd_fops) {
277 fput(file); 278 fput_light(file, fput_needed);
278 return -EINVAL; 279 return -EINVAL;
279 } 280 }
280 spin_lock_irq(&current->sighand->siglock); 281 spin_lock_irq(&current->sighand->siglock);
@@ -282,7 +283,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
282 spin_unlock_irq(&current->sighand->siglock); 283 spin_unlock_irq(&current->sighand->siglock);
283 284
284 wake_up(&current->sighand->signalfd_wqh); 285 wake_up(&current->sighand->signalfd_wqh);
285 fput(file); 286 fput_light(file, fput_needed);
286 } 287 }
287 288
288 return ufd; 289 return ufd;
diff --git a/fs/splice.c b/fs/splice.c
index 406ef2b792c2..c9f1318a3b82 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1003,8 +1003,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
1003 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 1003 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1004 ret = file_remove_suid(out); 1004 ret = file_remove_suid(out);
1005 if (!ret) { 1005 if (!ret) {
1006 file_update_time(out); 1006 ret = file_update_time(out);
1007 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); 1007 if (!ret)
1008 ret = splice_from_pipe_feed(pipe, &sd,
1009 pipe_to_file);
1008 } 1010 }
1009 mutex_unlock(&inode->i_mutex); 1011 mutex_unlock(&inode->i_mutex);
1010 } while (ret > 0); 1012 } while (ret > 0);
diff --git a/fs/statfs.c b/fs/statfs.c
index 43e6b6fe4e85..95ad5c0e586c 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -87,11 +87,12 @@ int user_statfs(const char __user *pathname, struct kstatfs *st)
87 87
88int fd_statfs(int fd, struct kstatfs *st) 88int fd_statfs(int fd, struct kstatfs *st)
89{ 89{
90 struct file *file = fget(fd); 90 int fput_needed;
91 struct file *file = fget_light(fd, &fput_needed);
91 int error = -EBADF; 92 int error = -EBADF;
92 if (file) { 93 if (file) {
93 error = vfs_statfs(&file->f_path, st); 94 error = vfs_statfs(&file->f_path, st);
94 fput(file); 95 fput_light(file, fput_needed);
95 } 96 }
96 return error; 97 return error;
97} 98}
diff --git a/fs/sync.c b/fs/sync.c
index 0e8db939d96f..11e3d1c44901 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -188,11 +188,12 @@ static int do_fsync(unsigned int fd, int datasync)
188{ 188{
189 struct file *file; 189 struct file *file;
190 int ret = -EBADF; 190 int ret = -EBADF;
191 int fput_needed;
191 192
192 file = fget(fd); 193 file = fget_light(fd, &fput_needed);
193 if (file) { 194 if (file) {
194 ret = vfs_fsync(file, datasync); 195 ret = vfs_fsync(file, datasync);
195 fput(file); 196 fput_light(file, fput_needed);
196 } 197 }
197 return ret; 198 return ret;
198} 199}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 62a2727f4ecf..a6d42efc76d2 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1127,16 +1127,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1127 struct ubifs_inode *ui = ubifs_inode(inode); 1127 struct ubifs_inode *ui = ubifs_inode(inode);
1128 1128
1129 mutex_lock(&ui->ui_mutex); 1129 mutex_lock(&ui->ui_mutex);
1130 stat->dev = inode->i_sb->s_dev; 1130 generic_fillattr(inode, stat);
1131 stat->ino = inode->i_ino;
1132 stat->mode = inode->i_mode;
1133 stat->nlink = inode->i_nlink;
1134 stat->uid = inode->i_uid;
1135 stat->gid = inode->i_gid;
1136 stat->rdev = inode->i_rdev;
1137 stat->atime = inode->i_atime;
1138 stat->mtime = inode->i_mtime;
1139 stat->ctime = inode->i_ctime;
1140 stat->blksize = UBIFS_BLOCK_SIZE; 1131 stat->blksize = UBIFS_BLOCK_SIZE;
1141 stat->size = ui->ui_size; 1132 stat->size = ui->ui_size;
1142 1133
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index a165c66e3eef..18024178ac4c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1260,16 +1260,15 @@ static struct dentry *udf_fh_to_parent(struct super_block *sb,
1260 fid->udf.parent_partref, 1260 fid->udf.parent_partref,
1261 fid->udf.parent_generation); 1261 fid->udf.parent_generation);
1262} 1262}
1263static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, 1263static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
1264 int connectable) 1264 struct inode *parent)
1265{ 1265{
1266 int len = *lenp; 1266 int len = *lenp;
1267 struct inode *inode = de->d_inode;
1268 struct kernel_lb_addr location = UDF_I(inode)->i_location; 1267 struct kernel_lb_addr location = UDF_I(inode)->i_location;
1269 struct fid *fid = (struct fid *)fh; 1268 struct fid *fid = (struct fid *)fh;
1270 int type = FILEID_UDF_WITHOUT_PARENT; 1269 int type = FILEID_UDF_WITHOUT_PARENT;
1271 1270
1272 if (connectable && (len < 5)) { 1271 if (parent && (len < 5)) {
1273 *lenp = 5; 1272 *lenp = 5;
1274 return 255; 1273 return 255;
1275 } else if (len < 3) { 1274 } else if (len < 3) {
@@ -1282,14 +1281,11 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1282 fid->udf.partref = location.partitionReferenceNum; 1281 fid->udf.partref = location.partitionReferenceNum;
1283 fid->udf.generation = inode->i_generation; 1282 fid->udf.generation = inode->i_generation;
1284 1283
1285 if (connectable && !S_ISDIR(inode->i_mode)) { 1284 if (parent) {
1286 spin_lock(&de->d_lock); 1285 location = UDF_I(parent)->i_location;
1287 inode = de->d_parent->d_inode;
1288 location = UDF_I(inode)->i_location;
1289 fid->udf.parent_block = location.logicalBlockNum; 1286 fid->udf.parent_block = location.logicalBlockNum;
1290 fid->udf.parent_partref = location.partitionReferenceNum; 1287 fid->udf.parent_partref = location.partitionReferenceNum;
1291 fid->udf.parent_generation = inode->i_generation; 1288 fid->udf.parent_generation = inode->i_generation;
1292 spin_unlock(&de->d_lock);
1293 *lenp = 5; 1289 *lenp = 5;
1294 type = FILEID_UDF_WITH_PARENT; 1290 type = FILEID_UDF_WITH_PARENT;
1295 } 1291 }
diff --git a/fs/utimes.c b/fs/utimes.c
index ba653f3dc1bc..fa4dbe451e27 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -140,18 +140,19 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
140 goto out; 140 goto out;
141 141
142 if (filename == NULL && dfd != AT_FDCWD) { 142 if (filename == NULL && dfd != AT_FDCWD) {
143 int fput_needed;
143 struct file *file; 144 struct file *file;
144 145
145 if (flags & AT_SYMLINK_NOFOLLOW) 146 if (flags & AT_SYMLINK_NOFOLLOW)
146 goto out; 147 goto out;
147 148
148 file = fget(dfd); 149 file = fget_light(dfd, &fput_needed);
149 error = -EBADF; 150 error = -EBADF;
150 if (!file) 151 if (!file)
151 goto out; 152 goto out;
152 153
153 error = utimes_common(&file->f_path, times); 154 error = utimes_common(&file->f_path, times);
154 fput(file); 155 fput_light(file, fput_needed);
155 } else { 156 } else {
156 struct path path; 157 struct path path;
157 int lookup_flags = 0; 158 int lookup_flags = 0;
diff --git a/fs/xattr.c b/fs/xattr.c
index 3c8c1cc333c7..1d7ac3790458 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -399,11 +399,12 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
399SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, 399SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
400 const void __user *,value, size_t, size, int, flags) 400 const void __user *,value, size_t, size, int, flags)
401{ 401{
402 int fput_needed;
402 struct file *f; 403 struct file *f;
403 struct dentry *dentry; 404 struct dentry *dentry;
404 int error = -EBADF; 405 int error = -EBADF;
405 406
406 f = fget(fd); 407 f = fget_light(fd, &fput_needed);
407 if (!f) 408 if (!f)
408 return error; 409 return error;
409 dentry = f->f_path.dentry; 410 dentry = f->f_path.dentry;
@@ -413,7 +414,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
413 error = setxattr(dentry, name, value, size, flags); 414 error = setxattr(dentry, name, value, size, flags);
414 mnt_drop_write_file(f); 415 mnt_drop_write_file(f);
415 } 416 }
416 fput(f); 417 fput_light(f, fput_needed);
417 return error; 418 return error;
418} 419}
419 420
@@ -486,15 +487,16 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
486SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, 487SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
487 void __user *, value, size_t, size) 488 void __user *, value, size_t, size)
488{ 489{
490 int fput_needed;
489 struct file *f; 491 struct file *f;
490 ssize_t error = -EBADF; 492 ssize_t error = -EBADF;
491 493
492 f = fget(fd); 494 f = fget_light(fd, &fput_needed);
493 if (!f) 495 if (!f)
494 return error; 496 return error;
495 audit_inode(NULL, f->f_path.dentry); 497 audit_inode(NULL, f->f_path.dentry);
496 error = getxattr(f->f_path.dentry, name, value, size); 498 error = getxattr(f->f_path.dentry, name, value, size);
497 fput(f); 499 fput_light(f, fput_needed);
498 return error; 500 return error;
499} 501}
500 502
@@ -566,15 +568,16 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
566 568
567SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) 569SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
568{ 570{
571 int fput_needed;
569 struct file *f; 572 struct file *f;
570 ssize_t error = -EBADF; 573 ssize_t error = -EBADF;
571 574
572 f = fget(fd); 575 f = fget_light(fd, &fput_needed);
573 if (!f) 576 if (!f)
574 return error; 577 return error;
575 audit_inode(NULL, f->f_path.dentry); 578 audit_inode(NULL, f->f_path.dentry);
576 error = listxattr(f->f_path.dentry, list, size); 579 error = listxattr(f->f_path.dentry, list, size);
577 fput(f); 580 fput_light(f, fput_needed);
578 return error; 581 return error;
579} 582}
580 583
@@ -634,11 +637,12 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
634 637
635SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) 638SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
636{ 639{
640 int fput_needed;
637 struct file *f; 641 struct file *f;
638 struct dentry *dentry; 642 struct dentry *dentry;
639 int error = -EBADF; 643 int error = -EBADF;
640 644
641 f = fget(fd); 645 f = fget_light(fd, &fput_needed);
642 if (!f) 646 if (!f)
643 return error; 647 return error;
644 dentry = f->f_path.dentry; 648 dentry = f->f_path.dentry;
@@ -648,7 +652,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
648 error = removexattr(dentry, name); 652 error = removexattr(dentry, name);
649 mnt_drop_write_file(f); 653 mnt_drop_write_file(f);
650 } 654 }
651 fput(f); 655 fput_light(f, fput_needed);
652 return error; 656 return error;
653} 657}
654 658
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a907de565db3..4a7286c1dc80 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -46,7 +46,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
46} 46}
47 47
48void * 48void *
49kmem_alloc(size_t size, unsigned int __nocast flags) 49kmem_alloc(size_t size, xfs_km_flags_t flags)
50{ 50{
51 int retries = 0; 51 int retries = 0;
52 gfp_t lflags = kmem_flags_convert(flags); 52 gfp_t lflags = kmem_flags_convert(flags);
@@ -65,7 +65,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
65} 65}
66 66
67void * 67void *
68kmem_zalloc(size_t size, unsigned int __nocast flags) 68kmem_zalloc(size_t size, xfs_km_flags_t flags)
69{ 69{
70 void *ptr; 70 void *ptr;
71 71
@@ -87,7 +87,7 @@ kmem_free(const void *ptr)
87 87
88void * 88void *
89kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, 89kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
90 unsigned int __nocast flags) 90 xfs_km_flags_t flags)
91{ 91{
92 void *new; 92 void *new;
93 93
@@ -102,7 +102,7 @@ kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
102} 102}
103 103
104void * 104void *
105kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) 105kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
106{ 106{
107 int retries = 0; 107 int retries = 0;
108 gfp_t lflags = kmem_flags_convert(flags); 108 gfp_t lflags = kmem_flags_convert(flags);
@@ -121,7 +121,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
121} 121}
122 122
123void * 123void *
124kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags) 124kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
125{ 125{
126 void *ptr; 126 void *ptr;
127 127
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index ab7c53fe346e..b2f2620f9a87 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -27,10 +27,11 @@
27 * General memory allocation interfaces 27 * General memory allocation interfaces
28 */ 28 */
29 29
30#define KM_SLEEP 0x0001u 30typedef unsigned __bitwise xfs_km_flags_t;
31#define KM_NOSLEEP 0x0002u 31#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u)
32#define KM_NOFS 0x0004u 32#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u)
33#define KM_MAYFAIL 0x0008u 33#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
34#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
34 35
35/* 36/*
36 * We use a special process flag to avoid recursive callbacks into 37 * We use a special process flag to avoid recursive callbacks into
@@ -38,7 +39,7 @@
38 * warnings, so we explicitly skip any generic ones (silly of us). 39 * warnings, so we explicitly skip any generic ones (silly of us).
39 */ 40 */
40static inline gfp_t 41static inline gfp_t
41kmem_flags_convert(unsigned int __nocast flags) 42kmem_flags_convert(xfs_km_flags_t flags)
42{ 43{
43 gfp_t lflags; 44 gfp_t lflags;
44 45
@@ -54,9 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags)
54 return lflags; 55 return lflags;
55} 56}
56 57
57extern void *kmem_alloc(size_t, unsigned int __nocast); 58extern void *kmem_alloc(size_t, xfs_km_flags_t);
58extern void *kmem_zalloc(size_t, unsigned int __nocast); 59extern void *kmem_zalloc(size_t, xfs_km_flags_t);
59extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); 60extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
60extern void kmem_free(const void *); 61extern void kmem_free(const void *);
61 62
62static inline void *kmem_zalloc_large(size_t size) 63static inline void *kmem_zalloc_large(size_t size)
@@ -107,7 +108,7 @@ kmem_zone_destroy(kmem_zone_t *zone)
107 kmem_cache_destroy(zone); 108 kmem_cache_destroy(zone);
108} 109}
109 110
110extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); 111extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
111extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); 112extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t);
112 113
113#endif /* __XFS_SUPPORT_KMEM_H__ */ 114#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 2d25d19c4ea1..42679223a0fd 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -52,19 +52,18 @@ static int xfs_fileid_length(int fileid_type)
52 52
53STATIC int 53STATIC int
54xfs_fs_encode_fh( 54xfs_fs_encode_fh(
55 struct dentry *dentry, 55 struct inode *inode,
56 __u32 *fh, 56 __u32 *fh,
57 int *max_len, 57 int *max_len,
58 int connectable) 58 struct inode *parent)
59{ 59{
60 struct fid *fid = (struct fid *)fh; 60 struct fid *fid = (struct fid *)fh;
61 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; 61 struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh;
62 struct inode *inode = dentry->d_inode;
63 int fileid_type; 62 int fileid_type;
64 int len; 63 int len;
65 64
66 /* Directories don't need their parent encoded, they have ".." */ 65 /* Directories don't need their parent encoded, they have ".." */
67 if (S_ISDIR(inode->i_mode) || !connectable) 66 if (!parent)
68 fileid_type = FILEID_INO32_GEN; 67 fileid_type = FILEID_INO32_GEN;
69 else 68 else
70 fileid_type = FILEID_INO32_GEN_PARENT; 69 fileid_type = FILEID_INO32_GEN_PARENT;
@@ -96,20 +95,16 @@ xfs_fs_encode_fh(
96 95
97 switch (fileid_type) { 96 switch (fileid_type) {
98 case FILEID_INO32_GEN_PARENT: 97 case FILEID_INO32_GEN_PARENT:
99 spin_lock(&dentry->d_lock); 98 fid->i32.parent_ino = XFS_I(parent)->i_ino;
100 fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; 99 fid->i32.parent_gen = parent->i_generation;
101 fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
102 spin_unlock(&dentry->d_lock);
103 /*FALLTHRU*/ 100 /*FALLTHRU*/
104 case FILEID_INO32_GEN: 101 case FILEID_INO32_GEN:
105 fid->i32.ino = XFS_I(inode)->i_ino; 102 fid->i32.ino = XFS_I(inode)->i_ino;
106 fid->i32.gen = inode->i_generation; 103 fid->i32.gen = inode->i_generation;
107 break; 104 break;
108 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: 105 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
109 spin_lock(&dentry->d_lock); 106 fid64->parent_ino = XFS_I(parent)->i_ino;
110 fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; 107 fid64->parent_gen = parent->i_generation;
111 fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
112 spin_unlock(&dentry->d_lock);
113 /*FALLTHRU*/ 108 /*FALLTHRU*/
114 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: 109 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
115 fid64->ino = XFS_I(inode)->i_ino; 110 fid64->ino = XFS_I(inode)->i_ino;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8d214b87f6bb..9f7ec15a6522 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -586,8 +586,11 @@ restart:
586 * lock above. Eventually we should look into a way to avoid 586 * lock above. Eventually we should look into a way to avoid
587 * the pointless lock roundtrip. 587 * the pointless lock roundtrip.
588 */ 588 */
589 if (likely(!(file->f_mode & FMODE_NOCMTIME))) 589 if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
590 file_update_time(file); 590 error = file_update_time(file);
591 if (error)
592 return error;
593 }
591 594
592 /* 595 /*
593 * If we're writing the file then make sure to clear the setuid and 596 * If we're writing the file then make sure to clear the setuid and
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 6b965bf450e4..f30d9807dc48 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3152,7 +3152,7 @@ xlog_ticket_alloc(
3152 int cnt, 3152 int cnt,
3153 char client, 3153 char client,
3154 bool permanent, 3154 bool permanent,
3155 int alloc_flags) 3155 xfs_km_flags_t alloc_flags)
3156{ 3156{
3157 struct xlog_ticket *tic; 3157 struct xlog_ticket *tic;
3158 uint num_headers; 3158 uint num_headers;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 735ff1ee53da..5bc33261f5be 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -555,7 +555,7 @@ extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
555extern kmem_zone_t *xfs_log_ticket_zone; 555extern kmem_zone_t *xfs_log_ticket_zone;
556struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, 556struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
557 int count, char client, bool permanent, 557 int count, char client, bool permanent,
558 int alloc_flags); 558 xfs_km_flags_t alloc_flags);
559 559
560 560
561static inline void 561static inline void
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index cdf896fcbfa4..fdf324508c5e 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -584,7 +584,7 @@ xfs_trans_t *
584_xfs_trans_alloc( 584_xfs_trans_alloc(
585 xfs_mount_t *mp, 585 xfs_mount_t *mp,
586 uint type, 586 uint type,
587 uint memflags) 587 xfs_km_flags_t memflags)
588{ 588{
589 xfs_trans_t *tp; 589 xfs_trans_t *tp;
590 590
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 7ab99e1898c8..7c37b533aa8e 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -443,7 +443,7 @@ typedef struct xfs_trans {
443 * XFS transaction mechanism exported interfaces. 443 * XFS transaction mechanism exported interfaces.
444 */ 444 */
445xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); 445xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
446xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint); 446xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
447xfs_trans_t *xfs_trans_dup(xfs_trans_t *); 447xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
448int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, 448int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
449 uint, uint); 449 uint, uint);
diff --git a/include/asm-generic/posix_types.h b/include/asm-generic/posix_types.h
index 91d44bd4dde3..fe74fccf18db 100644
--- a/include/asm-generic/posix_types.h
+++ b/include/asm-generic/posix_types.h
@@ -23,10 +23,6 @@ typedef __kernel_ulong_t __kernel_ino_t;
23typedef unsigned int __kernel_mode_t; 23typedef unsigned int __kernel_mode_t;
24#endif 24#endif
25 25
26#ifndef __kernel_nlink_t
27typedef __kernel_ulong_t __kernel_nlink_t;
28#endif
29
30#ifndef __kernel_pid_t 26#ifndef __kernel_pid_t
31typedef int __kernel_pid_t; 27typedef int __kernel_pid_t;
32#endif 28#endif
diff --git a/include/linux/errno.h b/include/linux/errno.h
index 2d09bfa5c262..e0de516374da 100644
--- a/include/linux/errno.h
+++ b/include/linux/errno.h
@@ -17,6 +17,7 @@
17#define ENOIOCTLCMD 515 /* No ioctl command */ 17#define ENOIOCTLCMD 515 /* No ioctl command */
18#define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */ 18#define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
19#define EPROBE_DEFER 517 /* Driver requests probe retry */ 19#define EPROBE_DEFER 517 /* Driver requests probe retry */
20#define EOPENSTALE 518 /* open found a stale dentry */
20 21
21/* Defined for the NFSv3 protocol */ 22/* Defined for the NFSv3 protocol */
22#define EBADHANDLE 521 /* Illegal NFS file handle */ 23#define EBADHANDLE 521 /* Illegal NFS file handle */
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 3a4cef5322dc..12291a7ee275 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -165,8 +165,8 @@ struct fid {
165 */ 165 */
166 166
167struct export_operations { 167struct export_operations {
168 int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len, 168 int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
169 int connectable); 169 struct inode *parent);
170 struct dentry * (*fh_to_dentry)(struct super_block *sb, struct fid *fid, 170 struct dentry * (*fh_to_dentry)(struct super_block *sb, struct fid *fid,
171 int fh_len, int fh_type); 171 int fh_len, int fh_type);
172 struct dentry * (*fh_to_parent)(struct super_block *sb, struct fid *fid, 172 struct dentry * (*fh_to_parent)(struct super_block *sb, struct fid *fid,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 40887afaaca7..51978ed43e97 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1692,6 +1692,7 @@ struct inode_operations {
1692 int (*removexattr) (struct dentry *, const char *); 1692 int (*removexattr) (struct dentry *, const char *);
1693 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, 1693 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
1694 u64 len); 1694 u64 len);
1695 int (*update_time)(struct inode *, struct timespec *, int);
1695} ____cacheline_aligned; 1696} ____cacheline_aligned;
1696 1697
1697struct seq_file; 1698struct seq_file;
@@ -1850,6 +1851,13 @@ static inline void inode_inc_iversion(struct inode *inode)
1850 spin_unlock(&inode->i_lock); 1851 spin_unlock(&inode->i_lock);
1851} 1852}
1852 1853
1854enum file_time_flags {
1855 S_ATIME = 1,
1856 S_MTIME = 2,
1857 S_CTIME = 4,
1858 S_VERSION = 8,
1859};
1860
1853extern void touch_atime(struct path *); 1861extern void touch_atime(struct path *);
1854static inline void file_accessed(struct file *file) 1862static inline void file_accessed(struct file *file)
1855{ 1863{
@@ -2583,7 +2591,7 @@ extern int inode_change_ok(const struct inode *, struct iattr *);
2583extern int inode_newsize_ok(const struct inode *, loff_t offset); 2591extern int inode_newsize_ok(const struct inode *, loff_t offset);
2584extern void setattr_copy(struct inode *inode, const struct iattr *attr); 2592extern void setattr_copy(struct inode *inode, const struct iattr *attr);
2585 2593
2586extern void file_update_time(struct file *file); 2594extern int file_update_time(struct file *file);
2587 2595
2588extern int generic_show_options(struct seq_file *m, struct dentry *root); 2596extern int generic_show_options(struct seq_file *m, struct dentry *root);
2589extern void save_mount_options(struct super_block *sb, char *options); 2597extern void save_mount_options(struct super_block *sb, char *options);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 91d0e0a34ef3..63d966d5c2ea 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -60,7 +60,7 @@
60#define FS_EVENTS_POSS_ON_CHILD (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\ 60#define FS_EVENTS_POSS_ON_CHILD (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\
61 FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\ 61 FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\
62 FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\ 62 FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
63 FS_DELETE) 63 FS_DELETE | FS_OPEN_PERM | FS_ACCESS_PERM)
64 64
65#define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) 65#define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO)
66 66
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 912c30a8ddb1..f334c7fab967 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -31,6 +31,7 @@
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/timer.h> 32#include <linux/timer.h>
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <crypto/hash.h>
34#endif 35#endif
35 36
36#define journal_oom_retry 1 37#define journal_oom_retry 1
@@ -147,12 +148,24 @@ typedef struct journal_header_s
147#define JBD2_CRC32_CHKSUM 1 148#define JBD2_CRC32_CHKSUM 1
148#define JBD2_MD5_CHKSUM 2 149#define JBD2_MD5_CHKSUM 2
149#define JBD2_SHA1_CHKSUM 3 150#define JBD2_SHA1_CHKSUM 3
151#define JBD2_CRC32C_CHKSUM 4
150 152
151#define JBD2_CRC32_CHKSUM_SIZE 4 153#define JBD2_CRC32_CHKSUM_SIZE 4
152 154
153#define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) 155#define JBD2_CHECKSUM_BYTES (32 / sizeof(u32))
154/* 156/*
155 * Commit block header for storing transactional checksums: 157 * Commit block header for storing transactional checksums:
158 *
159 * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum*
160 * fields are used to store a checksum of the descriptor and data blocks.
161 *
162 * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum
163 * field is used to store crc32c(uuid+commit_block). Each journal metadata
164 * block gets its own checksum, and data block checksums are stored in
165 * journal_block_tag (in the descriptor). The other h_chksum* fields are
166 * not used.
167 *
168 * Checksum v1 and v2 are mutually exclusive features.
156 */ 169 */
157struct commit_header { 170struct commit_header {
158 __be32 h_magic; 171 __be32 h_magic;
@@ -175,13 +188,19 @@ struct commit_header {
175typedef struct journal_block_tag_s 188typedef struct journal_block_tag_s
176{ 189{
177 __be32 t_blocknr; /* The on-disk block number */ 190 __be32 t_blocknr; /* The on-disk block number */
178 __be32 t_flags; /* See below */ 191 __be16 t_checksum; /* truncated crc32c(uuid+seq+block) */
192 __be16 t_flags; /* See below */
179 __be32 t_blocknr_high; /* most-significant high 32bits. */ 193 __be32 t_blocknr_high; /* most-significant high 32bits. */
180} journal_block_tag_t; 194} journal_block_tag_t;
181 195
182#define JBD2_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high)) 196#define JBD2_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high))
183#define JBD2_TAG_SIZE64 (sizeof(journal_block_tag_t)) 197#define JBD2_TAG_SIZE64 (sizeof(journal_block_tag_t))
184 198
199/* Tail of descriptor block, for checksumming */
200struct jbd2_journal_block_tail {
201 __be32 t_checksum; /* crc32c(uuid+descr_block) */
202};
203
185/* 204/*
186 * The revoke descriptor: used on disk to describe a series of blocks to 205 * The revoke descriptor: used on disk to describe a series of blocks to
187 * be revoked from the log 206 * be revoked from the log
@@ -192,6 +211,10 @@ typedef struct jbd2_journal_revoke_header_s
192 __be32 r_count; /* Count of bytes used in the block */ 211 __be32 r_count; /* Count of bytes used in the block */
193} jbd2_journal_revoke_header_t; 212} jbd2_journal_revoke_header_t;
194 213
214/* Tail of revoke block, for checksumming */
215struct jbd2_journal_revoke_tail {
216 __be32 r_checksum; /* crc32c(uuid+revoke_block) */
217};
195 218
196/* Definitions for the journal tag flags word: */ 219/* Definitions for the journal tag flags word: */
197#define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ 220#define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */
@@ -241,7 +264,10 @@ typedef struct journal_superblock_s
241 __be32 s_max_trans_data; /* Limit of data blocks per trans. */ 264 __be32 s_max_trans_data; /* Limit of data blocks per trans. */
242 265
243/* 0x0050 */ 266/* 0x0050 */
244 __u32 s_padding[44]; 267 __u8 s_checksum_type; /* checksum type */
268 __u8 s_padding2[3];
269 __u32 s_padding[42];
270 __be32 s_checksum; /* crc32c(superblock) */
245 271
246/* 0x0100 */ 272/* 0x0100 */
247 __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ 273 __u8 s_users[16*48]; /* ids of all fs'es sharing the log */
@@ -263,13 +289,15 @@ typedef struct journal_superblock_s
263#define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 289#define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001
264#define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 290#define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002
265#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 291#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
292#define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008
266 293
267/* Features known to this kernel version: */ 294/* Features known to this kernel version: */
268#define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM 295#define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM
269#define JBD2_KNOWN_ROCOMPAT_FEATURES 0 296#define JBD2_KNOWN_ROCOMPAT_FEATURES 0
270#define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ 297#define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \
271 JBD2_FEATURE_INCOMPAT_64BIT | \ 298 JBD2_FEATURE_INCOMPAT_64BIT | \
272 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) 299 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
300 JBD2_FEATURE_INCOMPAT_CSUM_V2)
273 301
274#ifdef __KERNEL__ 302#ifdef __KERNEL__
275 303
@@ -939,6 +967,12 @@ struct journal_s
939 * superblock pointer here 967 * superblock pointer here
940 */ 968 */
941 void *j_private; 969 void *j_private;
970
971 /* Reference to checksum algorithm driver via cryptoapi */
972 struct crypto_shash *j_chksum_driver;
973
974 /* Precomputed journal UUID checksum for seeding other checksums */
975 __u32 j_csum_seed;
942}; 976};
943 977
944/* 978/*
@@ -1268,6 +1302,25 @@ static inline int jbd_space_needed(journal_t *journal)
1268 1302
1269extern int jbd_blocks_per_page(struct inode *inode); 1303extern int jbd_blocks_per_page(struct inode *inode);
1270 1304
1305static inline u32 jbd2_chksum(journal_t *journal, u32 crc,
1306 const void *address, unsigned int length)
1307{
1308 struct {
1309 struct shash_desc shash;
1310 char ctx[crypto_shash_descsize(journal->j_chksum_driver)];
1311 } desc;
1312 int err;
1313
1314 desc.shash.tfm = journal->j_chksum_driver;
1315 desc.shash.flags = 0;
1316 *(u32 *)desc.ctx = crc;
1317
1318 err = crypto_shash_update(&desc.shash, address, length);
1319 BUG_ON(err);
1320
1321 return *(u32 *)desc.ctx;
1322}
1323
1271#ifdef __KERNEL__ 1324#ifdef __KERNEL__
1272 1325
1273#define buffer_trace_init(bh) do {} while (0) 1326#define buffer_trace_init(bh) do {} while (0)
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
index 6230f8556a4e..6133679bc4c0 100644
--- a/include/linux/jbd_common.h
+++ b/include/linux/jbd_common.h
@@ -12,6 +12,7 @@ enum jbd_state_bits {
12 BH_State, /* Pins most journal_head state */ 12 BH_State, /* Pins most journal_head state */
13 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ 13 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
14 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ 14 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
15 BH_Verified, /* Metadata block has been verified ok */
15 BH_JBDPrivateStart, /* First bit available for private use by FS */ 16 BH_JBDPrivateStart, /* First bit available for private use by FS */
16}; 17};
17 18
@@ -24,6 +25,7 @@ TAS_BUFFER_FNS(Revoked, revoked)
24BUFFER_FNS(RevokeValid, revokevalid) 25BUFFER_FNS(RevokeValid, revokevalid)
25TAS_BUFFER_FNS(RevokeValid, revokevalid) 26TAS_BUFFER_FNS(RevokeValid, revokevalid)
26BUFFER_FNS(Freed, freed) 27BUFFER_FNS(Freed, freed)
28BUFFER_FNS(Verified, verified)
27 29
28static inline struct buffer_head *jh2bh(struct journal_head *jh) 30static inline struct buffer_head *jh2bh(struct journal_head *jh)
29{ 31{
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 87f402ccec55..f01e5f6d1f07 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -23,28 +23,17 @@
23#include <linux/lockdep.h> 23#include <linux/lockdep.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/cpu.h> 25#include <linux/cpu.h>
26#include <linux/notifier.h>
26 27
27/* can make br locks by using local lock for read side, global lock for write */ 28/* can make br locks by using local lock for read side, global lock for write */
28#define br_lock_init(name) name##_lock_init() 29#define br_lock_init(name) lg_lock_init(name, #name)
29#define br_read_lock(name) name##_local_lock() 30#define br_read_lock(name) lg_local_lock(name)
30#define br_read_unlock(name) name##_local_unlock() 31#define br_read_unlock(name) lg_local_unlock(name)
31#define br_write_lock(name) name##_global_lock_online() 32#define br_write_lock(name) lg_global_lock(name)
32#define br_write_unlock(name) name##_global_unlock_online() 33#define br_write_unlock(name) lg_global_unlock(name)
33 34
34#define DECLARE_BRLOCK(name) DECLARE_LGLOCK(name)
35#define DEFINE_BRLOCK(name) DEFINE_LGLOCK(name) 35#define DEFINE_BRLOCK(name) DEFINE_LGLOCK(name)
36 36
37
38#define lg_lock_init(name) name##_lock_init()
39#define lg_local_lock(name) name##_local_lock()
40#define lg_local_unlock(name) name##_local_unlock()
41#define lg_local_lock_cpu(name, cpu) name##_local_lock_cpu(cpu)
42#define lg_local_unlock_cpu(name, cpu) name##_local_unlock_cpu(cpu)
43#define lg_global_lock(name) name##_global_lock()
44#define lg_global_unlock(name) name##_global_unlock()
45#define lg_global_lock_online(name) name##_global_lock_online()
46#define lg_global_unlock_online(name) name##_global_unlock_online()
47
48#ifdef CONFIG_DEBUG_LOCK_ALLOC 37#ifdef CONFIG_DEBUG_LOCK_ALLOC
49#define LOCKDEP_INIT_MAP lockdep_init_map 38#define LOCKDEP_INIT_MAP lockdep_init_map
50 39
@@ -59,142 +48,26 @@
59#define DEFINE_LGLOCK_LOCKDEP(name) 48#define DEFINE_LGLOCK_LOCKDEP(name)
60#endif 49#endif
61 50
62 51struct lglock {
63#define DECLARE_LGLOCK(name) \ 52 arch_spinlock_t __percpu *lock;
64 extern void name##_lock_init(void); \ 53#ifdef CONFIG_DEBUG_LOCK_ALLOC
65 extern void name##_local_lock(void); \ 54 struct lock_class_key lock_key;
66 extern void name##_local_unlock(void); \ 55 struct lockdep_map lock_dep_map;
67 extern void name##_local_lock_cpu(int cpu); \ 56#endif
68 extern void name##_local_unlock_cpu(int cpu); \ 57};
69 extern void name##_global_lock(void); \
70 extern void name##_global_unlock(void); \
71 extern void name##_global_lock_online(void); \
72 extern void name##_global_unlock_online(void); \
73 58
74#define DEFINE_LGLOCK(name) \ 59#define DEFINE_LGLOCK(name) \
75 \ 60 DEFINE_LGLOCK_LOCKDEP(name); \
76 DEFINE_SPINLOCK(name##_cpu_lock); \ 61 DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \
77 cpumask_t name##_cpus __read_mostly; \ 62 = __ARCH_SPIN_LOCK_UNLOCKED; \
78 DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \ 63 struct lglock name = { .lock = &name ## _lock }
79 DEFINE_LGLOCK_LOCKDEP(name); \ 64
80 \ 65void lg_lock_init(struct lglock *lg, char *name);
81 static int \ 66void lg_local_lock(struct lglock *lg);
82 name##_lg_cpu_callback(struct notifier_block *nb, \ 67void lg_local_unlock(struct lglock *lg);
83 unsigned long action, void *hcpu) \ 68void lg_local_lock_cpu(struct lglock *lg, int cpu);
84 { \ 69void lg_local_unlock_cpu(struct lglock *lg, int cpu);
85 switch (action & ~CPU_TASKS_FROZEN) { \ 70void lg_global_lock(struct lglock *lg);
86 case CPU_UP_PREPARE: \ 71void lg_global_unlock(struct lglock *lg);
87 spin_lock(&name##_cpu_lock); \ 72
88 cpu_set((unsigned long)hcpu, name##_cpus); \
89 spin_unlock(&name##_cpu_lock); \
90 break; \
91 case CPU_UP_CANCELED: case CPU_DEAD: \
92 spin_lock(&name##_cpu_lock); \
93 cpu_clear((unsigned long)hcpu, name##_cpus); \
94 spin_unlock(&name##_cpu_lock); \
95 } \
96 return NOTIFY_OK; \
97 } \
98 static struct notifier_block name##_lg_cpu_notifier = { \
99 .notifier_call = name##_lg_cpu_callback, \
100 }; \
101 void name##_lock_init(void) { \
102 int i; \
103 LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
104 for_each_possible_cpu(i) { \
105 arch_spinlock_t *lock; \
106 lock = &per_cpu(name##_lock, i); \
107 *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \
108 } \
109 register_hotcpu_notifier(&name##_lg_cpu_notifier); \
110 get_online_cpus(); \
111 for_each_online_cpu(i) \
112 cpu_set(i, name##_cpus); \
113 put_online_cpus(); \
114 } \
115 EXPORT_SYMBOL(name##_lock_init); \
116 \
117 void name##_local_lock(void) { \
118 arch_spinlock_t *lock; \
119 preempt_disable(); \
120 rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_); \
121 lock = &__get_cpu_var(name##_lock); \
122 arch_spin_lock(lock); \
123 } \
124 EXPORT_SYMBOL(name##_local_lock); \
125 \
126 void name##_local_unlock(void) { \
127 arch_spinlock_t *lock; \
128 rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_); \
129 lock = &__get_cpu_var(name##_lock); \
130 arch_spin_unlock(lock); \
131 preempt_enable(); \
132 } \
133 EXPORT_SYMBOL(name##_local_unlock); \
134 \
135 void name##_local_lock_cpu(int cpu) { \
136 arch_spinlock_t *lock; \
137 preempt_disable(); \
138 rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_); \
139 lock = &per_cpu(name##_lock, cpu); \
140 arch_spin_lock(lock); \
141 } \
142 EXPORT_SYMBOL(name##_local_lock_cpu); \
143 \
144 void name##_local_unlock_cpu(int cpu) { \
145 arch_spinlock_t *lock; \
146 rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_); \
147 lock = &per_cpu(name##_lock, cpu); \
148 arch_spin_unlock(lock); \
149 preempt_enable(); \
150 } \
151 EXPORT_SYMBOL(name##_local_unlock_cpu); \
152 \
153 void name##_global_lock_online(void) { \
154 int i; \
155 spin_lock(&name##_cpu_lock); \
156 rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \
157 for_each_cpu(i, &name##_cpus) { \
158 arch_spinlock_t *lock; \
159 lock = &per_cpu(name##_lock, i); \
160 arch_spin_lock(lock); \
161 } \
162 } \
163 EXPORT_SYMBOL(name##_global_lock_online); \
164 \
165 void name##_global_unlock_online(void) { \
166 int i; \
167 rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \
168 for_each_cpu(i, &name##_cpus) { \
169 arch_spinlock_t *lock; \
170 lock = &per_cpu(name##_lock, i); \
171 arch_spin_unlock(lock); \
172 } \
173 spin_unlock(&name##_cpu_lock); \
174 } \
175 EXPORT_SYMBOL(name##_global_unlock_online); \
176 \
177 void name##_global_lock(void) { \
178 int i; \
179 preempt_disable(); \
180 rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \
181 for_each_possible_cpu(i) { \
182 arch_spinlock_t *lock; \
183 lock = &per_cpu(name##_lock, i); \
184 arch_spin_lock(lock); \
185 } \
186 } \
187 EXPORT_SYMBOL(name##_global_lock); \
188 \
189 void name##_global_unlock(void) { \
190 int i; \
191 rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \
192 for_each_possible_cpu(i) { \
193 arch_spinlock_t *lock; \
194 lock = &per_cpu(name##_lock, i); \
195 arch_spin_unlock(lock); \
196 } \
197 preempt_enable(); \
198 } \
199 EXPORT_SYMBOL(name##_global_unlock);
200#endif 73#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ce26716238c3..b36d08ce5c57 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1392,7 +1392,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
1392extern unsigned long mmap_region(struct file *file, unsigned long addr, 1392extern unsigned long mmap_region(struct file *file, unsigned long addr,
1393 unsigned long len, unsigned long flags, 1393 unsigned long len, unsigned long flags,
1394 vm_flags_t vm_flags, unsigned long pgoff); 1394 vm_flags_t vm_flags, unsigned long pgoff);
1395extern unsigned long do_mmap(struct file *, unsigned long, 1395extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
1396 unsigned long, unsigned long, 1396 unsigned long, unsigned long,
1397 unsigned long, unsigned long); 1397 unsigned long, unsigned long);
1398extern int do_munmap(struct mm_struct *, unsigned long, size_t); 1398extern int do_munmap(struct mm_struct *, unsigned long, size_t);
diff --git a/include/linux/security.h b/include/linux/security.h
index ab0e091ce5fa..4e5a73cdbbef 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -86,9 +86,9 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
86extern int cap_inode_removexattr(struct dentry *dentry, const char *name); 86extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
87extern int cap_inode_need_killpriv(struct dentry *dentry); 87extern int cap_inode_need_killpriv(struct dentry *dentry);
88extern int cap_inode_killpriv(struct dentry *dentry); 88extern int cap_inode_killpriv(struct dentry *dentry);
89extern int cap_file_mmap(struct file *file, unsigned long reqprot, 89extern int cap_mmap_addr(unsigned long addr);
90 unsigned long prot, unsigned long flags, 90extern int cap_mmap_file(struct file *file, unsigned long reqprot,
91 unsigned long addr, unsigned long addr_only); 91 unsigned long prot, unsigned long flags);
92extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); 92extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
93extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, 93extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
94 unsigned long arg4, unsigned long arg5); 94 unsigned long arg4, unsigned long arg5);
@@ -586,15 +586,17 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
586 * simple integer value. When @arg represents a user space pointer, it 586 * simple integer value. When @arg represents a user space pointer, it
587 * should never be used by the security module. 587 * should never be used by the security module.
588 * Return 0 if permission is granted. 588 * Return 0 if permission is granted.
589 * @file_mmap : 589 * @mmap_addr :
590 * Check permissions for a mmap operation at @addr.
591 * @addr contains virtual address that will be used for the operation.
592 * Return 0 if permission is granted.
593 * @mmap_file :
590 * Check permissions for a mmap operation. The @file may be NULL, e.g. 594 * Check permissions for a mmap operation. The @file may be NULL, e.g.
591 * if mapping anonymous memory. 595 * if mapping anonymous memory.
592 * @file contains the file structure for file to map (may be NULL). 596 * @file contains the file structure for file to map (may be NULL).
593 * @reqprot contains the protection requested by the application. 597 * @reqprot contains the protection requested by the application.
594 * @prot contains the protection that will be applied by the kernel. 598 * @prot contains the protection that will be applied by the kernel.
595 * @flags contains the operational flags. 599 * @flags contains the operational flags.
596 * @addr contains virtual address that will be used for the operation.
597 * @addr_only contains a boolean: 0 if file-backed VMA, otherwise 1.
598 * Return 0 if permission is granted. 600 * Return 0 if permission is granted.
599 * @file_mprotect: 601 * @file_mprotect:
600 * Check permissions before changing memory access permissions. 602 * Check permissions before changing memory access permissions.
@@ -1481,10 +1483,10 @@ struct security_operations {
1481 void (*file_free_security) (struct file *file); 1483 void (*file_free_security) (struct file *file);
1482 int (*file_ioctl) (struct file *file, unsigned int cmd, 1484 int (*file_ioctl) (struct file *file, unsigned int cmd,
1483 unsigned long arg); 1485 unsigned long arg);
1484 int (*file_mmap) (struct file *file, 1486 int (*mmap_addr) (unsigned long addr);
1487 int (*mmap_file) (struct file *file,
1485 unsigned long reqprot, unsigned long prot, 1488 unsigned long reqprot, unsigned long prot,
1486 unsigned long flags, unsigned long addr, 1489 unsigned long flags);
1487 unsigned long addr_only);
1488 int (*file_mprotect) (struct vm_area_struct *vma, 1490 int (*file_mprotect) (struct vm_area_struct *vma,
1489 unsigned long reqprot, 1491 unsigned long reqprot,
1490 unsigned long prot); 1492 unsigned long prot);
@@ -1743,9 +1745,9 @@ int security_file_permission(struct file *file, int mask);
1743int security_file_alloc(struct file *file); 1745int security_file_alloc(struct file *file);
1744void security_file_free(struct file *file); 1746void security_file_free(struct file *file);
1745int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 1747int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1746int security_file_mmap(struct file *file, unsigned long reqprot, 1748int security_mmap_file(struct file *file, unsigned long prot,
1747 unsigned long prot, unsigned long flags, 1749 unsigned long flags);
1748 unsigned long addr, unsigned long addr_only); 1750int security_mmap_addr(unsigned long addr);
1749int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, 1751int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
1750 unsigned long prot); 1752 unsigned long prot);
1751int security_file_lock(struct file *file, unsigned int cmd); 1753int security_file_lock(struct file *file, unsigned int cmd);
@@ -2181,13 +2183,15 @@ static inline int security_file_ioctl(struct file *file, unsigned int cmd,
2181 return 0; 2183 return 0;
2182} 2184}
2183 2185
2184static inline int security_file_mmap(struct file *file, unsigned long reqprot, 2186static inline int security_mmap_file(struct file *file, unsigned long prot,
2185 unsigned long prot, 2187 unsigned long flags)
2186 unsigned long flags, 2188{
2187 unsigned long addr, 2189 return 0;
2188 unsigned long addr_only) 2190}
2191
2192static inline int security_mmap_addr(unsigned long addr)
2189{ 2193{
2190 return cap_file_mmap(file, reqprot, prot, flags, addr, addr_only); 2194 return cap_mmap_addr(addr);
2191} 2195}
2192 2196
2193static inline int security_file_mprotect(struct vm_area_struct *vma, 2197static inline int security_file_mprotect(struct vm_area_struct *vma,
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 51b29ac45a8e..40e0a273faea 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -232,7 +232,6 @@ struct svc_rqst {
232 struct svc_pool * rq_pool; /* thread pool */ 232 struct svc_pool * rq_pool; /* thread pool */
233 struct svc_procedure * rq_procinfo; /* procedure info */ 233 struct svc_procedure * rq_procinfo; /* procedure info */
234 struct auth_ops * rq_authop; /* authentication flavour */ 234 struct auth_ops * rq_authop; /* authentication flavour */
235 u32 rq_flavor; /* pseudoflavor */
236 struct svc_cred rq_cred; /* auth info */ 235 struct svc_cred rq_cred; /* auth info */
237 void * rq_xprt_ctxt; /* transport specific context ptr */ 236 void * rq_xprt_ctxt; /* transport specific context ptr */
238 struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ 237 struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */
@@ -416,6 +415,7 @@ struct svc_procedure {
416 */ 415 */
417int svc_rpcb_setup(struct svc_serv *serv, struct net *net); 416int svc_rpcb_setup(struct svc_serv *serv, struct net *net);
418void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net); 417void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net);
418int svc_bind(struct svc_serv *serv, struct net *net);
419struct svc_serv *svc_create(struct svc_program *, unsigned int, 419struct svc_serv *svc_create(struct svc_program *, unsigned int,
420 void (*shutdown)(struct svc_serv *, struct net *net)); 420 void (*shutdown)(struct svc_serv *, struct net *net));
421struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, 421struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index 2c54683b91de..dd74084a9799 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -15,13 +15,23 @@
15#include <linux/sunrpc/msg_prot.h> 15#include <linux/sunrpc/msg_prot.h>
16#include <linux/sunrpc/cache.h> 16#include <linux/sunrpc/cache.h>
17#include <linux/hash.h> 17#include <linux/hash.h>
18#include <linux/cred.h>
18 19
19struct svc_cred { 20struct svc_cred {
20 uid_t cr_uid; 21 uid_t cr_uid;
21 gid_t cr_gid; 22 gid_t cr_gid;
22 struct group_info *cr_group_info; 23 struct group_info *cr_group_info;
24 u32 cr_flavor; /* pseudoflavor */
25 char *cr_principal; /* for gss */
23}; 26};
24 27
28static inline void free_svc_cred(struct svc_cred *cred)
29{
30 if (cred->cr_group_info)
31 put_group_info(cred->cr_group_info);
32 kfree(cred->cr_principal);
33}
34
25struct svc_rqst; /* forward decl */ 35struct svc_rqst; /* forward decl */
26struct in6_addr; 36struct in6_addr;
27 37
diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h
index 7c32daa025eb..726aff1a5201 100644
--- a/include/linux/sunrpc/svcauth_gss.h
+++ b/include/linux/sunrpc/svcauth_gss.h
@@ -22,7 +22,6 @@ int gss_svc_init_net(struct net *net);
22void gss_svc_shutdown_net(struct net *net); 22void gss_svc_shutdown_net(struct net *net);
23int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name); 23int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name);
24u32 svcauth_gss_flavor(struct auth_domain *dom); 24u32 svcauth_gss_flavor(struct auth_domain *dom);
25char *svc_gss_principal(struct svc_rqst *);
26 25
27#endif /* __KERNEL__ */ 26#endif /* __KERNEL__ */
28#endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */ 27#endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */
diff --git a/include/linux/types.h b/include/linux/types.h
index 7f480db60231..9c1bd539ea70 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -25,7 +25,7 @@ typedef __kernel_dev_t dev_t;
25typedef __kernel_ino_t ino_t; 25typedef __kernel_ino_t ino_t;
26typedef __kernel_mode_t mode_t; 26typedef __kernel_mode_t mode_t;
27typedef unsigned short umode_t; 27typedef unsigned short umode_t;
28typedef __kernel_nlink_t nlink_t; 28typedef __u32 nlink_t;
29typedef __kernel_off_t off_t; 29typedef __kernel_off_t off_t;
30typedef __kernel_pid_t pid_t; 30typedef __kernel_pid_t pid_t;
31typedef __kernel_daddr_t daddr_t; 31typedef __kernel_daddr_t daddr_t;
diff --git a/ipc/shm.c b/ipc/shm.c
index 406c5b208193..5e2cbfdab6fc 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1036,6 +1036,10 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
1036 sfd->file = shp->shm_file; 1036 sfd->file = shp->shm_file;
1037 sfd->vm_ops = NULL; 1037 sfd->vm_ops = NULL;
1038 1038
1039 err = security_mmap_file(file, prot, flags);
1040 if (err)
1041 goto out_fput;
1042
1039 down_write(&current->mm->mmap_sem); 1043 down_write(&current->mm->mmap_sem);
1040 if (addr && !(shmflg & SHM_REMAP)) { 1044 if (addr && !(shmflg & SHM_REMAP)) {
1041 err = -EINVAL; 1045 err = -EINVAL;
@@ -1050,7 +1054,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
1050 goto invalid; 1054 goto invalid;
1051 } 1055 }
1052 1056
1053 user_addr = do_mmap (file, addr, size, prot, flags, 0); 1057 user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
1054 *raddr = user_addr; 1058 *raddr = user_addr;
1055 err = 0; 1059 err = 0;
1056 if (IS_ERR_VALUE(user_addr)) 1060 if (IS_ERR_VALUE(user_addr))
@@ -1058,6 +1062,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
1058invalid: 1062invalid:
1059 up_write(&current->mm->mmap_sem); 1063 up_write(&current->mm->mmap_sem);
1060 1064
1065out_fput:
1061 fput(file); 1066 fput(file);
1062 1067
1063out_nattch: 1068out_nattch:
diff --git a/kernel/Makefile b/kernel/Makefile
index 6f3d0ae044b2..c0cc67ad764c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o groups.o 13 async.o range.o groups.o lglock.o
14 14
15ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
diff --git a/kernel/lglock.c b/kernel/lglock.c
new file mode 100644
index 000000000000..6535a667a5a7
--- /dev/null
+++ b/kernel/lglock.c
@@ -0,0 +1,89 @@
1/* See include/linux/lglock.h for description */
2#include <linux/module.h>
3#include <linux/lglock.h>
4#include <linux/cpu.h>
5#include <linux/string.h>
6
7/*
8 * Note there is no uninit, so lglocks cannot be defined in
9 * modules (but it's fine to use them from there)
10 * Could be added though, just undo lg_lock_init
11 */
12
13void lg_lock_init(struct lglock *lg, char *name)
14{
15 LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
16}
17EXPORT_SYMBOL(lg_lock_init);
18
19void lg_local_lock(struct lglock *lg)
20{
21 arch_spinlock_t *lock;
22
23 preempt_disable();
24 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
25 lock = this_cpu_ptr(lg->lock);
26 arch_spin_lock(lock);
27}
28EXPORT_SYMBOL(lg_local_lock);
29
30void lg_local_unlock(struct lglock *lg)
31{
32 arch_spinlock_t *lock;
33
34 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
35 lock = this_cpu_ptr(lg->lock);
36 arch_spin_unlock(lock);
37 preempt_enable();
38}
39EXPORT_SYMBOL(lg_local_unlock);
40
41void lg_local_lock_cpu(struct lglock *lg, int cpu)
42{
43 arch_spinlock_t *lock;
44
45 preempt_disable();
46 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
47 lock = per_cpu_ptr(lg->lock, cpu);
48 arch_spin_lock(lock);
49}
50EXPORT_SYMBOL(lg_local_lock_cpu);
51
52void lg_local_unlock_cpu(struct lglock *lg, int cpu)
53{
54 arch_spinlock_t *lock;
55
56 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
57 lock = per_cpu_ptr(lg->lock, cpu);
58 arch_spin_unlock(lock);
59 preempt_enable();
60}
61EXPORT_SYMBOL(lg_local_unlock_cpu);
62
63void lg_global_lock(struct lglock *lg)
64{
65 int i;
66
67 preempt_disable();
68 rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
69 for_each_possible_cpu(i) {
70 arch_spinlock_t *lock;
71 lock = per_cpu_ptr(lg->lock, i);
72 arch_spin_lock(lock);
73 }
74}
75EXPORT_SYMBOL(lg_global_lock);
76
77void lg_global_unlock(struct lglock *lg)
78{
79 int i;
80
81 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
82 for_each_possible_cpu(i) {
83 arch_spinlock_t *lock;
84 lock = per_cpu_ptr(lg->lock, i);
85 arch_spin_unlock(lock);
86 }
87 preempt_enable();
88}
89EXPORT_SYMBOL(lg_global_unlock);
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5646c740f613..32e6f4136fa2 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs);
80static int cleancache_get_key(struct inode *inode, 80static int cleancache_get_key(struct inode *inode,
81 struct cleancache_filekey *key) 81 struct cleancache_filekey *key)
82{ 82{
83 int (*fhfn)(struct dentry *, __u32 *fh, int *, int); 83 int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
84 int len = 0, maxlen = CLEANCACHE_KEY_MAX; 84 int len = 0, maxlen = CLEANCACHE_KEY_MAX;
85 struct super_block *sb = inode->i_sb; 85 struct super_block *sb = inode->i_sb;
86 86
@@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode,
88 if (sb->s_export_op != NULL) { 88 if (sb->s_export_op != NULL) {
89 fhfn = sb->s_export_op->encode_fh; 89 fhfn = sb->s_export_op->encode_fh;
90 if (fhfn) { 90 if (fhfn) {
91 struct dentry d; 91 len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
92 d.d_inode = inode;
93 len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
94 if (len <= 0 || len == 255) 92 if (len <= 0 || len == 255)
95 return -1; 93 return -1;
96 if (maxlen > CLEANCACHE_KEY_MAX) 94 if (maxlen > CLEANCACHE_KEY_MAX)
diff --git a/mm/filemap.c b/mm/filemap.c
index 64b48f934b89..a4a5260b0279 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1899,71 +1899,6 @@ struct page *read_cache_page(struct address_space *mapping,
1899} 1899}
1900EXPORT_SYMBOL(read_cache_page); 1900EXPORT_SYMBOL(read_cache_page);
1901 1901
1902/*
1903 * The logic we want is
1904 *
1905 * if suid or (sgid and xgrp)
1906 * remove privs
1907 */
1908int should_remove_suid(struct dentry *dentry)
1909{
1910 umode_t mode = dentry->d_inode->i_mode;
1911 int kill = 0;
1912
1913 /* suid always must be killed */
1914 if (unlikely(mode & S_ISUID))
1915 kill = ATTR_KILL_SUID;
1916
1917 /*
1918 * sgid without any exec bits is just a mandatory locking mark; leave
1919 * it alone. If some exec bits are set, it's a real sgid; kill it.
1920 */
1921 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1922 kill |= ATTR_KILL_SGID;
1923
1924 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1925 return kill;
1926
1927 return 0;
1928}
1929EXPORT_SYMBOL(should_remove_suid);
1930
1931static int __remove_suid(struct dentry *dentry, int kill)
1932{
1933 struct iattr newattrs;
1934
1935 newattrs.ia_valid = ATTR_FORCE | kill;
1936 return notify_change(dentry, &newattrs);
1937}
1938
1939int file_remove_suid(struct file *file)
1940{
1941 struct dentry *dentry = file->f_path.dentry;
1942 struct inode *inode = dentry->d_inode;
1943 int killsuid;
1944 int killpriv;
1945 int error = 0;
1946
1947 /* Fast path for nothing security related */
1948 if (IS_NOSEC(inode))
1949 return 0;
1950
1951 killsuid = should_remove_suid(dentry);
1952 killpriv = security_inode_need_killpriv(dentry);
1953
1954 if (killpriv < 0)
1955 return killpriv;
1956 if (killpriv)
1957 error = security_inode_killpriv(dentry);
1958 if (!error && killsuid)
1959 error = __remove_suid(dentry, killsuid);
1960 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
1961 inode->i_flags |= S_NOSEC;
1962
1963 return error;
1964}
1965EXPORT_SYMBOL(file_remove_suid);
1966
1967static size_t __iovec_copy_from_user_inatomic(char *vaddr, 1902static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1968 const struct iovec *iov, size_t base, size_t bytes) 1903 const struct iovec *iov, size_t base, size_t bytes)
1969{ 1904{
@@ -2489,7 +2424,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2489 if (err) 2424 if (err)
2490 goto out; 2425 goto out;
2491 2426
2492 file_update_time(file); 2427 err = file_update_time(file);
2428 if (err)
2429 goto out;
2493 2430
2494 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2431 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2495 if (unlikely(file->f_flags & O_DIRECT)) { 2432 if (unlikely(file->f_flags & O_DIRECT)) {
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index a4eb31132229..213ca1f53409 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -426,7 +426,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
426 if (ret) 426 if (ret)
427 goto out_backing; 427 goto out_backing;
428 428
429 file_update_time(filp); 429 ret = file_update_time(filp);
430 if (ret)
431 goto out_backing;
430 432
431 ret = __xip_file_write (filp, buf, count, pos, ppos); 433 ret = __xip_file_write (filp, buf, count, pos, ppos);
432 434
diff --git a/mm/internal.h b/mm/internal.h
index 4194ab9dc19b..5cbb78190041 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -350,3 +350,7 @@ extern u64 hwpoison_filter_flags_mask;
350extern u64 hwpoison_filter_flags_value; 350extern u64 hwpoison_filter_flags_value;
351extern u64 hwpoison_filter_memcg; 351extern u64 hwpoison_filter_memcg;
352extern u32 hwpoison_filter_enable; 352extern u32 hwpoison_filter_enable;
353
354extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
355 unsigned long, unsigned long,
356 unsigned long, unsigned long);
diff --git a/mm/mmap.c b/mm/mmap.c
index 4a9c2a391e28..3edfcdfa42d9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -971,15 +971,13 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
971 * The caller must hold down_write(&current->mm->mmap_sem). 971 * The caller must hold down_write(&current->mm->mmap_sem).
972 */ 972 */
973 973
974static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 974unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
975 unsigned long len, unsigned long prot, 975 unsigned long len, unsigned long prot,
976 unsigned long flags, unsigned long pgoff) 976 unsigned long flags, unsigned long pgoff)
977{ 977{
978 struct mm_struct * mm = current->mm; 978 struct mm_struct * mm = current->mm;
979 struct inode *inode; 979 struct inode *inode;
980 vm_flags_t vm_flags; 980 vm_flags_t vm_flags;
981 int error;
982 unsigned long reqprot = prot;
983 981
984 /* 982 /*
985 * Does the application expect PROT_READ to imply PROT_EXEC? 983 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1101,39 +1099,9 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1101 } 1099 }
1102 } 1100 }
1103 1101
1104 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1105 if (error)
1106 return error;
1107
1108 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1102 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1109} 1103}
1110 1104
1111unsigned long do_mmap(struct file *file, unsigned long addr,
1112 unsigned long len, unsigned long prot,
1113 unsigned long flag, unsigned long offset)
1114{
1115 if (unlikely(offset + PAGE_ALIGN(len) < offset))
1116 return -EINVAL;
1117 if (unlikely(offset & ~PAGE_MASK))
1118 return -EINVAL;
1119 return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
1120}
1121EXPORT_SYMBOL(do_mmap);
1122
1123unsigned long vm_mmap(struct file *file, unsigned long addr,
1124 unsigned long len, unsigned long prot,
1125 unsigned long flag, unsigned long offset)
1126{
1127 unsigned long ret;
1128 struct mm_struct *mm = current->mm;
1129
1130 down_write(&mm->mmap_sem);
1131 ret = do_mmap(file, addr, len, prot, flag, offset);
1132 up_write(&mm->mmap_sem);
1133 return ret;
1134}
1135EXPORT_SYMBOL(vm_mmap);
1136
1137SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1105SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1138 unsigned long, prot, unsigned long, flags, 1106 unsigned long, prot, unsigned long, flags,
1139 unsigned long, fd, unsigned long, pgoff) 1107 unsigned long, fd, unsigned long, pgoff)
@@ -1165,10 +1133,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1165 1133
1166 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1134 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1167 1135
1168 down_write(&current->mm->mmap_sem); 1136 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1169 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1170 up_write(&current->mm->mmap_sem);
1171
1172 if (file) 1137 if (file)
1173 fput(file); 1138 fput(file);
1174out: 1139out:
@@ -1629,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1629 if (addr & ~PAGE_MASK) 1594 if (addr & ~PAGE_MASK)
1630 return -EINVAL; 1595 return -EINVAL;
1631 1596
1632 return arch_rebalance_pgtables(addr, len); 1597 addr = arch_rebalance_pgtables(addr, len);
1598 error = security_mmap_addr(addr);
1599 return error ? error : addr;
1633} 1600}
1634 1601
1635EXPORT_SYMBOL(get_unmapped_area); 1602EXPORT_SYMBOL(get_unmapped_area);
@@ -1819,7 +1786,7 @@ int expand_downwards(struct vm_area_struct *vma,
1819 return -ENOMEM; 1786 return -ENOMEM;
1820 1787
1821 address &= PAGE_MASK; 1788 address &= PAGE_MASK;
1822 error = security_file_mmap(NULL, 0, 0, 0, address, 1); 1789 error = security_mmap_addr(address);
1823 if (error) 1790 if (error)
1824 return error; 1791 return error;
1825 1792
@@ -2159,7 +2126,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2159 2126
2160 return 0; 2127 return 0;
2161} 2128}
2162EXPORT_SYMBOL(do_munmap);
2163 2129
2164int vm_munmap(unsigned long start, size_t len) 2130int vm_munmap(unsigned long start, size_t len)
2165{ 2131{
@@ -2207,10 +2173,6 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2207 if (!len) 2173 if (!len)
2208 return addr; 2174 return addr;
2209 2175
2210 error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
2211 if (error)
2212 return error;
2213
2214 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2176 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2215 2177
2216 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2178 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
@@ -2563,10 +2525,6 @@ int install_special_mapping(struct mm_struct *mm,
2563 vma->vm_ops = &special_mapping_vmops; 2525 vma->vm_ops = &special_mapping_vmops;
2564 vma->vm_private_data = pages; 2526 vma->vm_private_data = pages;
2565 2527
2566 ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
2567 if (ret)
2568 goto out;
2569
2570 ret = insert_vm_struct(mm, vma); 2528 ret = insert_vm_struct(mm, vma);
2571 if (ret) 2529 if (ret)
2572 goto out; 2530 goto out;
diff --git a/mm/mremap.c b/mm/mremap.c
index db8d983b5a7d..21fed202ddad 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -371,10 +371,6 @@ static unsigned long mremap_to(unsigned long addr,
371 if ((addr <= new_addr) && (addr+old_len) > new_addr) 371 if ((addr <= new_addr) && (addr+old_len) > new_addr)
372 goto out; 372 goto out;
373 373
374 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
375 if (ret)
376 goto out;
377
378 ret = do_munmap(mm, new_addr, new_len); 374 ret = do_munmap(mm, new_addr, new_len);
379 if (ret) 375 if (ret)
380 goto out; 376 goto out;
@@ -432,15 +428,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
432 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise 428 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
433 * This option implies MREMAP_MAYMOVE. 429 * This option implies MREMAP_MAYMOVE.
434 */ 430 */
435unsigned long do_mremap(unsigned long addr, 431SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
436 unsigned long old_len, unsigned long new_len, 432 unsigned long, new_len, unsigned long, flags,
437 unsigned long flags, unsigned long new_addr) 433 unsigned long, new_addr)
438{ 434{
439 struct mm_struct *mm = current->mm; 435 struct mm_struct *mm = current->mm;
440 struct vm_area_struct *vma; 436 struct vm_area_struct *vma;
441 unsigned long ret = -EINVAL; 437 unsigned long ret = -EINVAL;
442 unsigned long charged = 0; 438 unsigned long charged = 0;
443 439
440 down_write(&current->mm->mmap_sem);
441
444 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) 442 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
445 goto out; 443 goto out;
446 444
@@ -530,25 +528,11 @@ unsigned long do_mremap(unsigned long addr,
530 goto out; 528 goto out;
531 } 529 }
532 530
533 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
534 if (ret)
535 goto out;
536 ret = move_vma(vma, addr, old_len, new_len, new_addr); 531 ret = move_vma(vma, addr, old_len, new_len, new_addr);
537 } 532 }
538out: 533out:
539 if (ret & ~PAGE_MASK) 534 if (ret & ~PAGE_MASK)
540 vm_unacct_memory(charged); 535 vm_unacct_memory(charged);
541 return ret;
542}
543
544SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
545 unsigned long, new_len, unsigned long, flags,
546 unsigned long, new_addr)
547{
548 unsigned long ret;
549
550 down_write(&current->mm->mmap_sem);
551 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
552 up_write(&current->mm->mmap_sem); 536 up_write(&current->mm->mmap_sem);
553 return ret; 537 return ret;
554} 538}
diff --git a/mm/nommu.c b/mm/nommu.c
index bb8f4f004a82..c4acfbc09972 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file,
889 unsigned long *_capabilities) 889 unsigned long *_capabilities)
890{ 890{
891 unsigned long capabilities, rlen; 891 unsigned long capabilities, rlen;
892 unsigned long reqprot = prot;
893 int ret; 892 int ret;
894 893
895 /* do the simple checks first */ 894 /* do the simple checks first */
@@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file,
1047 } 1046 }
1048 1047
1049 /* allow the security API to have its say */ 1048 /* allow the security API to have its say */
1050 ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); 1049 ret = security_mmap_addr(addr);
1051 if (ret < 0) 1050 if (ret < 0)
1052 return ret; 1051 return ret;
1053 1052
@@ -1233,7 +1232,7 @@ enomem:
1233/* 1232/*
1234 * handle mapping creation for uClinux 1233 * handle mapping creation for uClinux
1235 */ 1234 */
1236static unsigned long do_mmap_pgoff(struct file *file, 1235unsigned long do_mmap_pgoff(struct file *file,
1237 unsigned long addr, 1236 unsigned long addr,
1238 unsigned long len, 1237 unsigned long len,
1239 unsigned long prot, 1238 unsigned long prot,
@@ -1471,32 +1470,6 @@ error_getting_region:
1471 return -ENOMEM; 1470 return -ENOMEM;
1472} 1471}
1473 1472
1474unsigned long do_mmap(struct file *file, unsigned long addr,
1475 unsigned long len, unsigned long prot,
1476 unsigned long flag, unsigned long offset)
1477{
1478 if (unlikely(offset + PAGE_ALIGN(len) < offset))
1479 return -EINVAL;
1480 if (unlikely(offset & ~PAGE_MASK))
1481 return -EINVAL;
1482 return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
1483}
1484EXPORT_SYMBOL(do_mmap);
1485
1486unsigned long vm_mmap(struct file *file, unsigned long addr,
1487 unsigned long len, unsigned long prot,
1488 unsigned long flag, unsigned long offset)
1489{
1490 unsigned long ret;
1491 struct mm_struct *mm = current->mm;
1492
1493 down_write(&mm->mmap_sem);
1494 ret = do_mmap(file, addr, len, prot, flag, offset);
1495 up_write(&mm->mmap_sem);
1496 return ret;
1497}
1498EXPORT_SYMBOL(vm_mmap);
1499
1500SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1473SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1501 unsigned long, prot, unsigned long, flags, 1474 unsigned long, prot, unsigned long, flags,
1502 unsigned long, fd, unsigned long, pgoff) 1475 unsigned long, fd, unsigned long, pgoff)
@@ -1513,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1513 1486
1514 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1487 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1515 1488
1516 down_write(&current->mm->mmap_sem); 1489 ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1517 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1518 up_write(&current->mm->mmap_sem);
1519 1490
1520 if (file) 1491 if (file)
1521 fput(file); 1492 fput(file);
diff --git a/mm/shmem.c b/mm/shmem.c
index d576b84d913c..585bd220a21e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2439,11 +2439,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2439 return dentry; 2439 return dentry;
2440} 2440}
2441 2441
2442static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, 2442static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
2443 int connectable) 2443 struct inode *parent)
2444{ 2444{
2445 struct inode *inode = dentry->d_inode;
2446
2447 if (*len < 3) { 2445 if (*len < 3) {
2448 *len = 3; 2446 *len = 3;
2449 return 255; 2447 return 255;
diff --git a/mm/util.c b/mm/util.c
index ae962b31de88..8c7265afa29f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,7 @@
4#include <linux/export.h> 4#include <linux/export.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/security.h>
7#include <asm/uaccess.h> 8#include <asm/uaccess.h>
8 9
9#include "internal.h" 10#include "internal.h"
@@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
341} 342}
342EXPORT_SYMBOL_GPL(get_user_pages_fast); 343EXPORT_SYMBOL_GPL(get_user_pages_fast);
343 344
345unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
346 unsigned long len, unsigned long prot,
347 unsigned long flag, unsigned long pgoff)
348{
349 unsigned long ret;
350 struct mm_struct *mm = current->mm;
351
352 ret = security_mmap_file(file, prot, flag);
353 if (!ret) {
354 down_write(&mm->mmap_sem);
355 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
356 up_write(&mm->mmap_sem);
357 }
358 return ret;
359}
360
361unsigned long vm_mmap(struct file *file, unsigned long addr,
362 unsigned long len, unsigned long prot,
363 unsigned long flag, unsigned long offset)
364{
365 if (unlikely(offset + PAGE_ALIGN(len) < offset))
366 return -EINVAL;
367 if (unlikely(offset & ~PAGE_MASK))
368 return -EINVAL;
369
370 return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
371}
372EXPORT_SYMBOL(vm_mmap);
373
344/* Tracepoints definitions. */ 374/* Tracepoints definitions. */
345EXPORT_TRACEPOINT_SYMBOL(kmalloc); 375EXPORT_TRACEPOINT_SYMBOL(kmalloc);
346EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 376EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 8522a4793374..ca8e0a57d945 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -16,8 +16,6 @@
16#include <net/netlink.h> 16#include <net/netlink.h>
17#include <net/pkt_sched.h> 17#include <net/pkt_sched.h>
18 18
19extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */
20
21/* 19/*
22 * The ATM queuing discipline provides a framework for invoking classifiers 20 * The ATM queuing discipline provides a framework for invoking classifiers
23 * (aka "filters"), which in turn select classes of this queuing discipline. 21 * (aka "filters"), which in turn select classes of this queuing discipline.
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 38f388c39dce..107c4528654f 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -381,21 +381,53 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
381} 381}
382 382
383/* 383/*
384 * We cannot currently handle tokens with rotated data. We need a 384 * We can shift data by up to LOCAL_BUF_LEN bytes in a pass. If we need
385 * generalized routine to rotate the data in place. It is anticipated 385 * to do more than that, we shift repeatedly. Kevin Coffman reports
386 * that we won't encounter rotated data in the general case. 386 * seeing 28 bytes as the value used by Microsoft clients and servers
387 * with AES, so this constant is chosen to allow handling 28 in one pass
388 * without using too much stack space.
389 *
390 * If that proves to a problem perhaps we could use a more clever
391 * algorithm.
387 */ 392 */
388static u32 393#define LOCAL_BUF_LEN 32u
389rotate_left(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, u16 rrc) 394
395static void rotate_buf_a_little(struct xdr_buf *buf, unsigned int shift)
390{ 396{
391 unsigned int realrrc = rrc % (buf->len - offset - GSS_KRB5_TOK_HDR_LEN); 397 char head[LOCAL_BUF_LEN];
398 char tmp[LOCAL_BUF_LEN];
399 unsigned int this_len, i;
400
401 BUG_ON(shift > LOCAL_BUF_LEN);
392 402
393 if (realrrc == 0) 403 read_bytes_from_xdr_buf(buf, 0, head, shift);
394 return 0; 404 for (i = 0; i + shift < buf->len; i += LOCAL_BUF_LEN) {
405 this_len = min(LOCAL_BUF_LEN, buf->len - (i + shift));
406 read_bytes_from_xdr_buf(buf, i+shift, tmp, this_len);
407 write_bytes_to_xdr_buf(buf, i, tmp, this_len);
408 }
409 write_bytes_to_xdr_buf(buf, buf->len - shift, head, shift);
410}
395 411
396 dprintk("%s: cannot process token with rotated data: " 412static void _rotate_left(struct xdr_buf *buf, unsigned int shift)
397 "rrc %u, realrrc %u\n", __func__, rrc, realrrc); 413{
398 return 1; 414 int shifted = 0;
415 int this_shift;
416
417 shift %= buf->len;
418 while (shifted < shift) {
419 this_shift = min(shift - shifted, LOCAL_BUF_LEN);
420 rotate_buf_a_little(buf, this_shift);
421 shifted += this_shift;
422 }
423}
424
425static void rotate_left(u32 base, struct xdr_buf *buf, unsigned int shift)
426{
427 struct xdr_buf subbuf;
428
429 xdr_buf_subsegment(buf, &subbuf, base, buf->len - base);
430 _rotate_left(&subbuf, shift);
399} 431}
400 432
401static u32 433static u32
@@ -495,11 +527,8 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
495 527
496 seqnum = be64_to_cpup((__be64 *)(ptr + 8)); 528 seqnum = be64_to_cpup((__be64 *)(ptr + 8));
497 529
498 if (rrc != 0) { 530 if (rrc != 0)
499 err = rotate_left(kctx, offset, buf, rrc); 531 rotate_left(offset + 16, buf, rrc);
500 if (err)
501 return GSS_S_FAILURE;
502 }
503 532
504 err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf, 533 err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf,
505 &headskip, &tailskip); 534 &headskip, &tailskip);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 3089de37c433..73e957386600 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -336,7 +336,6 @@ struct rsc {
336 struct svc_cred cred; 336 struct svc_cred cred;
337 struct gss_svc_seq_data seqdata; 337 struct gss_svc_seq_data seqdata;
338 struct gss_ctx *mechctx; 338 struct gss_ctx *mechctx;
339 char *client_name;
340}; 339};
341 340
342static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old); 341static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old);
@@ -347,9 +346,7 @@ static void rsc_free(struct rsc *rsci)
347 kfree(rsci->handle.data); 346 kfree(rsci->handle.data);
348 if (rsci->mechctx) 347 if (rsci->mechctx)
349 gss_delete_sec_context(&rsci->mechctx); 348 gss_delete_sec_context(&rsci->mechctx);
350 if (rsci->cred.cr_group_info) 349 free_svc_cred(&rsci->cred);
351 put_group_info(rsci->cred.cr_group_info);
352 kfree(rsci->client_name);
353} 350}
354 351
355static void rsc_put(struct kref *ref) 352static void rsc_put(struct kref *ref)
@@ -387,7 +384,7 @@ rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
387 tmp->handle.data = NULL; 384 tmp->handle.data = NULL;
388 new->mechctx = NULL; 385 new->mechctx = NULL;
389 new->cred.cr_group_info = NULL; 386 new->cred.cr_group_info = NULL;
390 new->client_name = NULL; 387 new->cred.cr_principal = NULL;
391} 388}
392 389
393static void 390static void
@@ -402,8 +399,8 @@ update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
402 spin_lock_init(&new->seqdata.sd_lock); 399 spin_lock_init(&new->seqdata.sd_lock);
403 new->cred = tmp->cred; 400 new->cred = tmp->cred;
404 tmp->cred.cr_group_info = NULL; 401 tmp->cred.cr_group_info = NULL;
405 new->client_name = tmp->client_name; 402 new->cred.cr_principal = tmp->cred.cr_principal;
406 tmp->client_name = NULL; 403 tmp->cred.cr_principal = NULL;
407} 404}
408 405
409static struct cache_head * 406static struct cache_head *
@@ -501,8 +498,8 @@ static int rsc_parse(struct cache_detail *cd,
501 /* get client name */ 498 /* get client name */
502 len = qword_get(&mesg, buf, mlen); 499 len = qword_get(&mesg, buf, mlen);
503 if (len > 0) { 500 if (len > 0) {
504 rsci.client_name = kstrdup(buf, GFP_KERNEL); 501 rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL);
505 if (!rsci.client_name) 502 if (!rsci.cred.cr_principal)
506 goto out; 503 goto out;
507 } 504 }
508 505
@@ -932,16 +929,6 @@ struct gss_svc_data {
932 struct rsc *rsci; 929 struct rsc *rsci;
933}; 930};
934 931
935char *svc_gss_principal(struct svc_rqst *rqstp)
936{
937 struct gss_svc_data *gd = (struct gss_svc_data *)rqstp->rq_auth_data;
938
939 if (gd && gd->rsci)
940 return gd->rsci->client_name;
941 return NULL;
942}
943EXPORT_SYMBOL_GPL(svc_gss_principal);
944
945static int 932static int
946svcauth_gss_set_client(struct svc_rqst *rqstp) 933svcauth_gss_set_client(struct svc_rqst *rqstp)
947{ 934{
@@ -1220,7 +1207,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
1220 } 1207 }
1221 svcdata->rsci = rsci; 1208 svcdata->rsci = rsci;
1222 cache_get(&rsci->h); 1209 cache_get(&rsci->h);
1223 rqstp->rq_flavor = gss_svc_to_pseudoflavor( 1210 rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor(
1224 rsci->mechctx->mech_type, gc->gc_svc); 1211 rsci->mechctx->mech_type, gc->gc_svc);
1225 ret = SVC_OK; 1212 ret = SVC_OK;
1226 goto out; 1213 goto out;
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 3c0653439f3d..92509ffe15fc 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -180,14 +180,16 @@ void rpcb_put_local(struct net *net)
180 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); 180 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
181 struct rpc_clnt *clnt = sn->rpcb_local_clnt; 181 struct rpc_clnt *clnt = sn->rpcb_local_clnt;
182 struct rpc_clnt *clnt4 = sn->rpcb_local_clnt4; 182 struct rpc_clnt *clnt4 = sn->rpcb_local_clnt4;
183 int shutdown; 183 int shutdown = 0;
184 184
185 spin_lock(&sn->rpcb_clnt_lock); 185 spin_lock(&sn->rpcb_clnt_lock);
186 if (--sn->rpcb_users == 0) { 186 if (sn->rpcb_users) {
187 sn->rpcb_local_clnt = NULL; 187 if (--sn->rpcb_users == 0) {
188 sn->rpcb_local_clnt4 = NULL; 188 sn->rpcb_local_clnt = NULL;
189 sn->rpcb_local_clnt4 = NULL;
190 }
191 shutdown = !sn->rpcb_users;
189 } 192 }
190 shutdown = !sn->rpcb_users;
191 spin_unlock(&sn->rpcb_clnt_lock); 193 spin_unlock(&sn->rpcb_clnt_lock);
192 194
193 if (shutdown) { 195 if (shutdown) {
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 017c0117d154..7e9baaa1e543 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -407,6 +407,14 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
407 return 0; 407 return 0;
408} 408}
409 409
410int svc_bind(struct svc_serv *serv, struct net *net)
411{
412 if (!svc_uses_rpcbind(serv))
413 return 0;
414 return svc_rpcb_setup(serv, net);
415}
416EXPORT_SYMBOL_GPL(svc_bind);
417
410/* 418/*
411 * Create an RPC service 419 * Create an RPC service
412 */ 420 */
@@ -471,15 +479,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
471 spin_lock_init(&pool->sp_lock); 479 spin_lock_init(&pool->sp_lock);
472 } 480 }
473 481
474 if (svc_uses_rpcbind(serv)) { 482 if (svc_uses_rpcbind(serv) && (!serv->sv_shutdown))
475 if (svc_rpcb_setup(serv, current->nsproxy->net_ns) < 0) { 483 serv->sv_shutdown = svc_rpcb_cleanup;
476 kfree(serv->sv_pools);
477 kfree(serv);
478 return NULL;
479 }
480 if (!serv->sv_shutdown)
481 serv->sv_shutdown = svc_rpcb_cleanup;
482 }
483 484
484 return serv; 485 return serv;
485} 486}
@@ -536,8 +537,6 @@ EXPORT_SYMBOL_GPL(svc_shutdown_net);
536void 537void
537svc_destroy(struct svc_serv *serv) 538svc_destroy(struct svc_serv *serv)
538{ 539{
539 struct net *net = current->nsproxy->net_ns;
540
541 dprintk("svc: svc_destroy(%s, %d)\n", 540 dprintk("svc: svc_destroy(%s, %d)\n",
542 serv->sv_program->pg_name, 541 serv->sv_program->pg_name,
543 serv->sv_nrthreads); 542 serv->sv_nrthreads);
@@ -552,8 +551,6 @@ svc_destroy(struct svc_serv *serv)
552 551
553 del_timer_sync(&serv->sv_temptimer); 552 del_timer_sync(&serv->sv_temptimer);
554 553
555 svc_shutdown_net(serv, net);
556
557 /* 554 /*
558 * The last user is gone and thus all sockets have to be destroyed to 555 * The last user is gone and thus all sockets have to be destroyed to
559 * the point. Check this. 556 * the point. Check this.
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b98ee3514912..88f2bf671960 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -598,6 +598,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
598 598
599 /* now allocate needed pages. If we get a failure, sleep briefly */ 599 /* now allocate needed pages. If we get a failure, sleep briefly */
600 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; 600 pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
601 BUG_ON(pages >= RPCSVC_MAXPAGES);
601 for (i = 0; i < pages ; i++) 602 for (i = 0; i < pages ; i++)
602 while (rqstp->rq_pages[i] == NULL) { 603 while (rqstp->rq_pages[i] == NULL) {
603 struct page *p = alloc_page(GFP_KERNEL); 604 struct page *p = alloc_page(GFP_KERNEL);
@@ -612,7 +613,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
612 rqstp->rq_pages[i] = p; 613 rqstp->rq_pages[i] = p;
613 } 614 }
614 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ 615 rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
615 BUG_ON(pages >= RPCSVC_MAXPAGES);
616 616
617 /* Make arg->head point to first page and arg->pages point to rest */ 617 /* Make arg->head point to first page and arg->pages point to rest */
618 arg = &rqstp->rq_arg; 618 arg = &rqstp->rq_arg;
@@ -973,7 +973,7 @@ void svc_close_net(struct svc_serv *serv, struct net *net)
973 svc_clear_pools(serv, net); 973 svc_clear_pools(serv, net);
974 /* 974 /*
975 * At this point the sp_sockets lists will stay empty, since 975 * At this point the sp_sockets lists will stay empty, since
976 * svc_enqueue will not add new entries without taking the 976 * svc_xprt_enqueue will not add new entries without taking the
977 * sp_lock and checking XPT_BUSY. 977 * sp_lock and checking XPT_BUSY.
978 */ 978 */
979 svc_clear_list(&serv->sv_tempsocks, net); 979 svc_clear_list(&serv->sv_tempsocks, net);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 6138c925923d..2777fa896645 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -746,6 +746,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
746 struct svc_cred *cred = &rqstp->rq_cred; 746 struct svc_cred *cred = &rqstp->rq_cred;
747 747
748 cred->cr_group_info = NULL; 748 cred->cr_group_info = NULL;
749 cred->cr_principal = NULL;
749 rqstp->rq_client = NULL; 750 rqstp->rq_client = NULL;
750 751
751 if (argv->iov_len < 3*4) 752 if (argv->iov_len < 3*4)
@@ -773,7 +774,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
773 svc_putnl(resv, RPC_AUTH_NULL); 774 svc_putnl(resv, RPC_AUTH_NULL);
774 svc_putnl(resv, 0); 775 svc_putnl(resv, 0);
775 776
776 rqstp->rq_flavor = RPC_AUTH_NULL; 777 rqstp->rq_cred.cr_flavor = RPC_AUTH_NULL;
777 return SVC_OK; 778 return SVC_OK;
778} 779}
779 780
@@ -811,6 +812,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
811 int len = argv->iov_len; 812 int len = argv->iov_len;
812 813
813 cred->cr_group_info = NULL; 814 cred->cr_group_info = NULL;
815 cred->cr_principal = NULL;
814 rqstp->rq_client = NULL; 816 rqstp->rq_client = NULL;
815 817
816 if ((len -= 3*4) < 0) 818 if ((len -= 3*4) < 0)
@@ -847,7 +849,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
847 svc_putnl(resv, RPC_AUTH_NULL); 849 svc_putnl(resv, RPC_AUTH_NULL);
848 svc_putnl(resv, 0); 850 svc_putnl(resv, 0);
849 851
850 rqstp->rq_flavor = RPC_AUTH_UNIX; 852 rqstp->rq_cred.cr_flavor = RPC_AUTH_UNIX;
851 return SVC_OK; 853 return SVC_OK;
852 854
853badcred: 855badcred:
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 032daab449b0..8ea39aabe948 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -490,17 +490,9 @@ static int common_mmap(int op, struct file *file, unsigned long prot,
490 return common_file_perm(op, file, mask); 490 return common_file_perm(op, file, mask);
491} 491}
492 492
493static int apparmor_file_mmap(struct file *file, unsigned long reqprot, 493static int apparmor_mmap_file(struct file *file, unsigned long reqprot,
494 unsigned long prot, unsigned long flags, 494 unsigned long prot, unsigned long flags)
495 unsigned long addr, unsigned long addr_only)
496{ 495{
497 int rc = 0;
498
499 /* do DAC check */
500 rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
501 if (rc || addr_only)
502 return rc;
503
504 return common_mmap(OP_FMMAP, file, prot, flags); 496 return common_mmap(OP_FMMAP, file, prot, flags);
505} 497}
506 498
@@ -646,7 +638,8 @@ static struct security_operations apparmor_ops = {
646 .file_permission = apparmor_file_permission, 638 .file_permission = apparmor_file_permission,
647 .file_alloc_security = apparmor_file_alloc_security, 639 .file_alloc_security = apparmor_file_alloc_security,
648 .file_free_security = apparmor_file_free_security, 640 .file_free_security = apparmor_file_free_security,
649 .file_mmap = apparmor_file_mmap, 641 .mmap_file = apparmor_mmap_file,
642 .mmap_addr = cap_mmap_addr,
650 .file_mprotect = apparmor_file_mprotect, 643 .file_mprotect = apparmor_file_mprotect,
651 .file_lock = apparmor_file_lock, 644 .file_lock = apparmor_file_lock,
652 645
diff --git a/security/capability.c b/security/capability.c
index fca889676c5e..61095df8b89a 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -949,7 +949,8 @@ void __init security_fixup_ops(struct security_operations *ops)
949 set_to_cap_if_null(ops, file_alloc_security); 949 set_to_cap_if_null(ops, file_alloc_security);
950 set_to_cap_if_null(ops, file_free_security); 950 set_to_cap_if_null(ops, file_free_security);
951 set_to_cap_if_null(ops, file_ioctl); 951 set_to_cap_if_null(ops, file_ioctl);
952 set_to_cap_if_null(ops, file_mmap); 952 set_to_cap_if_null(ops, mmap_addr);
953 set_to_cap_if_null(ops, mmap_file);
953 set_to_cap_if_null(ops, file_mprotect); 954 set_to_cap_if_null(ops, file_mprotect);
954 set_to_cap_if_null(ops, file_lock); 955 set_to_cap_if_null(ops, file_lock);
955 set_to_cap_if_null(ops, file_fcntl); 956 set_to_cap_if_null(ops, file_fcntl);
diff --git a/security/commoncap.c b/security/commoncap.c
index e771cb1b2d79..6dbae4650abe 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -958,22 +958,15 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
958} 958}
959 959
960/* 960/*
961 * cap_file_mmap - check if able to map given addr 961 * cap_mmap_addr - check if able to map given addr
962 * @file: unused
963 * @reqprot: unused
964 * @prot: unused
965 * @flags: unused
966 * @addr: address attempting to be mapped 962 * @addr: address attempting to be mapped
967 * @addr_only: unused
968 * 963 *
969 * If the process is attempting to map memory below dac_mmap_min_addr they need 964 * If the process is attempting to map memory below dac_mmap_min_addr they need
970 * CAP_SYS_RAWIO. The other parameters to this function are unused by the 965 * CAP_SYS_RAWIO. The other parameters to this function are unused by the
971 * capability security module. Returns 0 if this mapping should be allowed 966 * capability security module. Returns 0 if this mapping should be allowed
972 * -EPERM if not. 967 * -EPERM if not.
973 */ 968 */
974int cap_file_mmap(struct file *file, unsigned long reqprot, 969int cap_mmap_addr(unsigned long addr)
975 unsigned long prot, unsigned long flags,
976 unsigned long addr, unsigned long addr_only)
977{ 970{
978 int ret = 0; 971 int ret = 0;
979 972
@@ -986,3 +979,9 @@ int cap_file_mmap(struct file *file, unsigned long reqprot,
986 } 979 }
987 return ret; 980 return ret;
988} 981}
982
983int cap_mmap_file(struct file *file, unsigned long reqprot,
984 unsigned long prot, unsigned long flags)
985{
986 return 0;
987}
diff --git a/security/security.c b/security/security.c
index 5497a57fba01..3efc9b12aef4 100644
--- a/security/security.c
+++ b/security/security.c
@@ -20,6 +20,9 @@
20#include <linux/ima.h> 20#include <linux/ima.h>
21#include <linux/evm.h> 21#include <linux/evm.h>
22#include <linux/fsnotify.h> 22#include <linux/fsnotify.h>
23#include <linux/mman.h>
24#include <linux/mount.h>
25#include <linux/personality.h>
23#include <net/flow.h> 26#include <net/flow.h>
24 27
25#define MAX_LSM_EVM_XATTR 2 28#define MAX_LSM_EVM_XATTR 2
@@ -657,18 +660,56 @@ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
657 return security_ops->file_ioctl(file, cmd, arg); 660 return security_ops->file_ioctl(file, cmd, arg);
658} 661}
659 662
660int security_file_mmap(struct file *file, unsigned long reqprot, 663static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
661 unsigned long prot, unsigned long flags,
662 unsigned long addr, unsigned long addr_only)
663{ 664{
664 int ret; 665 /*
666 * Does we have PROT_READ and does the application expect
667 * it to imply PROT_EXEC? If not, nothing to talk about...
668 */
669 if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ)
670 return prot;
671 if (!(current->personality & READ_IMPLIES_EXEC))
672 return prot;
673 /*
674 * if that's an anonymous mapping, let it.
675 */
676 if (!file)
677 return prot | PROT_EXEC;
678 /*
679 * ditto if it's not on noexec mount, except that on !MMU we need
680 * BDI_CAP_EXEC_MMAP (== VM_MAYEXEC) in this case
681 */
682 if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) {
683#ifndef CONFIG_MMU
684 unsigned long caps = 0;
685 struct address_space *mapping = file->f_mapping;
686 if (mapping && mapping->backing_dev_info)
687 caps = mapping->backing_dev_info->capabilities;
688 if (!(caps & BDI_CAP_EXEC_MAP))
689 return prot;
690#endif
691 return prot | PROT_EXEC;
692 }
693 /* anything on noexec mount won't get PROT_EXEC */
694 return prot;
695}
665 696
666 ret = security_ops->file_mmap(file, reqprot, prot, flags, addr, addr_only); 697int security_mmap_file(struct file *file, unsigned long prot,
698 unsigned long flags)
699{
700 int ret;
701 ret = security_ops->mmap_file(file, prot,
702 mmap_prot(file, prot), flags);
667 if (ret) 703 if (ret)
668 return ret; 704 return ret;
669 return ima_file_mmap(file, prot); 705 return ima_file_mmap(file, prot);
670} 706}
671 707
708int security_mmap_addr(unsigned long addr)
709{
710 return security_ops->mmap_addr(addr);
711}
712
672int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, 713int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
673 unsigned long prot) 714 unsigned long prot)
674{ 715{
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index fa2341b68331..372ec6502aa8 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3083,9 +3083,7 @@ error:
3083 return rc; 3083 return rc;
3084} 3084}
3085 3085
3086static int selinux_file_mmap(struct file *file, unsigned long reqprot, 3086static int selinux_mmap_addr(unsigned long addr)
3087 unsigned long prot, unsigned long flags,
3088 unsigned long addr, unsigned long addr_only)
3089{ 3087{
3090 int rc = 0; 3088 int rc = 0;
3091 u32 sid = current_sid(); 3089 u32 sid = current_sid();
@@ -3104,10 +3102,12 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot,
3104 } 3102 }
3105 3103
3106 /* do DAC check on address space usage */ 3104 /* do DAC check on address space usage */
3107 rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only); 3105 return cap_mmap_addr(addr);
3108 if (rc || addr_only) 3106}
3109 return rc;
3110 3107
3108static int selinux_mmap_file(struct file *file, unsigned long reqprot,
3109 unsigned long prot, unsigned long flags)
3110{
3111 if (selinux_checkreqprot) 3111 if (selinux_checkreqprot)
3112 prot = reqprot; 3112 prot = reqprot;
3113 3113
@@ -5570,7 +5570,8 @@ static struct security_operations selinux_ops = {
5570 .file_alloc_security = selinux_file_alloc_security, 5570 .file_alloc_security = selinux_file_alloc_security,
5571 .file_free_security = selinux_file_free_security, 5571 .file_free_security = selinux_file_free_security,
5572 .file_ioctl = selinux_file_ioctl, 5572 .file_ioctl = selinux_file_ioctl,
5573 .file_mmap = selinux_file_mmap, 5573 .mmap_file = selinux_mmap_file,
5574 .mmap_addr = selinux_mmap_addr,
5574 .file_mprotect = selinux_file_mprotect, 5575 .file_mprotect = selinux_file_mprotect,
5575 .file_lock = selinux_file_lock, 5576 .file_lock = selinux_file_lock,
5576 .file_fcntl = selinux_file_fcntl, 5577 .file_fcntl = selinux_file_fcntl,
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 4e93f9ef970b..3ad290251288 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1259,12 +1259,8 @@ static int sel_make_bools(void)
1259 if (!inode) 1259 if (!inode)
1260 goto out; 1260 goto out;
1261 1261
1262 ret = -EINVAL;
1263 len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]);
1264 if (len < 0)
1265 goto out;
1266
1267 ret = -ENAMETOOLONG; 1262 ret = -ENAMETOOLONG;
1263 len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]);
1268 if (len >= PAGE_SIZE) 1264 if (len >= PAGE_SIZE)
1269 goto out; 1265 goto out;
1270 1266
@@ -1557,19 +1553,10 @@ static inline u32 sel_ino_to_perm(unsigned long ino)
1557static ssize_t sel_read_class(struct file *file, char __user *buf, 1553static ssize_t sel_read_class(struct file *file, char __user *buf,
1558 size_t count, loff_t *ppos) 1554 size_t count, loff_t *ppos)
1559{ 1555{
1560 ssize_t rc, len;
1561 char *page;
1562 unsigned long ino = file->f_path.dentry->d_inode->i_ino; 1556 unsigned long ino = file->f_path.dentry->d_inode->i_ino;
1563 1557 char res[TMPBUFLEN];
1564 page = (char *)__get_free_page(GFP_KERNEL); 1558 ssize_t len = snprintf(res, sizeof(res), "%d", sel_ino_to_class(ino));
1565 if (!page) 1559 return simple_read_from_buffer(buf, count, ppos, res, len);
1566 return -ENOMEM;
1567
1568 len = snprintf(page, PAGE_SIZE, "%d", sel_ino_to_class(ino));
1569 rc = simple_read_from_buffer(buf, count, ppos, page, len);
1570 free_page((unsigned long)page);
1571
1572 return rc;
1573} 1560}
1574 1561
1575static const struct file_operations sel_class_ops = { 1562static const struct file_operations sel_class_ops = {
@@ -1580,19 +1567,10 @@ static const struct file_operations sel_class_ops = {
1580static ssize_t sel_read_perm(struct file *file, char __user *buf, 1567static ssize_t sel_read_perm(struct file *file, char __user *buf,
1581 size_t count, loff_t *ppos) 1568 size_t count, loff_t *ppos)
1582{ 1569{
1583 ssize_t rc, len;
1584 char *page;
1585 unsigned long ino = file->f_path.dentry->d_inode->i_ino; 1570 unsigned long ino = file->f_path.dentry->d_inode->i_ino;
1586 1571 char res[TMPBUFLEN];
1587 page = (char *)__get_free_page(GFP_KERNEL); 1572 ssize_t len = snprintf(res, sizeof(res), "%d", sel_ino_to_perm(ino));
1588 if (!page) 1573 return simple_read_from_buffer(buf, count, ppos, res, len);
1589 return -ENOMEM;
1590
1591 len = snprintf(page, PAGE_SIZE, "%d", sel_ino_to_perm(ino));
1592 rc = simple_read_from_buffer(buf, count, ppos, page, len);
1593 free_page((unsigned long)page);
1594
1595 return rc;
1596} 1574}
1597 1575
1598static const struct file_operations sel_perm_ops = { 1576static const struct file_operations sel_perm_ops = {
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index d583c0545808..ee0bb5735f35 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1171,7 +1171,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
1171} 1171}
1172 1172
1173/** 1173/**
1174 * smack_file_mmap : 1174 * smack_mmap_file :
1175 * Check permissions for a mmap operation. The @file may be NULL, e.g. 1175 * Check permissions for a mmap operation. The @file may be NULL, e.g.
1176 * if mapping anonymous memory. 1176 * if mapping anonymous memory.
1177 * @file contains the file structure for file to map (may be NULL). 1177 * @file contains the file structure for file to map (may be NULL).
@@ -1180,10 +1180,9 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
1180 * @flags contains the operational flags. 1180 * @flags contains the operational flags.
1181 * Return 0 if permission is granted. 1181 * Return 0 if permission is granted.
1182 */ 1182 */
1183static int smack_file_mmap(struct file *file, 1183static int smack_mmap_file(struct file *file,
1184 unsigned long reqprot, unsigned long prot, 1184 unsigned long reqprot, unsigned long prot,
1185 unsigned long flags, unsigned long addr, 1185 unsigned long flags)
1186 unsigned long addr_only)
1187{ 1186{
1188 struct smack_known *skp; 1187 struct smack_known *skp;
1189 struct smack_rule *srp; 1188 struct smack_rule *srp;
@@ -1198,11 +1197,6 @@ static int smack_file_mmap(struct file *file,
1198 int tmay; 1197 int tmay;
1199 int rc; 1198 int rc;
1200 1199
1201 /* do DAC check on address space usage */
1202 rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
1203 if (rc || addr_only)
1204 return rc;
1205
1206 if (file == NULL || file->f_dentry == NULL) 1200 if (file == NULL || file->f_dentry == NULL)
1207 return 0; 1201 return 0;
1208 1202
@@ -3482,7 +3476,8 @@ struct security_operations smack_ops = {
3482 .file_ioctl = smack_file_ioctl, 3476 .file_ioctl = smack_file_ioctl,
3483 .file_lock = smack_file_lock, 3477 .file_lock = smack_file_lock,
3484 .file_fcntl = smack_file_fcntl, 3478 .file_fcntl = smack_file_fcntl,
3485 .file_mmap = smack_file_mmap, 3479 .mmap_file = smack_mmap_file,
3480 .mmap_addr = cap_mmap_addr,
3486 .file_set_fowner = smack_file_set_fowner, 3481 .file_set_fowner = smack_file_set_fowner,
3487 .file_send_sigiotask = smack_file_send_sigiotask, 3482 .file_send_sigiotask = smack_file_send_sigiotask,
3488 .file_receive = smack_file_receive, 3483 .file_receive = smack_file_receive,