aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/conv.c6
-rw-r--r--fs/9p/vfs_inode.c6
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/befs/linuxvfs.c11
-rw-r--r--fs/binfmt_elf.c17
-rw-r--r--fs/binfmt_elf_fdpic.c980
-rw-r--r--fs/block_dev.c180
-rw-r--r--fs/buffer.c7
-rw-r--r--fs/char_dev.c22
-rw-r--r--fs/cifs/CHANGES10
-rw-r--r--fs/cifs/README2
-rw-r--r--fs/cifs/cifsencrypt.c3
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h18
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c28
-rw-r--r--fs/cifs/connect.c32
-rw-r--r--fs/cifs/dir.c4
-rw-r--r--fs/cifs/file.c97
-rw-r--r--fs/cifs/netmisc.c1
-rw-r--r--fs/cifs/readdir.c13
-rw-r--r--fs/cifs/sess.c2
-rw-r--r--fs/cifs/smberr.h1
-rw-r--r--fs/cifs/transport.c618
-rw-r--r--fs/cifs/xattr.c6
-rw-r--r--fs/coda/file.c4
-rw-r--r--fs/dcache.c6
-rw-r--r--fs/direct-io.c6
-rw-r--r--fs/efs/symlink.c3
-rw-r--r--fs/eventpoll.c8
-rw-r--r--fs/exec.c10
-rw-r--r--fs/ext2/super.c43
-rw-r--r--fs/ext3/acl.h3
-rw-r--r--fs/ext3/balloc.c6
-rw-r--r--fs/ext3/inode.c32
-rw-r--r--fs/ext3/namei.c15
-rw-r--r--fs/ext3/super.c44
-rw-r--r--fs/file.c14
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/dir.c47
-rw-r--r--fs/fuse/file.c10
-rw-r--r--fs/fuse/fuse_i.h2
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/hugetlbfs/inode.c4
-rw-r--r--fs/inotify_user.c2
-rw-r--r--fs/ioprio.c30
-rw-r--r--fs/jbd/commit.c6
-rw-r--r--fs/jbd/journal.c92
-rw-r--r--fs/jbd/transaction.c11
-rw-r--r--fs/jffs2/acl.c4
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/malloc.c2
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/readinode.c1
-rw-r--r--fs/jffs2/scan.c4
-rw-r--r--fs/jffs2/summary.c5
-rw-r--r--fs/jffs2/xattr.c45
-rw-r--r--fs/jfs/inode.c16
-rw-r--r--fs/jfs/jfs_inode.h1
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/namei.c33
-rw-r--r--fs/jfs/super.c118
-rw-r--r--fs/lockd/clntproc.c26
-rw-r--r--fs/lockd/svclock.c12
-rw-r--r--fs/lockd/svcsubs.c15
-rw-r--r--fs/locks.c29
-rw-r--r--fs/minix/inode.c13
-rw-r--r--fs/namei.c47
-rw-r--r--fs/nfs/dir.c4
-rw-r--r--fs/nfs/direct.c427
-rw-r--r--fs/nfs/file.c8
-rw-r--r--fs/nfs/idmap.c4
-rw-r--r--fs/nfs/namespace.c4
-rw-r--r--fs/nfs/nfs4proc.c103
-rw-r--r--fs/nfs/nfs4xdr.c21
-rw-r--r--fs/nfs/read.c49
-rw-r--r--fs/nfs/write.c59
-rw-r--r--fs/nfsd/nfs4proc.c8
-rw-r--r--fs/nfsd/nfsfh.c20
-rw-r--r--fs/nfsd/stats.c10
-rw-r--r--fs/ntfs/inode.c33
-rw-r--r--fs/ntfs/super.c31
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c1
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c43
-rw-r--r--fs/ocfs2/localalloc.c8
-rw-r--r--fs/ocfs2/ocfs2.h2
-rw-r--r--fs/ocfs2/suballoc.c261
-rw-r--r--fs/ocfs2/suballoc.h2
-rw-r--r--fs/ocfs2/super.c8
-rw-r--r--fs/partitions/Kconfig2
-rw-r--r--fs/partitions/check.c1
-rw-r--r--fs/partitions/sun.c2
-rw-r--r--fs/proc/array.c6
-rw-r--r--fs/proc/base.c33
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/kcore.c4
-rw-r--r--fs/proc/proc_misc.c2
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/ramfs/file-nommu.c4
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/reiserfs/file.c8
-rw-r--r--fs/reiserfs/inode.c26
-rw-r--r--fs/reiserfs/ioctl.c2
-rw-r--r--fs/reiserfs/procfs.c25
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.c2
-rw-r--r--fs/splice.c238
-rw-r--r--fs/super.c12
-rw-r--r--fs/sysfs/inode.c12
-rw-r--r--fs/udf/ialloc.c11
-rw-r--r--fs/udf/super.c9
-rw-r--r--fs/udf/truncate.c64
-rw-r--r--fs/ufs/balloc.c2
-rw-r--r--fs/ufs/inode.c35
-rw-r--r--fs/ufs/namei.c3
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/ufs/truncate.c77
-rw-r--r--fs/ufs/util.c17
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c18
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h4
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c27
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c7
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c21
-rw-r--r--fs/xfs/xfs_alloc.c103
-rw-r--r--fs/xfs/xfs_alloc.h20
-rw-r--r--fs/xfs/xfs_bmap.c2
-rw-r--r--fs/xfs/xfs_fsops.c16
-rw-r--r--fs/xfs/xfs_inode.c17
-rw-r--r--fs/xfs/xfs_log.c12
-rw-r--r--fs/xfs/xfs_mount.c32
-rw-r--r--fs/xfs/xfs_vfsops.c5
135 files changed, 3413 insertions, 1449 deletions
diff --git a/fs/9p/conv.c b/fs/9p/conv.c
index 1e898144eb7c..56d88c1a09c5 100644
--- a/fs/9p/conv.c
+++ b/fs/9p/conv.c
@@ -673,8 +673,10 @@ struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode,
673 struct cbuf *bufp = &buffer; 673 struct cbuf *bufp = &buffer;
674 674
675 size = 4 + 2 + strlen(name) + 4 + 1; /* fid[4] name[s] perm[4] mode[1] */ 675 size = 4 + 2 + strlen(name) + 4 + 1; /* fid[4] name[s] perm[4] mode[1] */
676 if (extended && extension!=NULL) 676 if (extended) {
677 size += 2 + strlen(extension); /* extension[s] */ 677 size += 2 + /* extension[s] */
678 (extension == NULL ? 0 : strlen(extension));
679 }
678 680
679 fc = v9fs_create_common(bufp, size, TCREATE); 681 fc = v9fs_create_common(bufp, size, TCREATE);
680 if (IS_ERR(fc)) 682 if (IS_ERR(fc))
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2f580a197b8d..eae50c9d6dc4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -434,11 +434,11 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
434 result = v9fs_t_remove(v9ses, fid, &fcall); 434 result = v9fs_t_remove(v9ses, fid, &fcall);
435 if (result < 0) { 435 if (result < 0) {
436 PRINT_FCALL_ERROR("remove fails", fcall); 436 PRINT_FCALL_ERROR("remove fails", fcall);
437 } else {
438 v9fs_put_idpool(fid, &v9ses->fidpool);
439 v9fs_fid_destroy(v9fid);
440 } 437 }
441 438
439 v9fs_put_idpool(fid, &v9ses->fidpool);
440 v9fs_fid_destroy(v9fid);
441
442 kfree(fcall); 442 kfree(fcall);
443 return result; 443 return result;
444} 444}
diff --git a/fs/Kconfig b/fs/Kconfig
index 53f5c6d61121..3f00a9faabcb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1801,6 +1801,7 @@ config CIFS_POSIX
1801 1801
1802config CIFS_DEBUG2 1802config CIFS_DEBUG2
1803 bool "Enable additional CIFS debugging routines" 1803 bool "Enable additional CIFS debugging routines"
1804 depends on CIFS
1804 help 1805 help
1805 Enabling this option adds a few more debugging routines 1806 Enabling this option adds a few more debugging routines
1806 to the cifs code which slightly increases the size of 1807 to the cifs code which slightly increases the size of
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index ba1c88af49fe..82011019494c 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -308,7 +308,7 @@ static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_di
308 if (adfs_checkmap(sb, dm)) 308 if (adfs_checkmap(sb, dm))
309 return dm; 309 return dm;
310 310
311 adfs_error(sb, NULL, "map corrupted"); 311 adfs_error(sb, "map corrupted");
312 312
313error_free: 313error_free:
314 while (--zone >= 0) 314 while (--zone >= 0)
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index fcaeead9696b..50cfca5c7efd 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -512,7 +512,11 @@ befs_utf2nls(struct super_block *sb, const char *in,
512 wchar_t uni; 512 wchar_t uni;
513 int unilen, utflen; 513 int unilen, utflen;
514 char *result; 514 char *result;
515 int maxlen = in_len; /* The utf8->nls conversion can't make more chars */ 515 /* The utf8->nls conversion won't make the final nls string bigger
516 * than the utf one, but if the string is pure ascii they'll have the
517 * same width and an extra char is needed to save the additional \0
518 */
519 int maxlen = in_len + 1;
516 520
517 befs_debug(sb, "---> utf2nls()"); 521 befs_debug(sb, "---> utf2nls()");
518 522
@@ -588,7 +592,10 @@ befs_nls2utf(struct super_block *sb, const char *in,
588 wchar_t uni; 592 wchar_t uni;
589 int unilen, utflen; 593 int unilen, utflen;
590 char *result; 594 char *result;
591 int maxlen = 3 * in_len; 595 /* There're nls characters that will translate to 3-chars-wide UTF-8
596 * characters, a additional byte is needed to save the final \0
597 * in special cases */
598 int maxlen = (3 * in_len) + 1;
592 599
593 befs_debug(sb, "---> nls2utf()\n"); 600 befs_debug(sb, "---> nls2utf()\n");
594 601
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d0434406eaeb..672a3b90bc55 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -84,7 +84,7 @@ static struct linux_binfmt elf_format = {
84 .min_coredump = ELF_EXEC_PAGESIZE 84 .min_coredump = ELF_EXEC_PAGESIZE
85}; 85};
86 86
87#define BAD_ADDR(x) ((unsigned long)(x) > TASK_SIZE) 87#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
88 88
89static int set_brk(unsigned long start, unsigned long end) 89static int set_brk(unsigned long start, unsigned long end)
90{ 90{
@@ -394,7 +394,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
394 * <= p_memsize so it's only necessary to check p_memsz. 394 * <= p_memsize so it's only necessary to check p_memsz.
395 */ 395 */
396 k = load_addr + eppnt->p_vaddr; 396 k = load_addr + eppnt->p_vaddr;
397 if (k > TASK_SIZE || 397 if (BAD_ADDR(k) ||
398 eppnt->p_filesz > eppnt->p_memsz || 398 eppnt->p_filesz > eppnt->p_memsz ||
399 eppnt->p_memsz > TASK_SIZE || 399 eppnt->p_memsz > TASK_SIZE ||
400 TASK_SIZE - eppnt->p_memsz < k) { 400 TASK_SIZE - eppnt->p_memsz < k) {
@@ -887,7 +887,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
887 * allowed task size. Note that p_filesz must always be 887 * allowed task size. Note that p_filesz must always be
888 * <= p_memsz so it is only necessary to check p_memsz. 888 * <= p_memsz so it is only necessary to check p_memsz.
889 */ 889 */
890 if (k > TASK_SIZE || elf_ppnt->p_filesz > elf_ppnt->p_memsz || 890 if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
891 elf_ppnt->p_memsz > TASK_SIZE || 891 elf_ppnt->p_memsz > TASK_SIZE ||
892 TASK_SIZE - elf_ppnt->p_memsz < k) { 892 TASK_SIZE - elf_ppnt->p_memsz < k) {
893 /* set_brk can never work. Avoid overflows. */ 893 /* set_brk can never work. Avoid overflows. */
@@ -941,10 +941,9 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
941 interpreter, 941 interpreter,
942 &interp_load_addr); 942 &interp_load_addr);
943 if (BAD_ADDR(elf_entry)) { 943 if (BAD_ADDR(elf_entry)) {
944 printk(KERN_ERR "Unable to load interpreter %.128s\n",
945 elf_interpreter);
946 force_sig(SIGSEGV, current); 944 force_sig(SIGSEGV, current);
947 retval = -ENOEXEC; /* Nobody gets to see this, but.. */ 945 retval = IS_ERR((void *)elf_entry) ?
946 (int)elf_entry : -EINVAL;
948 goto out_free_dentry; 947 goto out_free_dentry;
949 } 948 }
950 reloc_func_desc = interp_load_addr; 949 reloc_func_desc = interp_load_addr;
@@ -955,8 +954,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
955 } else { 954 } else {
956 elf_entry = loc->elf_ex.e_entry; 955 elf_entry = loc->elf_ex.e_entry;
957 if (BAD_ADDR(elf_entry)) { 956 if (BAD_ADDR(elf_entry)) {
958 send_sig(SIGSEGV, current, 0); 957 force_sig(SIGSEGV, current);
959 retval = -ENOEXEC; /* Nobody gets to see this, but.. */ 958 retval = -EINVAL;
960 goto out_free_dentry; 959 goto out_free_dentry;
961 } 960 }
962 } 961 }
@@ -1186,8 +1185,6 @@ static int maydump(struct vm_area_struct *vma)
1186 return 1; 1185 return 1;
1187} 1186}
1188 1187
1189#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
1190
1191/* An ELF note in memory */ 1188/* An ELF note in memory */
1192struct memelfnote 1189struct memelfnote
1193{ 1190{
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index eba4e23b9ca0..2f3365829229 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1,6 +1,6 @@
1/* binfmt_elf_fdpic.c: FDPIC ELF binary format 1/* binfmt_elf_fdpic.c: FDPIC ELF binary format
2 * 2 *
3 * Copyright (C) 2003, 2004 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * Derived from binfmt_elf.c 5 * Derived from binfmt_elf.c
6 * 6 *
@@ -24,7 +24,9 @@
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/fcntl.h> 25#include <linux/fcntl.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/pagemap.h>
27#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/highuid.h>
28#include <linux/personality.h> 30#include <linux/personality.h>
29#include <linux/ptrace.h> 31#include <linux/ptrace.h>
30#include <linux/init.h> 32#include <linux/init.h>
@@ -48,45 +50,59 @@ typedef char *elf_caddr_t;
48#define kdebug(fmt, ...) do {} while(0) 50#define kdebug(fmt, ...) do {} while(0)
49#endif 51#endif
50 52
53#if 0
54#define kdcore(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ )
55#else
56#define kdcore(fmt, ...) do {} while(0)
57#endif
58
51MODULE_LICENSE("GPL"); 59MODULE_LICENSE("GPL");
52 60
53static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs); 61static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *);
54//static int load_elf_fdpic_library(struct file *); 62static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *);
55static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file); 63static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *,
56static int elf_fdpic_map_file(struct elf_fdpic_params *params, 64 struct mm_struct *, const char *);
57 struct file *file,
58 struct mm_struct *mm,
59 const char *what);
60 65
61static int create_elf_fdpic_tables(struct linux_binprm *bprm, 66static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *,
62 struct mm_struct *mm, 67 struct elf_fdpic_params *,
63 struct elf_fdpic_params *exec_params, 68 struct elf_fdpic_params *);
64 struct elf_fdpic_params *interp_params);
65 69
66#ifndef CONFIG_MMU 70#ifndef CONFIG_MMU
67static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *_sp); 71static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *,
68static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *params, 72 unsigned long *);
69 struct file *file, 73static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *,
70 struct mm_struct *mm); 74 struct file *,
75 struct mm_struct *);
71#endif 76#endif
72 77
73static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, 78static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *,
74 struct file *file, 79 struct file *, struct mm_struct *);
75 struct mm_struct *mm); 80
81#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
82static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *);
83#endif
76 84
77static struct linux_binfmt elf_fdpic_format = { 85static struct linux_binfmt elf_fdpic_format = {
78 .module = THIS_MODULE, 86 .module = THIS_MODULE,
79 .load_binary = load_elf_fdpic_binary, 87 .load_binary = load_elf_fdpic_binary,
80// .load_shlib = load_elf_fdpic_library, 88#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
81// .core_dump = elf_fdpic_core_dump, 89 .core_dump = elf_fdpic_core_dump,
90#endif
82 .min_coredump = ELF_EXEC_PAGESIZE, 91 .min_coredump = ELF_EXEC_PAGESIZE,
83}; 92};
84 93
85static int __init init_elf_fdpic_binfmt(void) { return register_binfmt(&elf_fdpic_format); } 94static int __init init_elf_fdpic_binfmt(void)
86static void __exit exit_elf_fdpic_binfmt(void) { unregister_binfmt(&elf_fdpic_format); } 95{
96 return register_binfmt(&elf_fdpic_format);
97}
98
99static void __exit exit_elf_fdpic_binfmt(void)
100{
101 unregister_binfmt(&elf_fdpic_format);
102}
87 103
88module_init(init_elf_fdpic_binfmt) 104core_initcall(init_elf_fdpic_binfmt);
89module_exit(exit_elf_fdpic_binfmt) 105module_exit(exit_elf_fdpic_binfmt);
90 106
91static int is_elf_fdpic(struct elfhdr *hdr, struct file *file) 107static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
92{ 108{
@@ -105,7 +121,8 @@ static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
105/* 121/*
106 * read the program headers table into memory 122 * read the program headers table into memory
107 */ 123 */
108static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file) 124static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
125 struct file *file)
109{ 126{
110 struct elf32_phdr *phdr; 127 struct elf32_phdr *phdr;
111 unsigned long size; 128 unsigned long size;
@@ -121,7 +138,8 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *f
121 if (!params->phdrs) 138 if (!params->phdrs)
122 return -ENOMEM; 139 return -ENOMEM;
123 140
124 retval = kernel_read(file, params->hdr.e_phoff, (char *) params->phdrs, size); 141 retval = kernel_read(file, params->hdr.e_phoff,
142 (char *) params->phdrs, size);
125 if (retval < 0) 143 if (retval < 0)
126 return retval; 144 return retval;
127 145
@@ -141,17 +159,24 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *f
141 } 159 }
142 160
143 return 0; 161 return 0;
144} /* end elf_fdpic_fetch_phdrs() */ 162}
145 163
146/*****************************************************************************/ 164/*****************************************************************************/
147/* 165/*
148 * load an fdpic binary into various bits of memory 166 * load an fdpic binary into various bits of memory
149 */ 167 */
150static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs) 168static int load_elf_fdpic_binary(struct linux_binprm *bprm,
169 struct pt_regs *regs)
151{ 170{
152 struct elf_fdpic_params exec_params, interp_params; 171 struct elf_fdpic_params exec_params, interp_params;
153 struct elf_phdr *phdr; 172 struct elf_phdr *phdr;
154 unsigned long stack_size; 173 unsigned long stack_size, entryaddr;
174#ifndef CONFIG_MMU
175 unsigned long fullsize;
176#endif
177#ifdef ELF_FDPIC_PLAT_INIT
178 unsigned long dynaddr;
179#endif
155 struct file *interpreter = NULL; /* to shut gcc up */ 180 struct file *interpreter = NULL; /* to shut gcc up */
156 char *interpreter_name = NULL; 181 char *interpreter_name = NULL;
157 int executable_stack; 182 int executable_stack;
@@ -212,7 +237,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
212 goto error; 237 goto error;
213 } 238 }
214 239
215 retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE); 240 retval = kernel_read(interpreter, 0, bprm->buf,
241 BINPRM_BUF_SIZE);
216 if (retval < 0) 242 if (retval < 0)
217 goto error; 243 goto error;
218 244
@@ -295,7 +321,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
295 &current->mm->start_stack, 321 &current->mm->start_stack,
296 &current->mm->start_brk); 322 &current->mm->start_brk);
297 323
298 retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); 324 retval = setup_arg_pages(bprm, current->mm->start_stack,
325 executable_stack);
299 if (retval < 0) { 326 if (retval < 0) {
300 send_sig(SIGKILL, current, 0); 327 send_sig(SIGKILL, current, 0);
301 goto error_kill; 328 goto error_kill;
@@ -303,7 +330,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
303#endif 330#endif
304 331
305 /* load the executable and interpreter into memory */ 332 /* load the executable and interpreter into memory */
306 retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, "executable"); 333 retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm,
334 "executable");
307 if (retval < 0) 335 if (retval < 0)
308 goto error_kill; 336 goto error_kill;
309 337
@@ -324,7 +352,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
324 if (!current->mm->start_brk) 352 if (!current->mm->start_brk)
325 current->mm->start_brk = current->mm->end_data; 353 current->mm->start_brk = current->mm->end_data;
326 354
327 current->mm->brk = current->mm->start_brk = PAGE_ALIGN(current->mm->start_brk); 355 current->mm->brk = current->mm->start_brk =
356 PAGE_ALIGN(current->mm->start_brk);
328 357
329#else 358#else
330 /* create a stack and brk area big enough for everyone 359 /* create a stack and brk area big enough for everyone
@@ -336,47 +365,45 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
336 stack_size = PAGE_SIZE * 2; 365 stack_size = PAGE_SIZE * 2;
337 366
338 down_write(&current->mm->mmap_sem); 367 down_write(&current->mm->mmap_sem);
339 current->mm->start_brk = do_mmap(NULL, 368 current->mm->start_brk = do_mmap(NULL, 0, stack_size,
340 0,
341 stack_size,
342 PROT_READ | PROT_WRITE | PROT_EXEC, 369 PROT_READ | PROT_WRITE | PROT_EXEC,
343 MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 370 MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN,
344 0); 371 0);
345 372
346 if (IS_ERR((void *) current->mm->start_brk)) { 373 if (IS_ERR_VALUE(current->mm->start_brk)) {
347 up_write(&current->mm->mmap_sem); 374 up_write(&current->mm->mmap_sem);
348 retval = current->mm->start_brk; 375 retval = current->mm->start_brk;
349 current->mm->start_brk = 0; 376 current->mm->start_brk = 0;
350 goto error_kill; 377 goto error_kill;
351 } 378 }
352 379
353 if (do_mremap(current->mm->start_brk, 380 /* expand the stack mapping to use up the entire allocation granule */
354 stack_size, 381 fullsize = ksize((char *) current->mm->start_brk);
355 ksize((char *) current->mm->start_brk), 382 if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
356 0, 0 383 fullsize, 0, 0)))
357 ) == current->mm->start_brk 384 stack_size = fullsize;
358 )
359 stack_size = ksize((char *) current->mm->start_brk);
360 up_write(&current->mm->mmap_sem); 385 up_write(&current->mm->mmap_sem);
361 386
362 current->mm->brk = current->mm->start_brk; 387 current->mm->brk = current->mm->start_brk;
363 current->mm->context.end_brk = current->mm->start_brk; 388 current->mm->context.end_brk = current->mm->start_brk;
364 current->mm->context.end_brk += (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0; 389 current->mm->context.end_brk +=
390 (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0;
365 current->mm->start_stack = current->mm->start_brk + stack_size; 391 current->mm->start_stack = current->mm->start_brk + stack_size;
366#endif 392#endif
367 393
368 compute_creds(bprm); 394 compute_creds(bprm);
369 current->flags &= ~PF_FORKNOEXEC; 395 current->flags &= ~PF_FORKNOEXEC;
370 if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) 396 if (create_elf_fdpic_tables(bprm, current->mm,
397 &exec_params, &interp_params) < 0)
371 goto error_kill; 398 goto error_kill;
372 399
373 kdebug("- start_code %lx", (long) current->mm->start_code); 400 kdebug("- start_code %lx", current->mm->start_code);
374 kdebug("- end_code %lx", (long) current->mm->end_code); 401 kdebug("- end_code %lx", current->mm->end_code);
375 kdebug("- start_data %lx", (long) current->mm->start_data); 402 kdebug("- start_data %lx", current->mm->start_data);
376 kdebug("- end_data %lx", (long) current->mm->end_data); 403 kdebug("- end_data %lx", current->mm->end_data);
377 kdebug("- start_brk %lx", (long) current->mm->start_brk); 404 kdebug("- start_brk %lx", current->mm->start_brk);
378 kdebug("- brk %lx", (long) current->mm->brk); 405 kdebug("- brk %lx", current->mm->brk);
379 kdebug("- start_stack %lx", (long) current->mm->start_stack); 406 kdebug("- start_stack %lx", current->mm->start_stack);
380 407
381#ifdef ELF_FDPIC_PLAT_INIT 408#ifdef ELF_FDPIC_PLAT_INIT
382 /* 409 /*
@@ -385,21 +412,18 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
385 * example. This macro performs whatever initialization to 412 * example. This macro performs whatever initialization to
386 * the regs structure is required. 413 * the regs structure is required.
387 */ 414 */
388 ELF_FDPIC_PLAT_INIT(regs, 415 dynaddr = interp_params.dynamic_addr ?: exec_params.dynamic_addr;
389 exec_params.map_addr, 416 ELF_FDPIC_PLAT_INIT(regs, exec_params.map_addr, interp_params.map_addr,
390 interp_params.map_addr, 417 dynaddr);
391 interp_params.dynamic_addr ?: exec_params.dynamic_addr
392 );
393#endif 418#endif
394 419
395 /* everything is now ready... get the userspace context ready to roll */ 420 /* everything is now ready... get the userspace context ready to roll */
396 start_thread(regs, 421 entryaddr = interp_params.entry_addr ?: exec_params.entry_addr;
397 interp_params.entry_addr ?: exec_params.entry_addr, 422 start_thread(regs, entryaddr, current->mm->start_stack);
398 current->mm->start_stack);
399 423
400 if (unlikely(current->ptrace & PT_PTRACED)) { 424 if (unlikely(current->ptrace & PT_PTRACED)) {
401 if (current->ptrace & PT_TRACE_EXEC) 425 if (current->ptrace & PT_TRACE_EXEC)
402 ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); 426 ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
403 else 427 else
404 send_sig(SIGTRAP, current, 0); 428 send_sig(SIGTRAP, current, 0);
405 } 429 }
@@ -419,11 +443,11 @@ error:
419 return retval; 443 return retval;
420 444
421 /* unrecoverable error - kill the process */ 445 /* unrecoverable error - kill the process */
422 error_kill: 446error_kill:
423 send_sig(SIGSEGV, current, 0); 447 send_sig(SIGSEGV, current, 0);
424 goto error; 448 goto error;
425 449
426} /* end load_elf_fdpic_binary() */ 450}
427 451
428/*****************************************************************************/ 452/*****************************************************************************/
429/* 453/*
@@ -459,6 +483,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
459 */ 483 */
460 hwcap = ELF_HWCAP; 484 hwcap = ELF_HWCAP;
461 k_platform = ELF_PLATFORM; 485 k_platform = ELF_PLATFORM;
486 u_platform = NULL;
462 487
463 if (k_platform) { 488 if (k_platform) {
464 platform_len = strlen(k_platform) + 1; 489 platform_len = strlen(k_platform) + 1;
@@ -470,11 +495,11 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
470 495
471#if defined(__i386__) && defined(CONFIG_SMP) 496#if defined(__i386__) && defined(CONFIG_SMP)
472 /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions 497 /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
473 * by the processes running on the same package. One thing we can do 498 * by the processes running on the same package. One thing we can do is
474 * is to shuffle the initial stack for them. 499 * to shuffle the initial stack for them.
475 * 500 *
476 * the conditionals here are unneeded, but kept in to make the 501 * the conditionals here are unneeded, but kept in to make the code
477 * code behaviour the same as pre change unless we have hyperthreaded 502 * behaviour the same as pre change unless we have hyperthreaded
478 * processors. This keeps Mr Marcelo Person happier but should be 503 * processors. This keeps Mr Marcelo Person happier but should be
479 * removed for 2.5 504 * removed for 2.5
480 */ 505 */
@@ -497,11 +522,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
497 522
498 if (interp_params->loadmap) { 523 if (interp_params->loadmap) {
499 len = sizeof(struct elf32_fdpic_loadmap); 524 len = sizeof(struct elf32_fdpic_loadmap);
500 len += sizeof(struct elf32_fdpic_loadseg) * interp_params->loadmap->nsegs; 525 len += sizeof(struct elf32_fdpic_loadseg) *
526 interp_params->loadmap->nsegs;
501 sp = (sp - len) & ~7UL; 527 sp = (sp - len) & ~7UL;
502 interp_params->map_addr = sp; 528 interp_params->map_addr = sp;
503 529
504 if (copy_to_user((void __user *) sp, interp_params->loadmap, len) != 0) 530 if (copy_to_user((void __user *) sp, interp_params->loadmap,
531 len) != 0)
505 return -EFAULT; 532 return -EFAULT;
506 533
507 current->mm->context.interp_fdpic_loadmap = (unsigned long) sp; 534 current->mm->context.interp_fdpic_loadmap = (unsigned long) sp;
@@ -525,34 +552,37 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
525 sp -= sp & 15UL; 552 sp -= sp & 15UL;
526 553
527 /* put the ELF interpreter info on the stack */ 554 /* put the ELF interpreter info on the stack */
528#define NEW_AUX_ENT(nr, id, val) \ 555#define NEW_AUX_ENT(nr, id, val) \
529 do { \ 556 do { \
530 struct { unsigned long _id, _val; } __user *ent = (void __user *) csp; \ 557 struct { unsigned long _id, _val; } __user *ent; \
531 __put_user((id), &ent[nr]._id); \ 558 \
532 __put_user((val), &ent[nr]._val); \ 559 ent = (void __user *) csp; \
560 __put_user((id), &ent[nr]._id); \
561 __put_user((val), &ent[nr]._val); \
533 } while (0) 562 } while (0)
534 563
535 csp -= 2 * sizeof(unsigned long); 564 csp -= 2 * sizeof(unsigned long);
536 NEW_AUX_ENT(0, AT_NULL, 0); 565 NEW_AUX_ENT(0, AT_NULL, 0);
537 if (k_platform) { 566 if (k_platform) {
538 csp -= 2 * sizeof(unsigned long); 567 csp -= 2 * sizeof(unsigned long);
539 NEW_AUX_ENT(0, AT_PLATFORM, (elf_addr_t)(unsigned long) u_platform); 568 NEW_AUX_ENT(0, AT_PLATFORM,
569 (elf_addr_t) (unsigned long) u_platform);
540 } 570 }
541 571
542 csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); 572 csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long);
543 NEW_AUX_ENT( 0, AT_HWCAP, hwcap); 573 NEW_AUX_ENT( 0, AT_HWCAP, hwcap);
544 NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE); 574 NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE);
545 NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC); 575 NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC);
546 NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr); 576 NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr);
547 NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr)); 577 NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr));
548 NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum); 578 NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum);
549 NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr); 579 NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr);
550 NEW_AUX_ENT( 7, AT_FLAGS, 0); 580 NEW_AUX_ENT( 7, AT_FLAGS, 0);
551 NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr); 581 NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr);
552 NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid); 582 NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid);
553 NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid); 583 NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid);
554 NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid); 584 NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid);
555 NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid); 585 NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid);
556 586
557#ifdef ARCH_DLINFO 587#ifdef ARCH_DLINFO
558 /* ARCH_DLINFO must come last so platform specific code can enforce 588 /* ARCH_DLINFO must come last so platform specific code can enforce
@@ -578,7 +608,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
578#ifdef CONFIG_MMU 608#ifdef CONFIG_MMU
579 current->mm->arg_start = bprm->p; 609 current->mm->arg_start = bprm->p;
580#else 610#else
581 current->mm->arg_start = current->mm->start_stack - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p); 611 current->mm->arg_start = current->mm->start_stack -
612 (MAX_ARG_PAGES * PAGE_SIZE - bprm->p);
582#endif 613#endif
583 614
584 p = (char __user *) current->mm->arg_start; 615 p = (char __user *) current->mm->arg_start;
@@ -606,7 +637,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
606 637
607 mm->start_stack = (unsigned long) sp; 638 mm->start_stack = (unsigned long) sp;
608 return 0; 639 return 0;
609} /* end create_elf_fdpic_tables() */ 640}
610 641
611/*****************************************************************************/ 642/*****************************************************************************/
612/* 643/*
@@ -614,7 +645,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
614 * the stack 645 * the stack
615 */ 646 */
616#ifndef CONFIG_MMU 647#ifndef CONFIG_MMU
617static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *_sp) 648static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm,
649 unsigned long *_sp)
618{ 650{
619 unsigned long index, stop, sp; 651 unsigned long index, stop, sp;
620 char *src; 652 char *src;
@@ -635,9 +667,9 @@ static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, unsigned
635 667
636 *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; 668 *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15;
637 669
638 out: 670out:
639 return ret; 671 return ret;
640} /* end elf_fdpic_transfer_args_to_stack() */ 672}
641#endif 673#endif
642 674
643/*****************************************************************************/ 675/*****************************************************************************/
@@ -712,17 +744,18 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
712 seg = loadmap->segs; 744 seg = loadmap->segs;
713 for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { 745 for (loop = loadmap->nsegs; loop > 0; loop--, seg++) {
714 if (params->hdr.e_entry >= seg->p_vaddr && 746 if (params->hdr.e_entry >= seg->p_vaddr &&
715 params->hdr.e_entry < seg->p_vaddr + seg->p_memsz 747 params->hdr.e_entry < seg->p_vaddr + seg->p_memsz) {
716 ) {
717 params->entry_addr = 748 params->entry_addr =
718 (params->hdr.e_entry - seg->p_vaddr) + seg->addr; 749 (params->hdr.e_entry - seg->p_vaddr) +
750 seg->addr;
719 break; 751 break;
720 } 752 }
721 } 753 }
722 } 754 }
723 755
724 /* determine where the program header table has wound up if mapped */ 756 /* determine where the program header table has wound up if mapped */
725 stop = params->hdr.e_phoff + params->hdr.e_phnum * sizeof (struct elf_phdr); 757 stop = params->hdr.e_phoff;
758 stop += params->hdr.e_phnum * sizeof (struct elf_phdr);
726 phdr = params->phdrs; 759 phdr = params->phdrs;
727 760
728 for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { 761 for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
@@ -736,9 +769,11 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
736 seg = loadmap->segs; 769 seg = loadmap->segs;
737 for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { 770 for (loop = loadmap->nsegs; loop > 0; loop--, seg++) {
738 if (phdr->p_vaddr >= seg->p_vaddr && 771 if (phdr->p_vaddr >= seg->p_vaddr &&
739 phdr->p_vaddr + phdr->p_filesz <= seg->p_vaddr + seg->p_memsz 772 phdr->p_vaddr + phdr->p_filesz <=
740 ) { 773 seg->p_vaddr + seg->p_memsz) {
741 params->ph_addr = (phdr->p_vaddr - seg->p_vaddr) + seg->addr + 774 params->ph_addr =
775 (phdr->p_vaddr - seg->p_vaddr) +
776 seg->addr +
742 params->hdr.e_phoff - phdr->p_offset; 777 params->hdr.e_phoff - phdr->p_offset;
743 break; 778 break;
744 } 779 }
@@ -755,18 +790,22 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
755 seg = loadmap->segs; 790 seg = loadmap->segs;
756 for (loop = loadmap->nsegs; loop > 0; loop--, seg++) { 791 for (loop = loadmap->nsegs; loop > 0; loop--, seg++) {
757 if (phdr->p_vaddr >= seg->p_vaddr && 792 if (phdr->p_vaddr >= seg->p_vaddr &&
758 phdr->p_vaddr + phdr->p_memsz <= seg->p_vaddr + seg->p_memsz 793 phdr->p_vaddr + phdr->p_memsz <=
759 ) { 794 seg->p_vaddr + seg->p_memsz) {
760 params->dynamic_addr = (phdr->p_vaddr - seg->p_vaddr) + seg->addr; 795 params->dynamic_addr =
761 796 (phdr->p_vaddr - seg->p_vaddr) +
762 /* check the dynamic section contains at least one item, and that 797 seg->addr;
763 * the last item is a NULL entry */ 798
799 /* check the dynamic section contains at least
800 * one item, and that the last item is a NULL
801 * entry */
764 if (phdr->p_memsz == 0 || 802 if (phdr->p_memsz == 0 ||
765 phdr->p_memsz % sizeof(Elf32_Dyn) != 0) 803 phdr->p_memsz % sizeof(Elf32_Dyn) != 0)
766 goto dynamic_error; 804 goto dynamic_error;
767 805
768 tmp = phdr->p_memsz / sizeof(Elf32_Dyn); 806 tmp = phdr->p_memsz / sizeof(Elf32_Dyn);
769 if (((Elf32_Dyn *) params->dynamic_addr)[tmp - 1].d_tag != 0) 807 if (((Elf32_Dyn *)
808 params->dynamic_addr)[tmp - 1].d_tag != 0)
770 goto dynamic_error; 809 goto dynamic_error;
771 break; 810 break;
772 } 811 }
@@ -775,8 +814,8 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
775 } 814 }
776 815
777 /* now elide adjacent segments in the load map on MMU linux 816 /* now elide adjacent segments in the load map on MMU linux
778 * - on uClinux the holes between may actually be filled with system stuff or stuff from 817 * - on uClinux the holes between may actually be filled with system
779 * other processes 818 * stuff or stuff from other processes
780 */ 819 */
781#ifdef CONFIG_MMU 820#ifdef CONFIG_MMU
782 nloads = loadmap->nsegs; 821 nloads = loadmap->nsegs;
@@ -787,7 +826,9 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
787 if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) { 826 if (seg->p_vaddr - mseg->p_vaddr == seg->addr - mseg->addr) {
788 load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz); 827 load_addr = PAGE_ALIGN(mseg->addr + mseg->p_memsz);
789 if (load_addr == (seg->addr & PAGE_MASK)) { 828 if (load_addr == (seg->addr & PAGE_MASK)) {
790 mseg->p_memsz += load_addr - (mseg->addr + mseg->p_memsz); 829 mseg->p_memsz +=
830 load_addr -
831 (mseg->addr + mseg->p_memsz);
791 mseg->p_memsz += seg->addr & ~PAGE_MASK; 832 mseg->p_memsz += seg->addr & ~PAGE_MASK;
792 mseg->p_memsz += seg->p_memsz; 833 mseg->p_memsz += seg->p_memsz;
793 loadmap->nsegs--; 834 loadmap->nsegs--;
@@ -815,20 +856,21 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
815 856
816 return 0; 857 return 0;
817 858
818 dynamic_error: 859dynamic_error:
819 printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n", 860 printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n",
820 what, file->f_dentry->d_inode->i_ino); 861 what, file->f_dentry->d_inode->i_ino);
821 return -ELIBBAD; 862 return -ELIBBAD;
822} /* end elf_fdpic_map_file() */ 863}
823 864
824/*****************************************************************************/ 865/*****************************************************************************/
825/* 866/*
826 * map a file with constant displacement under uClinux 867 * map a file with constant displacement under uClinux
827 */ 868 */
828#ifndef CONFIG_MMU 869#ifndef CONFIG_MMU
829static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *params, 870static int elf_fdpic_map_file_constdisp_on_uclinux(
830 struct file *file, 871 struct elf_fdpic_params *params,
831 struct mm_struct *mm) 872 struct file *file,
873 struct mm_struct *mm)
832{ 874{
833 struct elf32_fdpic_loadseg *seg; 875 struct elf32_fdpic_loadseg *seg;
834 struct elf32_phdr *phdr; 876 struct elf32_phdr *phdr;
@@ -839,7 +881,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para
839 load_addr = params->load_addr; 881 load_addr = params->load_addr;
840 seg = params->loadmap->segs; 882 seg = params->loadmap->segs;
841 883
842 /* determine the bounds of the contiguous overall allocation we must make */ 884 /* determine the bounds of the contiguous overall allocation we must
885 * make */
843 phdr = params->phdrs; 886 phdr = params->phdrs;
844 for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { 887 for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
845 if (params->phdrs[loop].p_type != PT_LOAD) 888 if (params->phdrs[loop].p_type != PT_LOAD)
@@ -860,7 +903,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para
860 maddr = do_mmap(NULL, load_addr, top - base, 903 maddr = do_mmap(NULL, load_addr, top - base,
861 PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); 904 PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0);
862 up_write(&mm->mmap_sem); 905 up_write(&mm->mmap_sem);
863 if (IS_ERR((void *) maddr)) 906 if (IS_ERR_VALUE(maddr))
864 return (int) maddr; 907 return (int) maddr;
865 908
866 if (load_addr != 0) 909 if (load_addr != 0)
@@ -878,7 +921,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para
878 seg->p_vaddr = phdr->p_vaddr; 921 seg->p_vaddr = phdr->p_vaddr;
879 seg->p_memsz = phdr->p_memsz; 922 seg->p_memsz = phdr->p_memsz;
880 923
881 ret = file->f_op->read(file, (void *) seg->addr, phdr->p_filesz, &fpos); 924 ret = file->f_op->read(file, (void *) seg->addr,
925 phdr->p_filesz, &fpos);
882 if (ret < 0) 926 if (ret < 0)
883 return ret; 927 return ret;
884 928
@@ -895,8 +939,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para
895 if (phdr->p_flags & PF_X) { 939 if (phdr->p_flags & PF_X) {
896 mm->start_code = seg->addr; 940 mm->start_code = seg->addr;
897 mm->end_code = seg->addr + phdr->p_memsz; 941 mm->end_code = seg->addr + phdr->p_memsz;
898 } 942 } else if (!mm->start_data) {
899 else if (!mm->start_data) {
900 mm->start_data = seg->addr; 943 mm->start_data = seg->addr;
901#ifndef CONFIG_MMU 944#ifndef CONFIG_MMU
902 mm->end_data = seg->addr + phdr->p_memsz; 945 mm->end_data = seg->addr + phdr->p_memsz;
@@ -913,7 +956,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *para
913 } 956 }
914 957
915 return 0; 958 return 0;
916} /* end elf_fdpic_map_file_constdisp_on_uclinux() */ 959}
917#endif 960#endif
918 961
919/*****************************************************************************/ 962/*****************************************************************************/
@@ -974,14 +1017,14 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
974 1017
975 case ELF_FDPIC_FLAG_CONSTDISP: 1018 case ELF_FDPIC_FLAG_CONSTDISP:
976 /* constant displacement 1019 /* constant displacement
977 * - can be mapped anywhere, but must be mapped as a unit 1020 * - can be mapped anywhere, but must be mapped as a
1021 * unit
978 */ 1022 */
979 if (!dvset) { 1023 if (!dvset) {
980 maddr = load_addr; 1024 maddr = load_addr;
981 delta_vaddr = phdr->p_vaddr; 1025 delta_vaddr = phdr->p_vaddr;
982 dvset = 1; 1026 dvset = 1;
983 } 1027 } else {
984 else {
985 maddr = load_addr + phdr->p_vaddr - delta_vaddr; 1028 maddr = load_addr + phdr->p_vaddr - delta_vaddr;
986 flags |= MAP_FIXED; 1029 flags |= MAP_FIXED;
987 } 1030 }
@@ -1005,13 +1048,14 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1005 up_write(&mm->mmap_sem); 1048 up_write(&mm->mmap_sem);
1006 1049
1007 kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx", 1050 kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx",
1008 loop, phdr->p_memsz + disp, prot, flags, phdr->p_offset - disp, 1051 loop, phdr->p_memsz + disp, prot, flags,
1009 maddr); 1052 phdr->p_offset - disp, maddr);
1010 1053
1011 if (IS_ERR((void *) maddr)) 1054 if (IS_ERR_VALUE(maddr))
1012 return (int) maddr; 1055 return (int) maddr;
1013 1056
1014 if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) == ELF_FDPIC_FLAG_CONTIGUOUS) 1057 if ((params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) ==
1058 ELF_FDPIC_FLAG_CONTIGUOUS)
1015 load_addr += PAGE_ALIGN(phdr->p_memsz + disp); 1059 load_addr += PAGE_ALIGN(phdr->p_memsz + disp);
1016 1060
1017 seg->addr = maddr + disp; 1061 seg->addr = maddr + disp;
@@ -1022,7 +1066,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1022 if (phdr->p_offset == 0) 1066 if (phdr->p_offset == 0)
1023 params->elfhdr_addr = seg->addr; 1067 params->elfhdr_addr = seg->addr;
1024 1068
1025 /* clear the bit between beginning of mapping and beginning of PT_LOAD */ 1069 /* clear the bit between beginning of mapping and beginning of
1070 * PT_LOAD */
1026 if (prot & PROT_WRITE && disp > 0) { 1071 if (prot & PROT_WRITE && disp > 0) {
1027 kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); 1072 kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
1028 clear_user((void __user *) maddr, disp); 1073 clear_user((void __user *) maddr, disp);
@@ -1038,19 +1083,20 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1038 excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); 1083 excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK);
1039 1084
1040#ifdef CONFIG_MMU 1085#ifdef CONFIG_MMU
1041
1042 if (excess > excess1) { 1086 if (excess > excess1) {
1043 unsigned long xaddr = maddr + phdr->p_filesz + excess1; 1087 unsigned long xaddr = maddr + phdr->p_filesz + excess1;
1044 unsigned long xmaddr; 1088 unsigned long xmaddr;
1045 1089
1046 flags |= MAP_FIXED | MAP_ANONYMOUS; 1090 flags |= MAP_FIXED | MAP_ANONYMOUS;
1047 down_write(&mm->mmap_sem); 1091 down_write(&mm->mmap_sem);
1048 xmaddr = do_mmap(NULL, xaddr, excess - excess1, prot, flags, 0); 1092 xmaddr = do_mmap(NULL, xaddr, excess - excess1,
1093 prot, flags, 0);
1049 up_write(&mm->mmap_sem); 1094 up_write(&mm->mmap_sem);
1050 1095
1051 kdebug("mmap[%d] <anon>" 1096 kdebug("mmap[%d] <anon>"
1052 " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", 1097 " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx",
1053 loop, xaddr, excess - excess1, prot, flags, xmaddr); 1098 loop, xaddr, excess - excess1, prot, flags,
1099 xmaddr);
1054 1100
1055 if (xmaddr != xaddr) 1101 if (xmaddr != xaddr)
1056 return -ENOMEM; 1102 return -ENOMEM;
@@ -1059,7 +1105,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1059 if (prot & PROT_WRITE && excess1 > 0) { 1105 if (prot & PROT_WRITE && excess1 > 0) {
1060 kdebug("clear[%d] ad=%lx sz=%lx", 1106 kdebug("clear[%d] ad=%lx sz=%lx",
1061 loop, maddr + phdr->p_filesz, excess1); 1107 loop, maddr + phdr->p_filesz, excess1);
1062 clear_user((void __user *) maddr + phdr->p_filesz, excess1); 1108 clear_user((void __user *) maddr + phdr->p_filesz,
1109 excess1);
1063 } 1110 }
1064 1111
1065#else 1112#else
@@ -1074,8 +1121,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1074 if (phdr->p_flags & PF_X) { 1121 if (phdr->p_flags & PF_X) {
1075 mm->start_code = maddr; 1122 mm->start_code = maddr;
1076 mm->end_code = maddr + phdr->p_memsz; 1123 mm->end_code = maddr + phdr->p_memsz;
1077 } 1124 } else if (!mm->start_data) {
1078 else if (!mm->start_data) {
1079 mm->start_data = maddr; 1125 mm->start_data = maddr;
1080 mm->end_data = maddr + phdr->p_memsz; 1126 mm->end_data = maddr + phdr->p_memsz;
1081 } 1127 }
@@ -1085,4 +1131,662 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1085 } 1131 }
1086 1132
1087 return 0; 1133 return 0;
1088} /* end elf_fdpic_map_file_by_direct_mmap() */ 1134}
1135
1136/*****************************************************************************/
1137/*
1138 * ELF-FDPIC core dumper
1139 *
1140 * Modelled on fs/exec.c:aout_core_dump()
1141 * Jeremy Fitzhardinge <jeremy@sw.oz.au>
1142 *
1143 * Modelled on fs/binfmt_elf.c core dumper
1144 */
1145#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
1146
1147/*
1148 * These are the only things you should do on a core-file: use only these
1149 * functions to write out all the necessary info.
1150 */
1151static int dump_write(struct file *file, const void *addr, int nr)
1152{
1153 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
1154}
1155
1156static int dump_seek(struct file *file, loff_t off)
1157{
1158 if (file->f_op->llseek) {
1159 if (file->f_op->llseek(file, off, SEEK_SET) != off)
1160 return 0;
1161 } else {
1162 file->f_pos = off;
1163 }
1164 return 1;
1165}
1166
1167/*
1168 * Decide whether a segment is worth dumping; default is yes to be
1169 * sure (missing info is worse than too much; etc).
1170 * Personally I'd include everything, and use the coredump limit...
1171 *
1172 * I think we should skip something. But I am not sure how. H.J.
1173 */
1174static int maydump(struct vm_area_struct *vma)
1175{
1176 /* Do not dump I/O mapped devices or special mappings */
1177 if (vma->vm_flags & (VM_IO | VM_RESERVED)) {
1178 kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
1179 return 0;
1180 }
1181
1182 /* If we may not read the contents, don't allow us to dump
1183 * them either. "dump_write()" can't handle it anyway.
1184 */
1185 if (!(vma->vm_flags & VM_READ)) {
1186 kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags);
1187 return 0;
1188 }
1189
1190 /* Dump shared memory only if mapped from an anonymous file. */
1191 if (vma->vm_flags & VM_SHARED) {
1192 if (vma->vm_file->f_dentry->d_inode->i_nlink == 0) {
1193 kdcore("%08lx: %08lx: no (share)", vma->vm_start, vma->vm_flags);
1194 return 1;
1195 }
1196
1197 kdcore("%08lx: %08lx: no (share)", vma->vm_start, vma->vm_flags);
1198 return 0;
1199 }
1200
1201#ifdef CONFIG_MMU
1202 /* If it hasn't been written to, don't write it out */
1203 if (!vma->anon_vma) {
1204 kdcore("%08lx: %08lx: no (!anon)", vma->vm_start, vma->vm_flags);
1205 return 0;
1206 }
1207#endif
1208
1209 kdcore("%08lx: %08lx: yes", vma->vm_start, vma->vm_flags);
1210 return 1;
1211}
1212
/* In-memory description of a single ELF note that is waiting to be written */
struct memelfnote {
	const char *name;	/* NUL-terminated note name */
	int type;		/* becomes n_type in the note header */
	unsigned int datasz;	/* number of payload bytes at data */
	void *data;		/* note payload */
};
1221
1222static int notesize(struct memelfnote *en)
1223{
1224 int sz;
1225
1226 sz = sizeof(struct elf_note);
1227 sz += roundup(strlen(en->name) + 1, 4);
1228 sz += roundup(en->datasz, 4);
1229
1230 return sz;
1231}
1232
1233/* #define DEBUG */
1234
1235#define DUMP_WRITE(addr, nr) \
1236 do { if (!dump_write(file, (addr), (nr))) return 0; } while(0)
1237#define DUMP_SEEK(off) \
1238 do { if (!dump_seek(file, (off))) return 0; } while(0)
1239
1240static int writenote(struct memelfnote *men, struct file *file)
1241{
1242 struct elf_note en;
1243
1244 en.n_namesz = strlen(men->name) + 1;
1245 en.n_descsz = men->datasz;
1246 en.n_type = men->type;
1247
1248 DUMP_WRITE(&en, sizeof(en));
1249 DUMP_WRITE(men->name, en.n_namesz);
1250 /* XXX - cast from long long to long to avoid need for libgcc.a */
1251 DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */
1252 DUMP_WRITE(men->data, men->datasz);
1253 DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */
1254
1255 return 1;
1256}
1257#undef DUMP_WRITE
1258#undef DUMP_SEEK
1259
/*
 * Helpers for the dumper proper.  Both expect the following in the
 * expanding scope: a "file" to write to, a running byte count "size", the
 * byte "limit" (DUMP_WRITE only) and an "end_coredump" label to bail out to.
 *
 * DUMP_WRITE adds nr to size and bails out if that exceeds limit or if the
 * write fails; DUMP_SEEK bails out if the seek fails.
 *
 * They are wrapped in do { } while (0) so that each use is exactly one
 * statement and stays correct as the unbraced body of an if/else.
 */
#define DUMP_WRITE(addr, nr)						\
	do {								\
		if ((size += (nr)) > limit ||				\
		    !dump_write(file, (addr), (nr)))			\
			goto end_coredump;				\
	} while (0)
#define DUMP_SEEK(off)							\
	do {								\
		if (!dump_seek(file, (off)))				\
			goto end_coredump;				\
	} while (0)
1266
1267static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
1268{
1269 memcpy(elf->e_ident, ELFMAG, SELFMAG);
1270 elf->e_ident[EI_CLASS] = ELF_CLASS;
1271 elf->e_ident[EI_DATA] = ELF_DATA;
1272 elf->e_ident[EI_VERSION] = EV_CURRENT;
1273 elf->e_ident[EI_OSABI] = ELF_OSABI;
1274 memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
1275
1276 elf->e_type = ET_CORE;
1277 elf->e_machine = ELF_ARCH;
1278 elf->e_version = EV_CURRENT;
1279 elf->e_entry = 0;
1280 elf->e_phoff = sizeof(struct elfhdr);
1281 elf->e_shoff = 0;
1282 elf->e_flags = ELF_FDPIC_CORE_EFLAGS;
1283 elf->e_ehsize = sizeof(struct elfhdr);
1284 elf->e_phentsize = sizeof(struct elf_phdr);
1285 elf->e_phnum = segs;
1286 elf->e_shentsize = 0;
1287 elf->e_shnum = 0;
1288 elf->e_shstrndx = 0;
1289 return;
1290}
1291
1292static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
1293{
1294 phdr->p_type = PT_NOTE;
1295 phdr->p_offset = offset;
1296 phdr->p_vaddr = 0;
1297 phdr->p_paddr = 0;
1298 phdr->p_filesz = sz;
1299 phdr->p_memsz = 0;
1300 phdr->p_flags = 0;
1301 phdr->p_align = 0;
1302 return;
1303}
1304
1305static inline void fill_note(struct memelfnote *note, const char *name, int type,
1306 unsigned int sz, void *data)
1307{
1308 note->name = name;
1309 note->type = type;
1310 note->datasz = sz;
1311 note->data = data;
1312 return;
1313}
1314
1315/*
1316 * fill up all the fields in prstatus from the given task struct, except
1317 * registers which need to be filled up seperately.
1318 */
1319static void fill_prstatus(struct elf_prstatus *prstatus,
1320 struct task_struct *p, long signr)
1321{
1322 prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
1323 prstatus->pr_sigpend = p->pending.signal.sig[0];
1324 prstatus->pr_sighold = p->blocked.sig[0];
1325 prstatus->pr_pid = p->pid;
1326 prstatus->pr_ppid = p->parent->pid;
1327 prstatus->pr_pgrp = process_group(p);
1328 prstatus->pr_sid = p->signal->session;
1329 if (thread_group_leader(p)) {
1330 /*
1331 * This is the record for the group leader. Add in the
1332 * cumulative times of previous dead threads. This total
1333 * won't include the time of each live thread whose state
1334 * is included in the core dump. The final total reported
1335 * to our parent process when it calls wait4 will include
1336 * those sums as well as the little bit more time it takes
1337 * this and each other thread to finish dying after the
1338 * core dump synchronization phase.
1339 */
1340 cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
1341 &prstatus->pr_utime);
1342 cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
1343 &prstatus->pr_stime);
1344 } else {
1345 cputime_to_timeval(p->utime, &prstatus->pr_utime);
1346 cputime_to_timeval(p->stime, &prstatus->pr_stime);
1347 }
1348 cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
1349 cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
1350
1351 prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
1352 prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
1353}
1354
1355static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1356 struct mm_struct *mm)
1357{
1358 unsigned int i, len;
1359
1360 /* first copy the parameters from user space */
1361 memset(psinfo, 0, sizeof(struct elf_prpsinfo));
1362
1363 len = mm->arg_end - mm->arg_start;
1364 if (len >= ELF_PRARGSZ)
1365 len = ELF_PRARGSZ - 1;
1366 if (copy_from_user(&psinfo->pr_psargs,
1367 (const char __user *) mm->arg_start, len))
1368 return -EFAULT;
1369 for (i = 0; i < len; i++)
1370 if (psinfo->pr_psargs[i] == 0)
1371 psinfo->pr_psargs[i] = ' ';
1372 psinfo->pr_psargs[len] = 0;
1373
1374 psinfo->pr_pid = p->pid;
1375 psinfo->pr_ppid = p->parent->pid;
1376 psinfo->pr_pgrp = process_group(p);
1377 psinfo->pr_sid = p->signal->session;
1378
1379 i = p->state ? ffz(~p->state) + 1 : 0;
1380 psinfo->pr_state = i;
1381 psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];
1382 psinfo->pr_zomb = psinfo->pr_sname == 'Z';
1383 psinfo->pr_nice = task_nice(p);
1384 psinfo->pr_flag = p->flags;
1385 SET_UID(psinfo->pr_uid, p->uid);
1386 SET_GID(psinfo->pr_gid, p->gid);
1387 strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
1388
1389 return 0;
1390}
1391
1392/* Here is the structure in which status of each thread is captured. */
1393struct elf_thread_status
1394{
1395 struct list_head list;
1396 struct elf_prstatus prstatus; /* NT_PRSTATUS */
1397 elf_fpregset_t fpu; /* NT_PRFPREG */
1398 struct task_struct *thread;
1399#ifdef ELF_CORE_COPY_XFPREGS
1400 elf_fpxregset_t xfpu; /* NT_PRXFPREG */
1401#endif
1402 struct memelfnote notes[3];
1403 int num_notes;
1404};
1405
1406/*
1407 * In order to add the specific thread information for the elf file format,
1408 * we need to keep a linked list of every thread's pr_status and then create
1409 * a single section for them in the final core file.
1410 */
1411static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
1412{
1413 struct task_struct *p = t->thread;
1414 int sz = 0;
1415
1416 t->num_notes = 0;
1417
1418 fill_prstatus(&t->prstatus, p, signr);
1419 elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
1420
1421 fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
1422 &t->prstatus);
1423 t->num_notes++;
1424 sz += notesize(&t->notes[0]);
1425
1426 t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu);
1427 if (t->prstatus.pr_fpvalid) {
1428 fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
1429 &t->fpu);
1430 t->num_notes++;
1431 sz += notesize(&t->notes[1]);
1432 }
1433
1434#ifdef ELF_CORE_COPY_XFPREGS
1435 if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
1436 fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu),
1437 &t->xfpu);
1438 t->num_notes++;
1439 sz += notesize(&t->notes[2]);
1440 }
1441#endif
1442 return sz;
1443}
1444
1445/*
1446 * dump the segments for an MMU process
1447 */
1448#ifdef CONFIG_MMU
1449static int elf_fdpic_dump_segments(struct file *file, struct mm_struct *mm,
1450 size_t *size, unsigned long *limit)
1451{
1452 struct vm_area_struct *vma;
1453
1454 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1455 unsigned long addr;
1456
1457 if (!maydump(vma))
1458 continue;
1459
1460 for (addr = vma->vm_start;
1461 addr < vma->vm_end;
1462 addr += PAGE_SIZE
1463 ) {
1464 struct vm_area_struct *vma;
1465 struct page *page;
1466
1467 if (get_user_pages(current, current->mm, addr, 1, 0, 1,
1468 &page, &vma) <= 0) {
1469 DUMP_SEEK(file->f_pos + PAGE_SIZE);
1470 }
1471 else if (page == ZERO_PAGE(addr)) {
1472 DUMP_SEEK(file->f_pos + PAGE_SIZE);
1473 page_cache_release(page);
1474 }
1475 else {
1476 void *kaddr;
1477
1478 flush_cache_page(vma, addr, page_to_pfn(page));
1479 kaddr = kmap(page);
1480 if ((*size += PAGE_SIZE) > *limit ||
1481 !dump_write(file, kaddr, PAGE_SIZE)
1482 ) {
1483 kunmap(page);
1484 page_cache_release(page);
1485 return -EIO;
1486 }
1487 kunmap(page);
1488 page_cache_release(page);
1489 }
1490 }
1491 }
1492
1493 return 0;
1494
1495end_coredump:
1496 return -EFBIG;
1497}
1498#endif
1499
1500/*
1501 * dump the segments for a NOMMU process
1502 */
1503#ifndef CONFIG_MMU
1504static int elf_fdpic_dump_segments(struct file *file, struct mm_struct *mm,
1505 size_t *size, unsigned long *limit)
1506{
1507 struct vm_list_struct *vml;
1508
1509 for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
1510 struct vm_area_struct *vma = vml->vma;
1511
1512 if (!maydump(vma))
1513 continue;
1514
1515 if ((*size += PAGE_SIZE) > *limit)
1516 return -EFBIG;
1517
1518 if (!dump_write(file, (void *) vma->vm_start,
1519 vma->vm_end - vma->vm_start))
1520 return -EIO;
1521 }
1522
1523 return 0;
1524}
1525#endif
1526
1527/*
1528 * Actual dumper
1529 *
1530 * This is a two-pass process; first we find the offsets of the bits,
1531 * and then they are actually written out. If we run out of core limit
1532 * we just truncate.
1533 */
1534static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1535 struct file *file)
1536{
1537#define NUM_NOTES 6
1538 int has_dumped = 0;
1539 mm_segment_t fs;
1540 int segs;
1541 size_t size = 0;
1542 int i;
1543 struct vm_area_struct *vma;
1544 struct elfhdr *elf = NULL;
1545 loff_t offset = 0, dataoff;
1546 unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
1547 int numnote;
1548 struct memelfnote *notes = NULL;
1549 struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */
1550 struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */
1551 struct task_struct *g, *p;
1552 LIST_HEAD(thread_list);
1553 struct list_head *t;
1554 elf_fpregset_t *fpu = NULL;
1555#ifdef ELF_CORE_COPY_XFPREGS
1556 elf_fpxregset_t *xfpu = NULL;
1557#endif
1558 int thread_status_size = 0;
1559#ifndef CONFIG_MMU
1560 struct vm_list_struct *vml;
1561#endif
1562 elf_addr_t *auxv;
1563
1564 /*
1565 * We no longer stop all VM operations.
1566 *
1567 * This is because those proceses that could possibly change map_count
1568 * or the mmap / vma pages are now blocked in do_exit on current
1569 * finishing this core dump.
1570 *
1571 * Only ptrace can touch these memory addresses, but it doesn't change
1572 * the map_count or the pages allocated. So no possibility of crashing
1573 * exists while dumping the mm->vm_next areas to the core file.
1574 */
1575
1576 /* alloc memory for large data structures: too large to be on stack */
1577 elf = kmalloc(sizeof(*elf), GFP_KERNEL);
1578 if (!elf)
1579 goto cleanup;
1580 prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL);
1581 if (!prstatus)
1582 goto cleanup;
1583 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
1584 if (!psinfo)
1585 goto cleanup;
1586 notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL);
1587 if (!notes)
1588 goto cleanup;
1589 fpu = kmalloc(sizeof(*fpu), GFP_KERNEL);
1590 if (!fpu)
1591 goto cleanup;
1592#ifdef ELF_CORE_COPY_XFPREGS
1593 xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL);
1594 if (!xfpu)
1595 goto cleanup;
1596#endif
1597
1598 if (signr) {
1599 struct elf_thread_status *tmp;
1600 read_lock(&tasklist_lock);
1601 do_each_thread(g,p)
1602 if (current->mm == p->mm && current != p) {
1603 tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
1604 if (!tmp) {
1605 read_unlock(&tasklist_lock);
1606 goto cleanup;
1607 }
1608 INIT_LIST_HEAD(&tmp->list);
1609 tmp->thread = p;
1610 list_add(&tmp->list, &thread_list);
1611 }
1612 while_each_thread(g,p);
1613 read_unlock(&tasklist_lock);
1614 list_for_each(t, &thread_list) {
1615 struct elf_thread_status *tmp;
1616 int sz;
1617
1618 tmp = list_entry(t, struct elf_thread_status, list);
1619 sz = elf_dump_thread_status(signr, tmp);
1620 thread_status_size += sz;
1621 }
1622 }
1623
1624 /* now collect the dump for the current */
1625 fill_prstatus(prstatus, current, signr);
1626 elf_core_copy_regs(&prstatus->pr_reg, regs);
1627
1628#ifdef CONFIG_MMU
1629 segs = current->mm->map_count;
1630#else
1631 segs = 0;
1632 for (vml = current->mm->context.vmlist; vml; vml = vml->next)
1633 segs++;
1634#endif
1635#ifdef ELF_CORE_EXTRA_PHDRS
1636 segs += ELF_CORE_EXTRA_PHDRS;
1637#endif
1638
1639 /* Set up header */
1640 fill_elf_fdpic_header(elf, segs + 1); /* including notes section */
1641
1642 has_dumped = 1;
1643 current->flags |= PF_DUMPCORE;
1644
1645 /*
1646 * Set up the notes in similar form to SVR4 core dumps made
1647 * with info from their /proc.
1648 */
1649
1650 fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
1651 fill_psinfo(psinfo, current->group_leader, current->mm);
1652 fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
1653
1654 numnote = 2;
1655
1656 auxv = (elf_addr_t *) current->mm->saved_auxv;
1657
1658 i = 0;
1659 do
1660 i += 2;
1661 while (auxv[i - 2] != AT_NULL);
1662 fill_note(&notes[numnote++], "CORE", NT_AUXV,
1663 i * sizeof(elf_addr_t), auxv);
1664
1665 /* Try to dump the FPU. */
1666 if ((prstatus->pr_fpvalid =
1667 elf_core_copy_task_fpregs(current, regs, fpu)))
1668 fill_note(notes + numnote++,
1669 "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
1670#ifdef ELF_CORE_COPY_XFPREGS
1671 if (elf_core_copy_task_xfpregs(current, xfpu))
1672 fill_note(notes + numnote++,
1673 "LINUX", NT_PRXFPREG, sizeof(*xfpu), xfpu);
1674#endif
1675
1676 fs = get_fs();
1677 set_fs(KERNEL_DS);
1678
1679 DUMP_WRITE(elf, sizeof(*elf));
1680 offset += sizeof(*elf); /* Elf header */
1681 offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */
1682
1683 /* Write notes phdr entry */
1684 {
1685 struct elf_phdr phdr;
1686 int sz = 0;
1687
1688 for (i = 0; i < numnote; i++)
1689 sz += notesize(notes + i);
1690
1691 sz += thread_status_size;
1692
1693 fill_elf_note_phdr(&phdr, sz, offset);
1694 offset += sz;
1695 DUMP_WRITE(&phdr, sizeof(phdr));
1696 }
1697
1698 /* Page-align dumped data */
1699 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
1700
1701 /* write program headers for segments dump */
1702 for (
1703#ifdef CONFIG_MMU
1704 vma = current->mm->mmap; vma; vma = vma->vm_next
1705#else
1706 vml = current->mm->context.vmlist; vml; vml = vml->next
1707#endif
1708 ) {
1709 struct elf_phdr phdr;
1710 size_t sz;
1711
1712#ifndef CONFIG_MMU
1713 vma = vml->vma;
1714#endif
1715
1716 sz = vma->vm_end - vma->vm_start;
1717
1718 phdr.p_type = PT_LOAD;
1719 phdr.p_offset = offset;
1720 phdr.p_vaddr = vma->vm_start;
1721 phdr.p_paddr = 0;
1722 phdr.p_filesz = maydump(vma) ? sz : 0;
1723 phdr.p_memsz = sz;
1724 offset += phdr.p_filesz;
1725 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
1726 if (vma->vm_flags & VM_WRITE)
1727 phdr.p_flags |= PF_W;
1728 if (vma->vm_flags & VM_EXEC)
1729 phdr.p_flags |= PF_X;
1730 phdr.p_align = ELF_EXEC_PAGESIZE;
1731
1732 DUMP_WRITE(&phdr, sizeof(phdr));
1733 }
1734
1735#ifdef ELF_CORE_WRITE_EXTRA_PHDRS
1736 ELF_CORE_WRITE_EXTRA_PHDRS;
1737#endif
1738
1739 /* write out the notes section */
1740 for (i = 0; i < numnote; i++)
1741 if (!writenote(notes + i, file))
1742 goto end_coredump;
1743
1744 /* write out the thread status notes section */
1745 list_for_each(t, &thread_list) {
1746 struct elf_thread_status *tmp =
1747 list_entry(t, struct elf_thread_status, list);
1748
1749 for (i = 0; i < tmp->num_notes; i++)
1750 if (!writenote(&tmp->notes[i], file))
1751 goto end_coredump;
1752 }
1753
1754 DUMP_SEEK(dataoff);
1755
1756 if (elf_fdpic_dump_segments(file, current->mm, &size, &limit) < 0)
1757 goto end_coredump;
1758
1759#ifdef ELF_CORE_WRITE_EXTRA_DATA
1760 ELF_CORE_WRITE_EXTRA_DATA;
1761#endif
1762
1763 if (file->f_pos != offset) {
1764 /* Sanity check */
1765 printk(KERN_WARNING
1766 "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n",
1767 file->f_pos, offset);
1768 }
1769
1770end_coredump:
1771 set_fs(fs);
1772
1773cleanup:
1774 while (!list_empty(&thread_list)) {
1775 struct list_head *tmp = thread_list.next;
1776 list_del(tmp);
1777 kfree(list_entry(tmp, struct elf_thread_status, list));
1778 }
1779
1780 kfree(elf);
1781 kfree(prstatus);
1782 kfree(psinfo);
1783 kfree(notes);
1784 kfree(fpu);
1785#ifdef ELF_CORE_COPY_XFPREGS
1786 kfree(xfpu);
1787#endif
1788 return has_dumped;
1789#undef NUM_NOTES
1790}
1791
1792#endif /* USE_ELF_CORE_DUMP */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9633a490dab0..045f98854f14 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -739,7 +739,7 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
739 if (!bo) 739 if (!bo)
740 return -ENOMEM; 740 return -ENOMEM;
741 741
742 mutex_lock(&bdev->bd_mutex); 742 mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION);
743 res = bd_claim(bdev, holder); 743 res = bd_claim(bdev, holder);
744 if (res || !add_bd_holder(bdev, bo)) 744 if (res || !add_bd_holder(bdev, bo))
745 free_bd_holder(bo); 745 free_bd_holder(bo);
@@ -764,7 +764,7 @@ static void bd_release_from_kobject(struct block_device *bdev,
764 if (!kobj) 764 if (!kobj)
765 return; 765 return;
766 766
767 mutex_lock(&bdev->bd_mutex); 767 mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION);
768 bd_release(bdev); 768 bd_release(bdev);
769 if ((bo = del_bd_holder(bdev, kobj))) 769 if ((bo = del_bd_holder(bdev, kobj)))
770 free_bd_holder(bo); 770 free_bd_holder(bo);
@@ -822,6 +822,22 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
822 822
823EXPORT_SYMBOL(open_by_devnum); 823EXPORT_SYMBOL(open_by_devnum);
824 824
825static int
826blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags);
827
828struct block_device *open_partition_by_devnum(dev_t dev, unsigned mode)
829{
830 struct block_device *bdev = bdget(dev);
831 int err = -ENOMEM;
832 int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
833 if (bdev)
834 err = blkdev_get_partition(bdev, mode, flags);
835 return err ? ERR_PTR(err) : bdev;
836}
837
838EXPORT_SYMBOL(open_partition_by_devnum);
839
840
825/* 841/*
826 * This routine checks whether a removable media has been changed, 842 * This routine checks whether a removable media has been changed,
827 * and invalidates all buffer-cache-entries in that case. This 843 * and invalidates all buffer-cache-entries in that case. This
@@ -868,7 +884,66 @@ void bd_set_size(struct block_device *bdev, loff_t size)
868} 884}
869EXPORT_SYMBOL(bd_set_size); 885EXPORT_SYMBOL(bd_set_size);
870 886
871static int do_open(struct block_device *bdev, struct file *file) 887static int __blkdev_put(struct block_device *bdev, unsigned int subclass)
888{
889 int ret = 0;
890 struct inode *bd_inode = bdev->bd_inode;
891 struct gendisk *disk = bdev->bd_disk;
892
893 mutex_lock_nested(&bdev->bd_mutex, subclass);
894 lock_kernel();
895 if (!--bdev->bd_openers) {
896 sync_blockdev(bdev);
897 kill_bdev(bdev);
898 }
899 if (bdev->bd_contains == bdev) {
900 if (disk->fops->release)
901 ret = disk->fops->release(bd_inode, NULL);
902 } else {
903 mutex_lock_nested(&bdev->bd_contains->bd_mutex,
904 subclass + 1);
905 bdev->bd_contains->bd_part_count--;
906 mutex_unlock(&bdev->bd_contains->bd_mutex);
907 }
908 if (!bdev->bd_openers) {
909 struct module *owner = disk->fops->owner;
910
911 put_disk(disk);
912 module_put(owner);
913
914 if (bdev->bd_contains != bdev) {
915 kobject_put(&bdev->bd_part->kobj);
916 bdev->bd_part = NULL;
917 }
918 bdev->bd_disk = NULL;
919 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
920 if (bdev != bdev->bd_contains)
921 __blkdev_put(bdev->bd_contains, subclass + 1);
922 bdev->bd_contains = NULL;
923 }
924 unlock_kernel();
925 mutex_unlock(&bdev->bd_mutex);
926 bdput(bdev);
927 return ret;
928}
929
930int blkdev_put(struct block_device *bdev)
931{
932 return __blkdev_put(bdev, BD_MUTEX_NORMAL);
933}
934EXPORT_SYMBOL(blkdev_put);
935
936int blkdev_put_partition(struct block_device *bdev)
937{
938 return __blkdev_put(bdev, BD_MUTEX_PARTITION);
939}
940EXPORT_SYMBOL(blkdev_put_partition);
941
942static int
943blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags);
944
945static int
946do_open(struct block_device *bdev, struct file *file, unsigned int subclass)
872{ 947{
873 struct module *owner = NULL; 948 struct module *owner = NULL;
874 struct gendisk *disk; 949 struct gendisk *disk;
@@ -885,7 +960,8 @@ static int do_open(struct block_device *bdev, struct file *file)
885 } 960 }
886 owner = disk->fops->owner; 961 owner = disk->fops->owner;
887 962
888 mutex_lock(&bdev->bd_mutex); 963 mutex_lock_nested(&bdev->bd_mutex, subclass);
964
889 if (!bdev->bd_openers) { 965 if (!bdev->bd_openers) {
890 bdev->bd_disk = disk; 966 bdev->bd_disk = disk;
891 bdev->bd_contains = bdev; 967 bdev->bd_contains = bdev;
@@ -912,11 +988,11 @@ static int do_open(struct block_device *bdev, struct file *file)
912 ret = -ENOMEM; 988 ret = -ENOMEM;
913 if (!whole) 989 if (!whole)
914 goto out_first; 990 goto out_first;
915 ret = blkdev_get(whole, file->f_mode, file->f_flags); 991 ret = blkdev_get_whole(whole, file->f_mode, file->f_flags);
916 if (ret) 992 if (ret)
917 goto out_first; 993 goto out_first;
918 bdev->bd_contains = whole; 994 bdev->bd_contains = whole;
919 mutex_lock(&whole->bd_mutex); 995 mutex_lock_nested(&whole->bd_mutex, BD_MUTEX_WHOLE);
920 whole->bd_part_count++; 996 whole->bd_part_count++;
921 p = disk->part[part - 1]; 997 p = disk->part[part - 1];
922 bdev->bd_inode->i_data.backing_dev_info = 998 bdev->bd_inode->i_data.backing_dev_info =
@@ -944,7 +1020,8 @@ static int do_open(struct block_device *bdev, struct file *file)
944 if (bdev->bd_invalidated) 1020 if (bdev->bd_invalidated)
945 rescan_partitions(bdev->bd_disk, bdev); 1021 rescan_partitions(bdev->bd_disk, bdev);
946 } else { 1022 } else {
947 mutex_lock(&bdev->bd_contains->bd_mutex); 1023 mutex_lock_nested(&bdev->bd_contains->bd_mutex,
1024 BD_MUTEX_PARTITION);
948 bdev->bd_contains->bd_part_count++; 1025 bdev->bd_contains->bd_part_count++;
949 mutex_unlock(&bdev->bd_contains->bd_mutex); 1026 mutex_unlock(&bdev->bd_contains->bd_mutex);
950 } 1027 }
@@ -958,7 +1035,7 @@ out_first:
958 bdev->bd_disk = NULL; 1035 bdev->bd_disk = NULL;
959 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1036 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
960 if (bdev != bdev->bd_contains) 1037 if (bdev != bdev->bd_contains)
961 blkdev_put(bdev->bd_contains); 1038 __blkdev_put(bdev->bd_contains, BD_MUTEX_WHOLE);
962 bdev->bd_contains = NULL; 1039 bdev->bd_contains = NULL;
963 put_disk(disk); 1040 put_disk(disk);
964 module_put(owner); 1041 module_put(owner);
@@ -985,11 +1062,49 @@ int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags)
985 fake_file.f_dentry = &fake_dentry; 1062 fake_file.f_dentry = &fake_dentry;
986 fake_dentry.d_inode = bdev->bd_inode; 1063 fake_dentry.d_inode = bdev->bd_inode;
987 1064
988 return do_open(bdev, &fake_file); 1065 return do_open(bdev, &fake_file, BD_MUTEX_NORMAL);
989} 1066}
990 1067
991EXPORT_SYMBOL(blkdev_get); 1068EXPORT_SYMBOL(blkdev_get);
992 1069
1070static int
1071blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags)
1072{
1073 /*
1074 * This crockload is due to bad choice of ->open() type.
1075 * It will go away.
1076 * For now, block device ->open() routine must _not_
1077 * examine anything in 'inode' argument except ->i_rdev.
1078 */
1079 struct file fake_file = {};
1080 struct dentry fake_dentry = {};
1081 fake_file.f_mode = mode;
1082 fake_file.f_flags = flags;
1083 fake_file.f_dentry = &fake_dentry;
1084 fake_dentry.d_inode = bdev->bd_inode;
1085
1086 return do_open(bdev, &fake_file, BD_MUTEX_WHOLE);
1087}
1088
1089static int
1090blkdev_get_partition(struct block_device *bdev, mode_t mode, unsigned flags)
1091{
1092 /*
1093 * This crockload is due to bad choice of ->open() type.
1094 * It will go away.
1095 * For now, block device ->open() routine must _not_
1096 * examine anything in 'inode' argument except ->i_rdev.
1097 */
1098 struct file fake_file = {};
1099 struct dentry fake_dentry = {};
1100 fake_file.f_mode = mode;
1101 fake_file.f_flags = flags;
1102 fake_file.f_dentry = &fake_dentry;
1103 fake_dentry.d_inode = bdev->bd_inode;
1104
1105 return do_open(bdev, &fake_file, BD_MUTEX_PARTITION);
1106}
1107
993static int blkdev_open(struct inode * inode, struct file * filp) 1108static int blkdev_open(struct inode * inode, struct file * filp)
994{ 1109{
995 struct block_device *bdev; 1110 struct block_device *bdev;
@@ -1005,7 +1120,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1005 1120
1006 bdev = bd_acquire(inode); 1121 bdev = bd_acquire(inode);
1007 1122
1008 res = do_open(bdev, filp); 1123 res = do_open(bdev, filp, BD_MUTEX_NORMAL);
1009 if (res) 1124 if (res)
1010 return res; 1125 return res;
1011 1126
@@ -1019,51 +1134,6 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1019 return res; 1134 return res;
1020} 1135}
1021 1136
1022int blkdev_put(struct block_device *bdev)
1023{
1024 int ret = 0;
1025 struct inode *bd_inode = bdev->bd_inode;
1026 struct gendisk *disk = bdev->bd_disk;
1027
1028 mutex_lock(&bdev->bd_mutex);
1029 lock_kernel();
1030 if (!--bdev->bd_openers) {
1031 sync_blockdev(bdev);
1032 kill_bdev(bdev);
1033 }
1034 if (bdev->bd_contains == bdev) {
1035 if (disk->fops->release)
1036 ret = disk->fops->release(bd_inode, NULL);
1037 } else {
1038 mutex_lock(&bdev->bd_contains->bd_mutex);
1039 bdev->bd_contains->bd_part_count--;
1040 mutex_unlock(&bdev->bd_contains->bd_mutex);
1041 }
1042 if (!bdev->bd_openers) {
1043 struct module *owner = disk->fops->owner;
1044
1045 put_disk(disk);
1046 module_put(owner);
1047
1048 if (bdev->bd_contains != bdev) {
1049 kobject_put(&bdev->bd_part->kobj);
1050 bdev->bd_part = NULL;
1051 }
1052 bdev->bd_disk = NULL;
1053 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1054 if (bdev != bdev->bd_contains) {
1055 blkdev_put(bdev->bd_contains);
1056 }
1057 bdev->bd_contains = NULL;
1058 }
1059 unlock_kernel();
1060 mutex_unlock(&bdev->bd_mutex);
1061 bdput(bdev);
1062 return ret;
1063}
1064
1065EXPORT_SYMBOL(blkdev_put);
1066
1067static int blkdev_close(struct inode * inode, struct file * filp) 1137static int blkdev_close(struct inode * inode, struct file * filp)
1068{ 1138{
1069 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1139 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
diff --git a/fs/buffer.c b/fs/buffer.c
index 3660dcb97591..71649ef9b658 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -470,13 +470,18 @@ out:
470 pass does the actual I/O. */ 470 pass does the actual I/O. */
471void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) 471void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
472{ 472{
473 struct address_space *mapping = bdev->bd_inode->i_mapping;
474
475 if (mapping->nrpages == 0)
476 return;
477
473 invalidate_bh_lrus(); 478 invalidate_bh_lrus();
474 /* 479 /*
475 * FIXME: what about destroy_dirty_buffers? 480 * FIXME: what about destroy_dirty_buffers?
476 * We really want to use invalidate_inode_pages2() for 481 * We really want to use invalidate_inode_pages2() for
477 * that, but not until that's cleaned up. 482 * that, but not until that's cleaned up.
478 */ 483 */
479 invalidate_inode_pages(bdev->bd_inode->i_mapping); 484 invalidate_inode_pages(mapping);
480} 485}
481 486
482/* 487/*
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a4cbc6706ef0..3483d3cf8087 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -182,6 +182,28 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
182 return 0; 182 return 0;
183} 183}
184 184
185/**
186 * register_chrdev() - Register a major number for character devices.
187 * @major: major device number or 0 for dynamic allocation
188 * @name: name of this range of devices
189 * @fops: file operations associated with these devices
190 *
191 * If @major == 0 this function will dynamically allocate a major and return
192 * its number.
193 *
194 * If @major > 0 this function will attempt to reserve a device with the given
195 * major number and will return zero on success.
196 *
197 * Returns a -ve errno on failure.
198 *
199 * The name of this device has nothing to do with the name of the device in
200 * /dev. It only helps to keep track of the different owners of devices. If
201 * your module has only one type of device it's ok to use e.g. the name
202 * of the module here.
203 *
204 * This function registers a range of 256 minor numbers. The first minor number
205 * is 0.
206 */
185int register_chrdev(unsigned int major, const char *name, 207int register_chrdev(unsigned int major, const char *name,
186 const struct file_operations *fops) 208 const struct file_operations *fops)
187{ 209{
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index a61d17ed1827..0feb3bd49cb8 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,13 @@
1Version 1.45
2------------
3Do not time out lockw calls when using posix extensions. Do not
4time out requests if server still responding reasonably fast
5on requests on other threads. Improve POSIX locking emulation,
6(lock cancel now works, and unlock of merged range works even
7to Windows servers now). Fix oops on mount to lanman servers
8(win9x, os/2 etc.) when null password. Do not send listxattr
9(SMB to query all EAs) if nouser_xattr specified.
10
1Version 1.44 11Version 1.44
2------------ 12------------
3Rewritten sessionsetup support, including support for legacy SMB 13Rewritten sessionsetup support, including support for legacy SMB
diff --git a/fs/cifs/README b/fs/cifs/README
index 7986d0d97ace..5f0e1bd64fee 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -408,7 +408,7 @@ A partial list of the supported mount options follows:
408 user_xattr Allow getting and setting user xattrs as OS/2 EAs (extended 408 user_xattr Allow getting and setting user xattrs as OS/2 EAs (extended
409 attributes) to the server (default) e.g. via setfattr 409 attributes) to the server (default) e.g. via setfattr
410 and getfattr utilities. 410 and getfattr utilities.
411 nouser_xattr Do not allow getfattr/setfattr to get/set xattrs 411 nouser_xattr Do not allow getfattr/setfattr to get/set/list xattrs
412 mapchars Translate six of the seven reserved characters (not backslash) 412 mapchars Translate six of the seven reserved characters (not backslash)
413 *?<>|: 413 *?<>|:
414 to the remap range (above 0xF000), which also 414 to the remap range (above 0xF000), which also
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index a89efaf78a26..4bc250b2d9fc 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -277,7 +277,8 @@ void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key)
277 return; 277 return;
278 278
279 memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); 279 memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
280 strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE); 280 if(ses->password)
281 strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE);
281 282
282 if((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) 283 if((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0)
283 if(extended_security & CIFSSEC_MAY_PLNTXT) { 284 if(extended_security & CIFSSEC_MAY_PLNTXT) {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c28ede599946..3cd750029be2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -402,7 +402,6 @@ static struct quotactl_ops cifs_quotactl_ops = {
402}; 402};
403#endif 403#endif
404 404
405#ifdef CONFIG_CIFS_EXPERIMENTAL
406static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags) 405static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
407{ 406{
408 struct cifs_sb_info *cifs_sb; 407 struct cifs_sb_info *cifs_sb;
@@ -422,7 +421,7 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
422 tcon->tidStatus = CifsExiting; 421 tcon->tidStatus = CifsExiting;
423 up(&tcon->tconSem); 422 up(&tcon->tconSem);
424 423
425 /* cancel_brl_requests(tcon); */ 424 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
426 /* cancel_notify_requests(tcon); */ 425 /* cancel_notify_requests(tcon); */
427 if(tcon->ses && tcon->ses->server) 426 if(tcon->ses && tcon->ses->server)
428 { 427 {
@@ -438,7 +437,6 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
438 437
439 return; 438 return;
440} 439}
441#endif
442 440
443static int cifs_remount(struct super_block *sb, int *flags, char *data) 441static int cifs_remount(struct super_block *sb, int *flags, char *data)
444{ 442{
@@ -457,9 +455,7 @@ struct super_operations cifs_super_ops = {
457 unless later we add lazy close of inodes or unless the kernel forgets to call 455 unless later we add lazy close of inodes or unless the kernel forgets to call
458 us with the same number of releases (closes) as opens */ 456 us with the same number of releases (closes) as opens */
459 .show_options = cifs_show_options, 457 .show_options = cifs_show_options,
460#ifdef CONFIG_CIFS_EXPERIMENTAL
461 .umount_begin = cifs_umount_begin, 458 .umount_begin = cifs_umount_begin,
462#endif
463 .remount_fs = cifs_remount, 459 .remount_fs = cifs_remount,
464}; 460};
465 461
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 8f75c6f24701..39ee8ef3bdeb 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
100extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); 100extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
101extern int cifs_ioctl (struct inode * inode, struct file * filep, 101extern int cifs_ioctl (struct inode * inode, struct file * filep,
102 unsigned int command, unsigned long arg); 102 unsigned int command, unsigned long arg);
103#define CIFS_VERSION "1.44" 103#define CIFS_VERSION "1.45"
104#endif /* _CIFSFS_H */ 104#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6d7cf5f3bc0b..b24006c47df1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2006 4 * Copyright (C) International Business Machines Corp., 2002,2006
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * Jeremy Allison (jra@samba.org)
6 * 7 *
7 * This library is free software; you can redistribute it and/or modify 8 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published 9 * it under the terms of the GNU Lesser General Public License as published
@@ -158,7 +159,8 @@ struct TCP_Server_Info {
158 /* 16th byte of RFC1001 workstation name is always null */ 159 /* 16th byte of RFC1001 workstation name is always null */
159 char workstation_RFC1001_name[SERVER_NAME_LEN_WITH_NULL]; 160 char workstation_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
160 __u32 sequence_number; /* needed for CIFS PDU signature */ 161 __u32 sequence_number; /* needed for CIFS PDU signature */
161 char mac_signing_key[CIFS_SESS_KEY_SIZE + 16]; 162 char mac_signing_key[CIFS_SESS_KEY_SIZE + 16];
163 unsigned long lstrp; /* when we got last response from this server */
162}; 164};
163 165
164/* 166/*
@@ -266,14 +268,14 @@ struct cifsTconInfo {
266}; 268};
267 269
268/* 270/*
269 * This info hangs off the cifsFileInfo structure. This is used to track 271 * This info hangs off the cifsFileInfo structure, pointed to by llist.
270 * byte stream locks on the file 272 * This is used to track byte stream locks on the file
271 */ 273 */
272struct cifsLockInfo { 274struct cifsLockInfo {
273 struct cifsLockInfo *next; 275 struct list_head llist; /* pointer to next cifsLockInfo */
274 int start; 276 __u64 offset;
275 int length; 277 __u64 length;
276 int type; 278 __u8 type;
277}; 279};
278 280
279/* 281/*
@@ -304,6 +306,8 @@ struct cifsFileInfo {
304 /* lock scope id (0 if none) */ 306 /* lock scope id (0 if none) */
305 struct file * pfile; /* needed for writepage */ 307 struct file * pfile; /* needed for writepage */
306 struct inode * pInode; /* needed for oplock break */ 308 struct inode * pInode; /* needed for oplock break */
309 struct semaphore lock_sem;
310 struct list_head llist; /* list of byte range locks we have. */
307 unsigned closePend:1; /* file is marked to close */ 311 unsigned closePend:1; /* file is marked to close */
308 unsigned invalidHandle:1; /* file closed via session abend */ 312 unsigned invalidHandle:1; /* file closed via session abend */
309 atomic_t wrtPending; /* handle in use - defer close */ 313 atomic_t wrtPending; /* handle in use - defer close */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index a5ddc62d6fe6..b35c55c3c8bb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -50,6 +50,10 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
50extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *, 50extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
51 struct kvec *, int /* nvec to send */, 51 struct kvec *, int /* nvec to send */,
52 int * /* type of buf returned */ , const int long_op); 52 int * /* type of buf returned */ , const int long_op);
53extern int SendReceiveBlockingLock(const unsigned int /* xid */ , struct cifsTconInfo *,
54 struct smb_hdr * /* input */ ,
55 struct smb_hdr * /* out */ ,
56 int * /* bytes returned */);
53extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid); 57extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid);
54extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length); 58extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length);
55extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); 59extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 19678c575dfc..075d8fb3d376 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -477,7 +477,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
477 /* BB get server time for time conversions and add 477 /* BB get server time for time conversions and add
478 code to use it and timezone since this is not UTC */ 478 code to use it and timezone since this is not UTC */
479 479
480 if (rsp->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { 480 if (rsp->EncryptionKeyLength == cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
481 memcpy(server->cryptKey, rsp->EncryptionKey, 481 memcpy(server->cryptKey, rsp->EncryptionKey,
482 CIFS_CRYPTO_KEY_SIZE); 482 CIFS_CRYPTO_KEY_SIZE);
483 } else if (server->secMode & SECMODE_PW_ENCRYPT) { 483 } else if (server->secMode & SECMODE_PW_ENCRYPT) {
@@ -1460,8 +1460,13 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1460 pSMB->hdr.smb_buf_length += count; 1460 pSMB->hdr.smb_buf_length += count;
1461 pSMB->ByteCount = cpu_to_le16(count); 1461 pSMB->ByteCount = cpu_to_le16(count);
1462 1462
1463 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1463 if (waitFlag) {
1464 rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
1465 (struct smb_hdr *) pSMBr, &bytes_returned);
1466 } else {
1467 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1464 (struct smb_hdr *) pSMBr, &bytes_returned, timeout); 1468 (struct smb_hdr *) pSMBr, &bytes_returned, timeout);
1469 }
1465 cifs_stats_inc(&tcon->num_locks); 1470 cifs_stats_inc(&tcon->num_locks);
1466 if (rc) { 1471 if (rc) {
1467 cFYI(1, ("Send error in Lock = %d", rc)); 1472 cFYI(1, ("Send error in Lock = %d", rc));
@@ -1484,6 +1489,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1484 char *data_offset; 1489 char *data_offset;
1485 struct cifs_posix_lock *parm_data; 1490 struct cifs_posix_lock *parm_data;
1486 int rc = 0; 1491 int rc = 0;
1492 int timeout = 0;
1487 int bytes_returned = 0; 1493 int bytes_returned = 0;
1488 __u16 params, param_offset, offset, byte_count, count; 1494 __u16 params, param_offset, offset, byte_count, count;
1489 1495
@@ -1503,7 +1509,6 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1503 pSMB->MaxSetupCount = 0; 1509 pSMB->MaxSetupCount = 0;
1504 pSMB->Reserved = 0; 1510 pSMB->Reserved = 0;
1505 pSMB->Flags = 0; 1511 pSMB->Flags = 0;
1506 pSMB->Timeout = 0;
1507 pSMB->Reserved2 = 0; 1512 pSMB->Reserved2 = 0;
1508 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; 1513 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
1509 offset = param_offset + params; 1514 offset = param_offset + params;
@@ -1529,8 +1534,13 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1529 (((char *) &pSMB->hdr.Protocol) + offset); 1534 (((char *) &pSMB->hdr.Protocol) + offset);
1530 1535
1531 parm_data->lock_type = cpu_to_le16(lock_type); 1536 parm_data->lock_type = cpu_to_le16(lock_type);
1532 if(waitFlag) 1537 if(waitFlag) {
1538 timeout = 3; /* blocking operation, no timeout */
1533 parm_data->lock_flags = cpu_to_le16(1); 1539 parm_data->lock_flags = cpu_to_le16(1);
1540 pSMB->Timeout = cpu_to_le32(-1);
1541 } else
1542 pSMB->Timeout = 0;
1543
1534 parm_data->pid = cpu_to_le32(current->tgid); 1544 parm_data->pid = cpu_to_le32(current->tgid);
1535 parm_data->start = cpu_to_le64(pLockData->fl_start); 1545 parm_data->start = cpu_to_le64(pLockData->fl_start);
1536 parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ 1546 parm_data->length = cpu_to_le64(len); /* normalize negative numbers */
@@ -1541,8 +1551,14 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1541 pSMB->Reserved4 = 0; 1551 pSMB->Reserved4 = 0;
1542 pSMB->hdr.smb_buf_length += byte_count; 1552 pSMB->hdr.smb_buf_length += byte_count;
1543 pSMB->ByteCount = cpu_to_le16(byte_count); 1553 pSMB->ByteCount = cpu_to_le16(byte_count);
1544 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1554 if (waitFlag) {
1545 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 1555 rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
1556 (struct smb_hdr *) pSMBr, &bytes_returned);
1557 } else {
1558 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1559 (struct smb_hdr *) pSMBr, &bytes_returned, timeout);
1560 }
1561
1546 if (rc) { 1562 if (rc) {
1547 cFYI(1, ("Send error in Posix Lock = %d", rc)); 1563 cFYI(1, ("Send error in Posix Lock = %d", rc));
1548 } else if (get_flag) { 1564 } else if (get_flag) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 876eb9ef85fe..5d394c726860 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -182,6 +182,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
182 182
183 while ((server->tcpStatus != CifsExiting) && (server->tcpStatus != CifsGood)) 183 while ((server->tcpStatus != CifsExiting) && (server->tcpStatus != CifsGood))
184 { 184 {
185 try_to_freeze();
185 if(server->protocolType == IPV6) { 186 if(server->protocolType == IPV6) {
186 rc = ipv6_connect(&server->addr.sockAddr6,&server->ssocket); 187 rc = ipv6_connect(&server->addr.sockAddr6,&server->ssocket);
187 } else { 188 } else {
@@ -612,6 +613,10 @@ multi_t2_fnd:
612#ifdef CONFIG_CIFS_STATS2 613#ifdef CONFIG_CIFS_STATS2
613 mid_entry->when_received = jiffies; 614 mid_entry->when_received = jiffies;
614#endif 615#endif
616 /* so we do not time out requests to server
617 which is still responding (since server could
618 be busy but not dead) */
619 server->lstrp = jiffies;
615 break; 620 break;
616 } 621 }
617 } 622 }
@@ -1266,33 +1271,35 @@ find_unc(__be32 new_target_ip_addr, char *uncName, char *userName)
1266 1271
1267 read_lock(&GlobalSMBSeslock); 1272 read_lock(&GlobalSMBSeslock);
1268 list_for_each(tmp, &GlobalTreeConnectionList) { 1273 list_for_each(tmp, &GlobalTreeConnectionList) {
1269 cFYI(1, ("Next tcon - ")); 1274 cFYI(1, ("Next tcon"));
1270 tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); 1275 tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
1271 if (tcon->ses) { 1276 if (tcon->ses) {
1272 if (tcon->ses->server) { 1277 if (tcon->ses->server) {
1273 cFYI(1, 1278 cFYI(1,
1274 (" old ip addr: %x == new ip %x ?", 1279 ("old ip addr: %x == new ip %x ?",
1275 tcon->ses->server->addr.sockAddr.sin_addr. 1280 tcon->ses->server->addr.sockAddr.sin_addr.
1276 s_addr, new_target_ip_addr)); 1281 s_addr, new_target_ip_addr));
1277 if (tcon->ses->server->addr.sockAddr.sin_addr. 1282 if (tcon->ses->server->addr.sockAddr.sin_addr.
1278 s_addr == new_target_ip_addr) { 1283 s_addr == new_target_ip_addr) {
1279 /* BB lock tcon and server and tcp session and increment use count here? */ 1284 /* BB lock tcon, server and tcp session and increment use count here? */
1280 /* found a match on the TCP session */ 1285 /* found a match on the TCP session */
1281 /* BB check if reconnection needed */ 1286 /* BB check if reconnection needed */
1282 cFYI(1,("Matched ip, old UNC: %s == new: %s ?", 1287 cFYI(1,("IP match, old UNC: %s new: %s",
1283 tcon->treeName, uncName)); 1288 tcon->treeName, uncName));
1284 if (strncmp 1289 if (strncmp
1285 (tcon->treeName, uncName, 1290 (tcon->treeName, uncName,
1286 MAX_TREE_SIZE) == 0) { 1291 MAX_TREE_SIZE) == 0) {
1287 cFYI(1, 1292 cFYI(1,
1288 ("Matched UNC, old user: %s == new: %s ?", 1293 ("and old usr: %s new: %s",
1289 tcon->treeName, uncName)); 1294 tcon->treeName, uncName));
1290 if (strncmp 1295 if (strncmp
1291 (tcon->ses->userName, 1296 (tcon->ses->userName,
1292 userName, 1297 userName,
1293 MAX_USERNAME_SIZE) == 0) { 1298 MAX_USERNAME_SIZE) == 0) {
1294 read_unlock(&GlobalSMBSeslock); 1299 read_unlock(&GlobalSMBSeslock);
1295 return tcon;/* also matched user (smb session)*/ 1300 /* matched smb session
1301 (user name) */
1302 return tcon;
1296 } 1303 }
1297 } 1304 }
1298 } 1305 }
@@ -1969,7 +1976,18 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
1969 } 1976 }
1970 1977
1971 cFYI(1,("Negotiate caps 0x%x",(int)cap)); 1978 cFYI(1,("Negotiate caps 0x%x",(int)cap));
1972 1979#ifdef CONFIG_CIFS_DEBUG2
1980 if(cap & CIFS_UNIX_FCNTL_CAP)
1981 cFYI(1,("FCNTL cap"));
1982 if(cap & CIFS_UNIX_EXTATTR_CAP)
1983 cFYI(1,("EXTATTR cap"));
1984 if(cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
1985 cFYI(1,("POSIX path cap"));
1986 if(cap & CIFS_UNIX_XATTR_CAP)
1987 cFYI(1,("XATTR cap"));
1988 if(cap & CIFS_UNIX_POSIX_ACL_CAP)
1989 cFYI(1,("POSIX ACL cap"));
1990#endif /* CIFS_DEBUG2 */
1973 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { 1991 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
1974 cFYI(1,("setting capabilities failed")); 1992 cFYI(1,("setting capabilities failed"));
1975 } 1993 }
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index ba4cbe9b0684..914239d53634 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -267,6 +267,10 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
267 pCifsFile->invalidHandle = FALSE; 267 pCifsFile->invalidHandle = FALSE;
268 pCifsFile->closePend = FALSE; 268 pCifsFile->closePend = FALSE;
269 init_MUTEX(&pCifsFile->fh_sem); 269 init_MUTEX(&pCifsFile->fh_sem);
270 init_MUTEX(&pCifsFile->lock_sem);
271 INIT_LIST_HEAD(&pCifsFile->llist);
272 atomic_set(&pCifsFile->wrtPending,0);
273
270 /* set the following in open now 274 /* set the following in open now
271 pCifsFile->pfile = file; */ 275 pCifsFile->pfile = file; */
272 write_lock(&GlobalSMBSeslock); 276 write_lock(&GlobalSMBSeslock);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 944d2b9e092d..e9c5ba9084fc 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -5,6 +5,7 @@
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2002,2003 6 * Copyright (C) International Business Machines Corp., 2002,2003
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * Jeremy Allison (jra@samba.org)
8 * 9 *
9 * This library is free software; you can redistribute it and/or modify 10 * This library is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as published 11 * it under the terms of the GNU Lesser General Public License as published
@@ -47,6 +48,8 @@ static inline struct cifsFileInfo *cifs_init_private(
47 private_data->netfid = netfid; 48 private_data->netfid = netfid;
48 private_data->pid = current->tgid; 49 private_data->pid = current->tgid;
49 init_MUTEX(&private_data->fh_sem); 50 init_MUTEX(&private_data->fh_sem);
51 init_MUTEX(&private_data->lock_sem);
52 INIT_LIST_HEAD(&private_data->llist);
50 private_data->pfile = file; /* needed for writepage */ 53 private_data->pfile = file; /* needed for writepage */
51 private_data->pInode = inode; 54 private_data->pInode = inode;
52 private_data->invalidHandle = FALSE; 55 private_data->invalidHandle = FALSE;
@@ -473,6 +476,8 @@ int cifs_close(struct inode *inode, struct file *file)
473 cifs_sb = CIFS_SB(inode->i_sb); 476 cifs_sb = CIFS_SB(inode->i_sb);
474 pTcon = cifs_sb->tcon; 477 pTcon = cifs_sb->tcon;
475 if (pSMBFile) { 478 if (pSMBFile) {
479 struct cifsLockInfo *li, *tmp;
480
476 pSMBFile->closePend = TRUE; 481 pSMBFile->closePend = TRUE;
477 if (pTcon) { 482 if (pTcon) {
478 /* no sense reconnecting to close a file that is 483 /* no sense reconnecting to close a file that is
@@ -496,6 +501,16 @@ int cifs_close(struct inode *inode, struct file *file)
496 pSMBFile->netfid); 501 pSMBFile->netfid);
497 } 502 }
498 } 503 }
504
505 /* Delete any outstanding lock records.
506 We'll lose them when the file is closed anyway. */
507 down(&pSMBFile->lock_sem);
508 list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
509 list_del(&li->llist);
510 kfree(li);
511 }
512 up(&pSMBFile->lock_sem);
513
499 write_lock(&GlobalSMBSeslock); 514 write_lock(&GlobalSMBSeslock);
500 list_del(&pSMBFile->flist); 515 list_del(&pSMBFile->flist);
501 list_del(&pSMBFile->tlist); 516 list_del(&pSMBFile->tlist);
@@ -570,6 +585,21 @@ int cifs_closedir(struct inode *inode, struct file *file)
570 return rc; 585 return rc;
571} 586}
572 587
588static int store_file_lock(struct cifsFileInfo *fid, __u64 len,
589 __u64 offset, __u8 lockType)
590{
591 struct cifsLockInfo *li = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
592 if (li == NULL)
593 return -ENOMEM;
594 li->offset = offset;
595 li->length = len;
596 li->type = lockType;
597 down(&fid->lock_sem);
598 list_add(&li->llist, &fid->llist);
599 up(&fid->lock_sem);
600 return 0;
601}
602
573int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) 603int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
574{ 604{
575 int rc, xid; 605 int rc, xid;
@@ -581,6 +611,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
581 struct cifsTconInfo *pTcon; 611 struct cifsTconInfo *pTcon;
582 __u16 netfid; 612 __u16 netfid;
583 __u8 lockType = LOCKING_ANDX_LARGE_FILES; 613 __u8 lockType = LOCKING_ANDX_LARGE_FILES;
614 int posix_locking;
584 615
585 length = 1 + pfLock->fl_end - pfLock->fl_start; 616 length = 1 + pfLock->fl_end - pfLock->fl_start;
586 rc = -EACCES; 617 rc = -EACCES;
@@ -639,15 +670,14 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
639 } 670 }
640 netfid = ((struct cifsFileInfo *)file->private_data)->netfid; 671 netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
641 672
673 posix_locking = (cifs_sb->tcon->ses->capabilities & CAP_UNIX) &&
674 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability));
642 675
643 /* BB add code here to normalize offset and length to 676 /* BB add code here to normalize offset and length to
644 account for negative length which we can not accept over the 677 account for negative length which we can not accept over the
645 wire */ 678 wire */
646 if (IS_GETLK(cmd)) { 679 if (IS_GETLK(cmd)) {
647 if(experimEnabled && 680 if(posix_locking) {
648 (cifs_sb->tcon->ses->capabilities & CAP_UNIX) &&
649 (CIFS_UNIX_FCNTL_CAP &
650 le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability))) {
651 int posix_lock_type; 681 int posix_lock_type;
652 if(lockType & LOCKING_ANDX_SHARED_LOCK) 682 if(lockType & LOCKING_ANDX_SHARED_LOCK)
653 posix_lock_type = CIFS_RDLCK; 683 posix_lock_type = CIFS_RDLCK;
@@ -683,10 +713,15 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
683 FreeXid(xid); 713 FreeXid(xid);
684 return rc; 714 return rc;
685 } 715 }
686 if (experimEnabled && 716
687 (cifs_sb->tcon->ses->capabilities & CAP_UNIX) && 717 if (!numLock && !numUnlock) {
688 (CIFS_UNIX_FCNTL_CAP & 718 /* if no lock or unlock then nothing
689 le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability))) { 719 to do since we do not know what it is */
720 FreeXid(xid);
721 return -EOPNOTSUPP;
722 }
723
724 if (posix_locking) {
690 int posix_lock_type; 725 int posix_lock_type;
691 if(lockType & LOCKING_ANDX_SHARED_LOCK) 726 if(lockType & LOCKING_ANDX_SHARED_LOCK)
692 posix_lock_type = CIFS_RDLCK; 727 posix_lock_type = CIFS_RDLCK;
@@ -695,18 +730,46 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
695 730
696 if(numUnlock == 1) 731 if(numUnlock == 1)
697 posix_lock_type = CIFS_UNLCK; 732 posix_lock_type = CIFS_UNLCK;
698 else if(numLock == 0) { 733
699 /* if no lock or unlock then nothing
700 to do since we do not know what it is */
701 FreeXid(xid);
702 return -EOPNOTSUPP;
703 }
704 rc = CIFSSMBPosixLock(xid, pTcon, netfid, 0 /* set */, 734 rc = CIFSSMBPosixLock(xid, pTcon, netfid, 0 /* set */,
705 length, pfLock, 735 length, pfLock,
706 posix_lock_type, wait_flag); 736 posix_lock_type, wait_flag);
707 } else 737 } else {
708 rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start, 738 struct cifsFileInfo *fid = (struct cifsFileInfo *)file->private_data;
709 numUnlock, numLock, lockType, wait_flag); 739
740 if (numLock) {
741 rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start,
742 0, numLock, lockType, wait_flag);
743
744 if (rc == 0) {
745 /* For Windows locks we must store them. */
746 rc = store_file_lock(fid, length,
747 pfLock->fl_start, lockType);
748 }
749 } else if (numUnlock) {
750 /* For each stored lock that this unlock overlaps
751 completely, unlock it. */
752 int stored_rc = 0;
753 struct cifsLockInfo *li, *tmp;
754
755 down(&fid->lock_sem);
756 list_for_each_entry_safe(li, tmp, &fid->llist, llist) {
757 if (pfLock->fl_start <= li->offset &&
758 length >= li->length) {
759 stored_rc = CIFSSMBLock(xid, pTcon, netfid,
760 li->length, li->offset,
761 1, 0, li->type, FALSE);
762 if (stored_rc)
763 rc = stored_rc;
764
765 list_del(&li->llist);
766 kfree(li);
767 }
768 }
769 up(&fid->lock_sem);
770 }
771 }
772
710 if (pfLock->fl_flags & FL_POSIX) 773 if (pfLock->fl_flags & FL_POSIX)
711 posix_lock_file_wait(file, pfLock); 774 posix_lock_file_wait(file, pfLock);
712 FreeXid(xid); 775 FreeXid(xid);
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index b66eff5dc624..ce87550e918f 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -72,6 +72,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
72 {ERRinvlevel,-EOPNOTSUPP}, 72 {ERRinvlevel,-EOPNOTSUPP},
73 {ERRdirnotempty, -ENOTEMPTY}, 73 {ERRdirnotempty, -ENOTEMPTY},
74 {ERRnotlocked, -ENOLCK}, 74 {ERRnotlocked, -ENOLCK},
75 {ERRcancelviolation, -ENOLCK},
75 {ERRalreadyexists, -EEXIST}, 76 {ERRalreadyexists, -EEXIST},
76 {ERRmoredata, -EOVERFLOW}, 77 {ERRmoredata, -EOVERFLOW},
77 {ERReasnotsupported,-EOPNOTSUPP}, 78 {ERReasnotsupported,-EOPNOTSUPP},
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 03bbcb377913..9aeb58a7d369 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -82,7 +82,6 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
82 if(*ptmp_inode == NULL) 82 if(*ptmp_inode == NULL)
83 return rc; 83 return rc;
84 rc = 1; 84 rc = 1;
85 d_instantiate(tmp_dentry, *ptmp_inode);
86 } 85 }
87 } else { 86 } else {
88 tmp_dentry = d_alloc(file->f_dentry, qstring); 87 tmp_dentry = d_alloc(file->f_dentry, qstring);
@@ -99,9 +98,7 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
99 tmp_dentry->d_op = &cifs_dentry_ops; 98 tmp_dentry->d_op = &cifs_dentry_ops;
100 if(*ptmp_inode == NULL) 99 if(*ptmp_inode == NULL)
101 return rc; 100 return rc;
102 rc = 1; 101 rc = 2;
103 d_instantiate(tmp_dentry, *ptmp_inode);
104 d_rehash(tmp_dentry);
105 } 102 }
106 103
107 tmp_dentry->d_time = jiffies; 104 tmp_dentry->d_time = jiffies;
@@ -556,7 +553,7 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
556 FIND_FILE_STANDARD_INFO * pFindData = 553 FIND_FILE_STANDARD_INFO * pFindData =
557 (FIND_FILE_STANDARD_INFO *)current_entry; 554 (FIND_FILE_STANDARD_INFO *)current_entry;
558 filename = &pFindData->FileName[0]; 555 filename = &pFindData->FileName[0];
559 len = le32_to_cpu(pFindData->FileNameLength); 556 len = pFindData->FileNameLength;
560 } else { 557 } else {
561 cFYI(1,("Unknown findfirst level %d",cfile->srch_inf.info_level)); 558 cFYI(1,("Unknown findfirst level %d",cfile->srch_inf.info_level));
562 } 559 }
@@ -870,6 +867,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
870 pfindEntry, &obj_type, rc); 867 pfindEntry, &obj_type, rc);
871 else 868 else
872 fill_in_inode(tmp_inode, 1 /* NT */, pfindEntry, &obj_type, rc); 869 fill_in_inode(tmp_inode, 1 /* NT */, pfindEntry, &obj_type, rc);
870
871 if(rc) /* new inode - needs to be tied to dentry */ {
872 d_instantiate(tmp_dentry, tmp_inode);
873 if(rc == 2)
874 d_rehash(tmp_dentry);
875 }
873 876
874 877
875 rc = filldir(direntry,qstring.name,qstring.len,file->f_pos, 878 rc = filldir(direntry,qstring.name,qstring.len,file->f_pos,
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7202d534ef0b..d1705ab8136e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -372,7 +372,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
372 372
373 /* no capabilities flags in old lanman negotiation */ 373 /* no capabilities flags in old lanman negotiation */
374 374
375 pSMB->old_req.PasswordLength = CIFS_SESS_KEY_SIZE; 375 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
376 /* BB calculate hash with password */ 376 /* BB calculate hash with password */
377 /* and copy into bcc */ 377 /* and copy into bcc */
378 378
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index cd41c67ff8d3..212c3c296409 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -95,6 +95,7 @@
95#define ERRinvlevel 124 95#define ERRinvlevel 124
96#define ERRdirnotempty 145 96#define ERRdirnotempty 145
97#define ERRnotlocked 158 97#define ERRnotlocked 158
98#define ERRcancelviolation 173
98#define ERRalreadyexists 183 99#define ERRalreadyexists 183
99#define ERRbadpipe 230 100#define ERRbadpipe 230
100#define ERRpipebusy 231 101#define ERRpipebusy 231
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 17ba329e2b3d..48d47b46b1fb 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -3,7 +3,8 @@
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2005 4 * Copyright (C) International Business Machines Corp., 2002,2005
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 * Jeremy Allison (jra@samba.org) 2006.
7 *
7 * This library is free software; you can redistribute it and/or modify 8 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published 9 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or 10 * by the Free Software Foundation; either version 2.1 of the License, or
@@ -36,7 +37,7 @@ extern mempool_t *cifs_mid_poolp;
36extern kmem_cache_t *cifs_oplock_cachep; 37extern kmem_cache_t *cifs_oplock_cachep;
37 38
38static struct mid_q_entry * 39static struct mid_q_entry *
39AllocMidQEntry(struct smb_hdr *smb_buffer, struct cifsSesInfo *ses) 40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
40{ 41{
41 struct mid_q_entry *temp; 42 struct mid_q_entry *temp;
42 43
@@ -203,6 +204,10 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
203 rc = 0; 204 rc = 0;
204 } 205 }
205 206
207 /* Don't want to modify the buffer as a
208 side effect of this call. */
209 smb_buffer->smb_buf_length = smb_buf_length;
210
206 return rc; 211 return rc;
207} 212}
208 213
@@ -217,6 +222,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
217 unsigned int len = iov[0].iov_len; 222 unsigned int len = iov[0].iov_len;
218 unsigned int total_len; 223 unsigned int total_len;
219 int first_vec = 0; 224 int first_vec = 0;
225 unsigned int smb_buf_length = smb_buffer->smb_buf_length;
220 226
221 if(ssocket == NULL) 227 if(ssocket == NULL)
222 return -ENOTSOCK; /* BB eventually add reconnect code here */ 228 return -ENOTSOCK; /* BB eventually add reconnect code here */
@@ -293,36 +299,15 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
293 } else 299 } else
294 rc = 0; 300 rc = 0;
295 301
302 /* Don't want to modify the buffer as a
303 side effect of this call. */
304 smb_buffer->smb_buf_length = smb_buf_length;
305
296 return rc; 306 return rc;
297} 307}
298 308
299int 309static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
300SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
301 struct kvec *iov, int n_vec, int * pRespBufType /* ret */,
302 const int long_op)
303{ 310{
304 int rc = 0;
305 unsigned int receive_len;
306 unsigned long timeout;
307 struct mid_q_entry *midQ;
308 struct smb_hdr *in_buf = iov[0].iov_base;
309
310 *pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */
311
312 if ((ses == NULL) || (ses->server == NULL)) {
313 cifs_small_buf_release(in_buf);
314 cERROR(1,("Null session"));
315 return -EIO;
316 }
317
318 if(ses->server->tcpStatus == CifsExiting) {
319 cifs_small_buf_release(in_buf);
320 return -ENOENT;
321 }
322
323 /* Ensure that we do not send more than 50 overlapping requests
324 to the same server. We may make this configurable later or
325 use ses->maxReq */
326 if(long_op == -1) { 311 if(long_op == -1) {
327 /* oplock breaks must not be held up */ 312 /* oplock breaks must not be held up */
328 atomic_inc(&ses->server->inFlight); 313 atomic_inc(&ses->server->inFlight);
@@ -345,53 +330,140 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
345 } else { 330 } else {
346 if(ses->server->tcpStatus == CifsExiting) { 331 if(ses->server->tcpStatus == CifsExiting) {
347 spin_unlock(&GlobalMid_Lock); 332 spin_unlock(&GlobalMid_Lock);
348 cifs_small_buf_release(in_buf);
349 return -ENOENT; 333 return -ENOENT;
350 } 334 }
351 335
352 /* can not count locking commands against total since 336 /* can not count locking commands against total since
353 they are allowed to block on server */ 337 they are allowed to block on server */
354 338
355 if(long_op < 3) {
356 /* update # of requests on the wire to server */ 339 /* update # of requests on the wire to server */
340 if (long_op < 3)
357 atomic_inc(&ses->server->inFlight); 341 atomic_inc(&ses->server->inFlight);
358 }
359 spin_unlock(&GlobalMid_Lock); 342 spin_unlock(&GlobalMid_Lock);
360 break; 343 break;
361 } 344 }
362 } 345 }
363 } 346 }
364 /* make sure that we sign in the same order that we send on this socket 347 return 0;
365 and avoid races inside tcp sendmsg code that could cause corruption 348}
366 of smb data */
367
368 down(&ses->server->tcpSem);
369 349
350static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
351 struct mid_q_entry **ppmidQ)
352{
370 if (ses->server->tcpStatus == CifsExiting) { 353 if (ses->server->tcpStatus == CifsExiting) {
371 rc = -ENOENT; 354 return -ENOENT;
372 goto out_unlock2;
373 } else if (ses->server->tcpStatus == CifsNeedReconnect) { 355 } else if (ses->server->tcpStatus == CifsNeedReconnect) {
374 cFYI(1,("tcp session dead - return to caller to retry")); 356 cFYI(1,("tcp session dead - return to caller to retry"));
375 rc = -EAGAIN; 357 return -EAGAIN;
376 goto out_unlock2;
377 } else if (ses->status != CifsGood) { 358 } else if (ses->status != CifsGood) {
378 /* check if SMB session is bad because we are setting it up */ 359 /* check if SMB session is bad because we are setting it up */
379 if((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && 360 if((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
380 (in_buf->Command != SMB_COM_NEGOTIATE)) { 361 (in_buf->Command != SMB_COM_NEGOTIATE)) {
381 rc = -EAGAIN; 362 return -EAGAIN;
382 goto out_unlock2;
383 } /* else ok - we are setting up session */ 363 } /* else ok - we are setting up session */
384 } 364 }
385 midQ = AllocMidQEntry(in_buf, ses); 365 *ppmidQ = AllocMidQEntry(in_buf, ses);
386 if (midQ == NULL) { 366 if (*ppmidQ == NULL) {
367 return -ENOMEM;
368 }
369 return 0;
370}
371
372static int wait_for_response(struct cifsSesInfo *ses,
373 struct mid_q_entry *midQ,
374 unsigned long timeout,
375 unsigned long time_to_wait)
376{
377 unsigned long curr_timeout;
378
379 for (;;) {
380 curr_timeout = timeout + jiffies;
381 wait_event(ses->server->response_q,
382 (!(midQ->midState == MID_REQUEST_SUBMITTED)) ||
383 time_after(jiffies, curr_timeout) ||
384 ((ses->server->tcpStatus != CifsGood) &&
385 (ses->server->tcpStatus != CifsNew)));
386
387 if (time_after(jiffies, curr_timeout) &&
388 (midQ->midState == MID_REQUEST_SUBMITTED) &&
389 ((ses->server->tcpStatus == CifsGood) ||
390 (ses->server->tcpStatus == CifsNew))) {
391
392 unsigned long lrt;
393
394 /* We timed out. Is the server still
395 sending replies ? */
396 spin_lock(&GlobalMid_Lock);
397 lrt = ses->server->lstrp;
398 spin_unlock(&GlobalMid_Lock);
399
400 /* Calculate time_to_wait past last receive time.
401 Although we prefer not to time out if the
402 server is still responding - we will time
403 out if the server takes more than 15 (or 45
404 or 180) seconds to respond to this request
405 and has not responded to any request from
406 other threads on the client within 10 seconds */
407 lrt += time_to_wait;
408 if (time_after(jiffies, lrt)) {
409 /* No replies for time_to_wait. */
410 cERROR(1,("server not responding"));
411 return -1;
412 }
413 } else {
414 return 0;
415 }
416 }
417}
418
419int
420SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
421 struct kvec *iov, int n_vec, int * pRespBufType /* ret */,
422 const int long_op)
423{
424 int rc = 0;
425 unsigned int receive_len;
426 unsigned long timeout;
427 struct mid_q_entry *midQ;
428 struct smb_hdr *in_buf = iov[0].iov_base;
429
430 *pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */
431
432 if ((ses == NULL) || (ses->server == NULL)) {
433 cifs_small_buf_release(in_buf);
434 cERROR(1,("Null session"));
435 return -EIO;
436 }
437
438 if(ses->server->tcpStatus == CifsExiting) {
439 cifs_small_buf_release(in_buf);
440 return -ENOENT;
441 }
442
443 /* Ensure that we do not send more than 50 overlapping requests
444 to the same server. We may make this configurable later or
445 use ses->maxReq */
446
447 rc = wait_for_free_request(ses, long_op);
448 if (rc) {
449 cifs_small_buf_release(in_buf);
450 return rc;
451 }
452
453 /* make sure that we sign in the same order that we send on this socket
454 and avoid races inside tcp sendmsg code that could cause corruption
455 of smb data */
456
457 down(&ses->server->tcpSem);
458
459 rc = allocate_mid(ses, in_buf, &midQ);
460 if (rc) {
387 up(&ses->server->tcpSem); 461 up(&ses->server->tcpSem);
388 cifs_small_buf_release(in_buf); 462 cifs_small_buf_release(in_buf);
389 /* If not lock req, update # of requests on wire to server */ 463 /* Update # of requests on wire to server */
390 if(long_op < 3) { 464 atomic_dec(&ses->server->inFlight);
391 atomic_dec(&ses->server->inFlight); 465 wake_up(&ses->server->request_q);
392 wake_up(&ses->server->request_q); 466 return rc;
393 }
394 return -ENOMEM;
395 } 467 }
396 468
397 rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number); 469 rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number);
@@ -406,32 +478,23 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
406 atomic_dec(&ses->server->inSend); 478 atomic_dec(&ses->server->inSend);
407 midQ->when_sent = jiffies; 479 midQ->when_sent = jiffies;
408#endif 480#endif
409 if(rc < 0) { 481
410 DeleteMidQEntry(midQ); 482 up(&ses->server->tcpSem);
411 up(&ses->server->tcpSem); 483 cifs_small_buf_release(in_buf);
412 cifs_small_buf_release(in_buf); 484
413 /* If not lock req, update # of requests on wire to server */ 485 if(rc < 0)
414 if(long_op < 3) { 486 goto out;
415 atomic_dec(&ses->server->inFlight);
416 wake_up(&ses->server->request_q);
417 }
418 return rc;
419 } else {
420 up(&ses->server->tcpSem);
421 cifs_small_buf_release(in_buf);
422 }
423 487
424 if (long_op == -1) 488 if (long_op == -1)
425 goto cifs_no_response_exit2; 489 goto out;
426 else if (long_op == 2) /* writes past end of file can take loong time */ 490 else if (long_op == 2) /* writes past end of file can take loong time */
427 timeout = 180 * HZ; 491 timeout = 180 * HZ;
428 else if (long_op == 1) 492 else if (long_op == 1)
429 timeout = 45 * HZ; /* should be greater than 493 timeout = 45 * HZ; /* should be greater than
430 servers oplock break timeout (about 43 seconds) */ 494 servers oplock break timeout (about 43 seconds) */
431 else if (long_op > 2) { 495 else
432 timeout = MAX_SCHEDULE_TIMEOUT;
433 } else
434 timeout = 15 * HZ; 496 timeout = 15 * HZ;
497
435 /* wait for 15 seconds or until woken up due to response arriving or 498 /* wait for 15 seconds or until woken up due to response arriving or
436 due to last connection to this server being unmounted */ 499 due to last connection to this server being unmounted */
437 if (signal_pending(current)) { 500 if (signal_pending(current)) {
@@ -441,19 +504,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
441 } 504 }
442 505
443 /* No user interrupts in wait - wreaks havoc with performance */ 506 /* No user interrupts in wait - wreaks havoc with performance */
444 if(timeout != MAX_SCHEDULE_TIMEOUT) { 507 wait_for_response(ses, midQ, timeout, 10 * HZ);
445 timeout += jiffies;
446 wait_event(ses->server->response_q,
447 (!(midQ->midState & MID_REQUEST_SUBMITTED)) ||
448 time_after(jiffies, timeout) ||
449 ((ses->server->tcpStatus != CifsGood) &&
450 (ses->server->tcpStatus != CifsNew)));
451 } else {
452 wait_event(ses->server->response_q,
453 (!(midQ->midState & MID_REQUEST_SUBMITTED)) ||
454 ((ses->server->tcpStatus != CifsGood) &&
455 (ses->server->tcpStatus != CifsNew)));
456 }
457 508
458 spin_lock(&GlobalMid_Lock); 509 spin_lock(&GlobalMid_Lock);
459 if (midQ->resp_buf) { 510 if (midQ->resp_buf) {
@@ -481,11 +532,9 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
481 } 532 }
482 spin_unlock(&GlobalMid_Lock); 533 spin_unlock(&GlobalMid_Lock);
483 DeleteMidQEntry(midQ); 534 DeleteMidQEntry(midQ);
484 /* If not lock req, update # of requests on wire to server */ 535 /* Update # of requests on wire to server */
485 if(long_op < 3) { 536 atomic_dec(&ses->server->inFlight);
486 atomic_dec(&ses->server->inFlight); 537 wake_up(&ses->server->request_q);
487 wake_up(&ses->server->request_q);
488 }
489 return rc; 538 return rc;
490 } 539 }
491 540
@@ -536,24 +585,12 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
536 cFYI(1,("Bad MID state?")); 585 cFYI(1,("Bad MID state?"));
537 } 586 }
538 } 587 }
539cifs_no_response_exit2:
540 DeleteMidQEntry(midQ);
541
542 if(long_op < 3) {
543 atomic_dec(&ses->server->inFlight);
544 wake_up(&ses->server->request_q);
545 }
546 588
547 return rc; 589out:
548 590
549out_unlock2: 591 DeleteMidQEntry(midQ);
550 up(&ses->server->tcpSem); 592 atomic_dec(&ses->server->inFlight);
551 cifs_small_buf_release(in_buf); 593 wake_up(&ses->server->request_q);
552 /* If not lock req, update # of requests on wire to server */
553 if(long_op < 3) {
554 atomic_dec(&ses->server->inFlight);
555 wake_up(&ses->server->request_q);
556 }
557 594
558 return rc; 595 return rc;
559} 596}
@@ -583,85 +620,34 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
583 /* Ensure that we do not send more than 50 overlapping requests 620 /* Ensure that we do not send more than 50 overlapping requests
584 to the same server. We may make this configurable later or 621 to the same server. We may make this configurable later or
585 use ses->maxReq */ 622 use ses->maxReq */
586 if(long_op == -1) {
587 /* oplock breaks must not be held up */
588 atomic_inc(&ses->server->inFlight);
589 } else {
590 spin_lock(&GlobalMid_Lock);
591 while(1) {
592 if(atomic_read(&ses->server->inFlight) >=
593 cifs_max_pending){
594 spin_unlock(&GlobalMid_Lock);
595#ifdef CONFIG_CIFS_STATS2
596 atomic_inc(&ses->server->num_waiters);
597#endif
598 wait_event(ses->server->request_q,
599 atomic_read(&ses->server->inFlight)
600 < cifs_max_pending);
601#ifdef CONFIG_CIFS_STATS2
602 atomic_dec(&ses->server->num_waiters);
603#endif
604 spin_lock(&GlobalMid_Lock);
605 } else {
606 if(ses->server->tcpStatus == CifsExiting) {
607 spin_unlock(&GlobalMid_Lock);
608 return -ENOENT;
609 }
610 623
611 /* can not count locking commands against total since 624 rc = wait_for_free_request(ses, long_op);
612 they are allowed to block on server */ 625 if (rc)
613 626 return rc;
614 if(long_op < 3) { 627
615 /* update # of requests on the wire to server */
616 atomic_inc(&ses->server->inFlight);
617 }
618 spin_unlock(&GlobalMid_Lock);
619 break;
620 }
621 }
622 }
623 /* make sure that we sign in the same order that we send on this socket 628 /* make sure that we sign in the same order that we send on this socket
624 and avoid races inside tcp sendmsg code that could cause corruption 629 and avoid races inside tcp sendmsg code that could cause corruption
625 of smb data */ 630 of smb data */
626 631
627 down(&ses->server->tcpSem); 632 down(&ses->server->tcpSem);
628 633
629 if (ses->server->tcpStatus == CifsExiting) { 634 rc = allocate_mid(ses, in_buf, &midQ);
630 rc = -ENOENT; 635 if (rc) {
631 goto out_unlock;
632 } else if (ses->server->tcpStatus == CifsNeedReconnect) {
633 cFYI(1,("tcp session dead - return to caller to retry"));
634 rc = -EAGAIN;
635 goto out_unlock;
636 } else if (ses->status != CifsGood) {
637 /* check if SMB session is bad because we are setting it up */
638 if((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
639 (in_buf->Command != SMB_COM_NEGOTIATE)) {
640 rc = -EAGAIN;
641 goto out_unlock;
642 } /* else ok - we are setting up session */
643 }
644 midQ = AllocMidQEntry(in_buf, ses);
645 if (midQ == NULL) {
646 up(&ses->server->tcpSem); 636 up(&ses->server->tcpSem);
647 /* If not lock req, update # of requests on wire to server */ 637 /* Update # of requests on wire to server */
648 if(long_op < 3) { 638 atomic_dec(&ses->server->inFlight);
649 atomic_dec(&ses->server->inFlight); 639 wake_up(&ses->server->request_q);
650 wake_up(&ses->server->request_q); 640 return rc;
651 }
652 return -ENOMEM;
653 } 641 }
654 642
655 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 643 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
656 up(&ses->server->tcpSem);
657 cERROR(1, ("Illegal length, greater than maximum frame, %d", 644 cERROR(1, ("Illegal length, greater than maximum frame, %d",
658 in_buf->smb_buf_length)); 645 in_buf->smb_buf_length));
659 DeleteMidQEntry(midQ); 646 DeleteMidQEntry(midQ);
660 /* If not lock req, update # of requests on wire to server */ 647 up(&ses->server->tcpSem);
661 if(long_op < 3) { 648 /* Update # of requests on wire to server */
662 atomic_dec(&ses->server->inFlight); 649 atomic_dec(&ses->server->inFlight);
663 wake_up(&ses->server->request_q); 650 wake_up(&ses->server->request_q);
664 }
665 return -EIO; 651 return -EIO;
666 } 652 }
667 653
@@ -677,27 +663,19 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
677 atomic_dec(&ses->server->inSend); 663 atomic_dec(&ses->server->inSend);
678 midQ->when_sent = jiffies; 664 midQ->when_sent = jiffies;
679#endif 665#endif
680 if(rc < 0) { 666 up(&ses->server->tcpSem);
681 DeleteMidQEntry(midQ); 667
682 up(&ses->server->tcpSem); 668 if(rc < 0)
683 /* If not lock req, update # of requests on wire to server */ 669 goto out;
684 if(long_op < 3) { 670
685 atomic_dec(&ses->server->inFlight);
686 wake_up(&ses->server->request_q);
687 }
688 return rc;
689 } else
690 up(&ses->server->tcpSem);
691 if (long_op == -1) 671 if (long_op == -1)
692 goto cifs_no_response_exit; 672 goto out;
693 else if (long_op == 2) /* writes past end of file can take loong time */ 673 else if (long_op == 2) /* writes past end of file can take loong time */
694 timeout = 180 * HZ; 674 timeout = 180 * HZ;
695 else if (long_op == 1) 675 else if (long_op == 1)
696 timeout = 45 * HZ; /* should be greater than 676 timeout = 45 * HZ; /* should be greater than
697 servers oplock break timeout (about 43 seconds) */ 677 servers oplock break timeout (about 43 seconds) */
698 else if (long_op > 2) { 678 else
699 timeout = MAX_SCHEDULE_TIMEOUT;
700 } else
701 timeout = 15 * HZ; 679 timeout = 15 * HZ;
702 /* wait for 15 seconds or until woken up due to response arriving or 680 /* wait for 15 seconds or until woken up due to response arriving or
703 due to last connection to this server being unmounted */ 681 due to last connection to this server being unmounted */
@@ -708,19 +686,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
708 } 686 }
709 687
710 /* No user interrupts in wait - wreaks havoc with performance */ 688 /* No user interrupts in wait - wreaks havoc with performance */
711 if(timeout != MAX_SCHEDULE_TIMEOUT) { 689 wait_for_response(ses, midQ, timeout, 10 * HZ);
712 timeout += jiffies;
713 wait_event(ses->server->response_q,
714 (!(midQ->midState & MID_REQUEST_SUBMITTED)) ||
715 time_after(jiffies, timeout) ||
716 ((ses->server->tcpStatus != CifsGood) &&
717 (ses->server->tcpStatus != CifsNew)));
718 } else {
719 wait_event(ses->server->response_q,
720 (!(midQ->midState & MID_REQUEST_SUBMITTED)) ||
721 ((ses->server->tcpStatus != CifsGood) &&
722 (ses->server->tcpStatus != CifsNew)));
723 }
724 690
725 spin_lock(&GlobalMid_Lock); 691 spin_lock(&GlobalMid_Lock);
726 if (midQ->resp_buf) { 692 if (midQ->resp_buf) {
@@ -748,11 +714,9 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
748 } 714 }
749 spin_unlock(&GlobalMid_Lock); 715 spin_unlock(&GlobalMid_Lock);
750 DeleteMidQEntry(midQ); 716 DeleteMidQEntry(midQ);
751 /* If not lock req, update # of requests on wire to server */ 717 /* Update # of requests on wire to server */
752 if(long_op < 3) { 718 atomic_dec(&ses->server->inFlight);
753 atomic_dec(&ses->server->inFlight); 719 wake_up(&ses->server->request_q);
754 wake_up(&ses->server->request_q);
755 }
756 return rc; 720 return rc;
757 } 721 }
758 722
@@ -799,23 +763,253 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
799 cERROR(1,("Bad MID state?")); 763 cERROR(1,("Bad MID state?"));
800 } 764 }
801 } 765 }
802cifs_no_response_exit: 766
767out:
768
803 DeleteMidQEntry(midQ); 769 DeleteMidQEntry(midQ);
770 atomic_dec(&ses->server->inFlight);
771 wake_up(&ses->server->request_q);
804 772
805 if(long_op < 3) { 773 return rc;
806 atomic_dec(&ses->server->inFlight); 774}
807 wake_up(&ses->server->request_q); 775
808 } 776/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */
777
778static int
779send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
780 struct mid_q_entry *midQ)
781{
782 int rc = 0;
783 struct cifsSesInfo *ses = tcon->ses;
784 __u16 mid = in_buf->Mid;
809 785
786 header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
787 in_buf->Mid = mid;
788 down(&ses->server->tcpSem);
789 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
790 if (rc) {
791 up(&ses->server->tcpSem);
792 return rc;
793 }
794 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
795 (struct sockaddr *) &(ses->server->addr.sockAddr));
796 up(&ses->server->tcpSem);
810 return rc; 797 return rc;
798}
799
800/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
801 blocking lock to return. */
802
803static int
804send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
805 struct smb_hdr *in_buf,
806 struct smb_hdr *out_buf)
807{
808 int bytes_returned;
809 struct cifsSesInfo *ses = tcon->ses;
810 LOCK_REQ *pSMB = (LOCK_REQ *)in_buf;
811
812 /* We just modify the current in_buf to change
813 the type of lock from LOCKING_ANDX_SHARED_LOCK
814 or LOCKING_ANDX_EXCLUSIVE_LOCK to
815 LOCKING_ANDX_CANCEL_LOCK. */
816
817 pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES;
818 pSMB->Timeout = 0;
819 pSMB->hdr.Mid = GetNextMid(ses->server);
820
821 return SendReceive(xid, ses, in_buf, out_buf,
822 &bytes_returned, 0);
823}
811 824
812out_unlock: 825int
826SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
827 struct smb_hdr *in_buf, struct smb_hdr *out_buf,
828 int *pbytes_returned)
829{
830 int rc = 0;
831 int rstart = 0;
832 unsigned int receive_len;
833 struct mid_q_entry *midQ;
834 struct cifsSesInfo *ses;
835
836 if (tcon == NULL || tcon->ses == NULL) {
837 cERROR(1,("Null smb session"));
838 return -EIO;
839 }
840 ses = tcon->ses;
841
842 if(ses->server == NULL) {
843 cERROR(1,("Null tcp session"));
844 return -EIO;
845 }
846
847 if(ses->server->tcpStatus == CifsExiting)
848 return -ENOENT;
849
850 /* Ensure that we do not send more than 50 overlapping requests
851 to the same server. We may make this configurable later or
852 use ses->maxReq */
853
854 rc = wait_for_free_request(ses, 3);
855 if (rc)
856 return rc;
857
858 /* make sure that we sign in the same order that we send on this socket
859 and avoid races inside tcp sendmsg code that could cause corruption
860 of smb data */
861
862 down(&ses->server->tcpSem);
863
864 rc = allocate_mid(ses, in_buf, &midQ);
865 if (rc) {
866 up(&ses->server->tcpSem);
867 return rc;
868 }
869
870 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
871 up(&ses->server->tcpSem);
872 cERROR(1, ("Illegal length, greater than maximum frame, %d",
873 in_buf->smb_buf_length));
874 DeleteMidQEntry(midQ);
875 return -EIO;
876 }
877
878 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
879
880 midQ->midState = MID_REQUEST_SUBMITTED;
881#ifdef CONFIG_CIFS_STATS2
882 atomic_inc(&ses->server->inSend);
883#endif
884 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
885 (struct sockaddr *) &(ses->server->addr.sockAddr));
886#ifdef CONFIG_CIFS_STATS2
887 atomic_dec(&ses->server->inSend);
888 midQ->when_sent = jiffies;
889#endif
813 up(&ses->server->tcpSem); 890 up(&ses->server->tcpSem);
814 /* If not lock req, update # of requests on wire to server */ 891
815 if(long_op < 3) { 892 if(rc < 0) {
816 atomic_dec(&ses->server->inFlight); 893 DeleteMidQEntry(midQ);
817 wake_up(&ses->server->request_q); 894 return rc;
895 }
896
897 /* Wait for a reply - allow signals to interrupt. */
898 rc = wait_event_interruptible(ses->server->response_q,
899 (!(midQ->midState == MID_REQUEST_SUBMITTED)) ||
900 ((ses->server->tcpStatus != CifsGood) &&
901 (ses->server->tcpStatus != CifsNew)));
902
903 /* Were we interrupted by a signal ? */
904 if ((rc == -ERESTARTSYS) &&
905 (midQ->midState == MID_REQUEST_SUBMITTED) &&
906 ((ses->server->tcpStatus == CifsGood) ||
907 (ses->server->tcpStatus == CifsNew))) {
908
909 if (in_buf->Command == SMB_COM_TRANSACTION2) {
910 /* POSIX lock. We send a NT_CANCEL SMB to cause the
911 blocking lock to return. */
912
913 rc = send_nt_cancel(tcon, in_buf, midQ);
914 if (rc) {
915 DeleteMidQEntry(midQ);
916 return rc;
917 }
918 } else {
919 /* Windows lock. We send a LOCKINGX_CANCEL_LOCK
920 to cause the blocking lock to return. */
921
922 rc = send_lock_cancel(xid, tcon, in_buf, out_buf);
923
924 /* If we get -ENOLCK back the lock may have
925 already been removed. Don't exit in this case. */
926 if (rc && rc != -ENOLCK) {
927 DeleteMidQEntry(midQ);
928 return rc;
929 }
930 }
931
932 /* Wait 5 seconds for the response. */
933 if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ)==0) {
934 /* We got the response - restart system call. */
935 rstart = 1;
936 }
937 }
938
939 spin_lock(&GlobalMid_Lock);
940 if (midQ->resp_buf) {
941 spin_unlock(&GlobalMid_Lock);
942 receive_len = midQ->resp_buf->smb_buf_length;
943 } else {
944 cERROR(1,("No response for cmd %d mid %d",
945 midQ->command, midQ->mid));
946 if(midQ->midState == MID_REQUEST_SUBMITTED) {
947 if(ses->server->tcpStatus == CifsExiting)
948 rc = -EHOSTDOWN;
949 else {
950 ses->server->tcpStatus = CifsNeedReconnect;
951 midQ->midState = MID_RETRY_NEEDED;
952 }
953 }
954
955 if (rc != -EHOSTDOWN) {
956 if(midQ->midState == MID_RETRY_NEEDED) {
957 rc = -EAGAIN;
958 cFYI(1,("marking request for retry"));
959 } else {
960 rc = -EIO;
961 }
962 }
963 spin_unlock(&GlobalMid_Lock);
964 DeleteMidQEntry(midQ);
965 return rc;
818 } 966 }
967
968 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
969 cERROR(1, ("Frame too large received. Length: %d Xid: %d",
970 receive_len, xid));
971 rc = -EIO;
972 } else { /* rcvd frame is ok */
973
974 if (midQ->resp_buf && out_buf
975 && (midQ->midState == MID_RESPONSE_RECEIVED)) {
976 out_buf->smb_buf_length = receive_len;
977 memcpy((char *)out_buf + 4,
978 (char *)midQ->resp_buf + 4,
979 receive_len);
980
981 dump_smb(out_buf, 92);
982 /* convert the length into a more usable form */
983 if((receive_len > 24) &&
984 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
985 SECMODE_SIGN_ENABLED))) {
986 rc = cifs_verify_signature(out_buf,
987 ses->server->mac_signing_key,
988 midQ->sequence_number+1);
989 if(rc) {
990 cERROR(1,("Unexpected SMB signature"));
991 /* BB FIXME add code to kill session */
992 }
993 }
994
995 *pbytes_returned = out_buf->smb_buf_length;
996
997 /* BB special case reconnect tid and uid here? */
998 rc = map_smb_to_linux_error(out_buf);
819 999
1000 /* convert ByteCount if necessary */
1001 if (receive_len >=
1002 sizeof (struct smb_hdr) -
1003 4 /* do not count RFC1001 header */ +
1004 (2 * out_buf->WordCount) + 2 /* bcc */ )
1005 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
1006 } else {
1007 rc = -EIO;
1008 cERROR(1,("Bad MID state?"));
1009 }
1010 }
1011 DeleteMidQEntry(midQ);
1012 if (rstart && rc == -EACCES)
1013 return -ERESTARTSYS;
820 return rc; 1014 return rc;
821} 1015}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 7754d641775e..067648b7179b 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -330,11 +330,15 @@ ssize_t cifs_listxattr(struct dentry * direntry, char * data, size_t buf_size)
330 sb = direntry->d_inode->i_sb; 330 sb = direntry->d_inode->i_sb;
331 if(sb == NULL) 331 if(sb == NULL)
332 return -EIO; 332 return -EIO;
333 xid = GetXid();
334 333
335 cifs_sb = CIFS_SB(sb); 334 cifs_sb = CIFS_SB(sb);
336 pTcon = cifs_sb->tcon; 335 pTcon = cifs_sb->tcon;
337 336
337 if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
338 return -EOPNOTSUPP;
339
340 xid = GetXid();
341
338 full_path = build_path_from_dentry(direntry); 342 full_path = build_path_from_dentry(direntry);
339 if(full_path == NULL) { 343 if(full_path == NULL) {
340 FreeXid(xid); 344 FreeXid(xid);
diff --git a/fs/coda/file.c b/fs/coda/file.c
index cc66c681bd11..dbfbcfa5b3c0 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -136,10 +136,8 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
136 coda_vfs_stat.open++; 136 coda_vfs_stat.open++;
137 137
138 cfi = kmalloc(sizeof(struct coda_file_info), GFP_KERNEL); 138 cfi = kmalloc(sizeof(struct coda_file_info), GFP_KERNEL);
139 if (!cfi) { 139 if (!cfi)
140 unlock_kernel();
141 return -ENOMEM; 140 return -ENOMEM;
142 }
143 141
144 lock_kernel(); 142 lock_kernel();
145 143
diff --git a/fs/dcache.c b/fs/dcache.c
index c6e3535be192..1b4a3a34ec57 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -38,7 +38,7 @@ int sysctl_vfs_cache_pressure __read_mostly = 100;
38EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); 38EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
39 39
40 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); 40 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
41static seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; 41static __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
42 42
43EXPORT_SYMBOL(dcache_lock); 43EXPORT_SYMBOL(dcache_lock);
44 44
@@ -1339,10 +1339,10 @@ void d_move(struct dentry * dentry, struct dentry * target)
1339 */ 1339 */
1340 if (target < dentry) { 1340 if (target < dentry) {
1341 spin_lock(&target->d_lock); 1341 spin_lock(&target->d_lock);
1342 spin_lock(&dentry->d_lock); 1342 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1343 } else { 1343 } else {
1344 spin_lock(&dentry->d_lock); 1344 spin_lock(&dentry->d_lock);
1345 spin_lock(&target->d_lock); 1345 spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
1346 } 1346 }
1347 1347
1348 /* Move the dentry to the target hash queue, if on different bucket */ 1348 /* Move the dentry to the target hash queue, if on different bucket */
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 538fb0418fba..5981e17f46f0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -220,7 +220,8 @@ static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
220 if (dio->end_io && dio->result) 220 if (dio->end_io && dio->result)
221 dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private); 221 dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
222 if (dio->lock_type == DIO_LOCKING) 222 if (dio->lock_type == DIO_LOCKING)
223 up_read(&dio->inode->i_alloc_sem); 223 /* lockdep: non-owner release */
224 up_read_non_owner(&dio->inode->i_alloc_sem);
224} 225}
225 226
226/* 227/*
@@ -1261,7 +1262,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1261 } 1262 }
1262 1263
1263 if (dio_lock_type == DIO_LOCKING) 1264 if (dio_lock_type == DIO_LOCKING)
1264 down_read(&inode->i_alloc_sem); 1265 /* lockdep: not the owner will release it */
1266 down_read_non_owner(&inode->i_alloc_sem);
1265 } 1267 }
1266 1268
1267 /* 1269 /*
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index e249cf733a6b..1d30d2ff440f 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -22,7 +22,7 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
22 22
23 err = -ENAMETOOLONG; 23 err = -ENAMETOOLONG;
24 if (size > 2 * EFS_BLOCKSIZE) 24 if (size > 2 * EFS_BLOCKSIZE)
25 goto fail; 25 goto fail_notlocked;
26 26
27 lock_kernel(); 27 lock_kernel();
28 /* read first 512 bytes of link target */ 28 /* read first 512 bytes of link target */
@@ -47,6 +47,7 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
47 return 0; 47 return 0;
48fail: 48fail:
49 unlock_kernel(); 49 unlock_kernel();
50fail_notlocked:
50 SetPageError(page); 51 SetPageError(page);
51 kunmap(page); 52 kunmap(page);
52 unlock_page(page); 53 unlock_page(page);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9c677bbd0b08..3a3567433b92 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -120,7 +120,7 @@ struct epoll_filefd {
120 */ 120 */
121struct wake_task_node { 121struct wake_task_node {
122 struct list_head llink; 122 struct list_head llink;
123 task_t *task; 123 struct task_struct *task;
124 wait_queue_head_t *wq; 124 wait_queue_head_t *wq;
125}; 125};
126 126
@@ -413,7 +413,7 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
413{ 413{
414 int wake_nests = 0; 414 int wake_nests = 0;
415 unsigned long flags; 415 unsigned long flags;
416 task_t *this_task = current; 416 struct task_struct *this_task = current;
417 struct list_head *lsthead = &psw->wake_task_list, *lnk; 417 struct list_head *lsthead = &psw->wake_task_list, *lnk;
418 struct wake_task_node *tncur; 418 struct wake_task_node *tncur;
419 struct wake_task_node tnode; 419 struct wake_task_node tnode;
@@ -1168,7 +1168,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
1168eexit_1: 1168eexit_1:
1169 1169
1170 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n", 1170 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
1171 current, ep, epi->file, error)); 1171 current, ep, epi->ffd.file, error));
1172 1172
1173 return error; 1173 return error;
1174} 1174}
@@ -1236,7 +1236,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
1236 struct eventpoll *ep = epi->ep; 1236 struct eventpoll *ep = epi->ep;
1237 1237
1238 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", 1238 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
1239 current, epi->file, epi, ep)); 1239 current, epi->ffd.file, epi, ep));
1240 1240
1241 write_lock_irqsave(&ep->lock, flags); 1241 write_lock_irqsave(&ep->lock, flags);
1242 1242
diff --git a/fs/exec.c b/fs/exec.c
index 8344ba73a2a6..54135df2a966 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -486,8 +486,6 @@ struct file *open_exec(const char *name)
486 if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && 486 if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
487 S_ISREG(inode->i_mode)) { 487 S_ISREG(inode->i_mode)) {
488 int err = vfs_permission(&nd, MAY_EXEC); 488 int err = vfs_permission(&nd, MAY_EXEC);
489 if (!err && !(inode->i_mode & 0111))
490 err = -EACCES;
491 file = ERR_PTR(err); 489 file = ERR_PTR(err);
492 if (!err) { 490 if (!err) {
493 file = nameidata_to_filp(&nd, O_RDONLY); 491 file = nameidata_to_filp(&nd, O_RDONLY);
@@ -753,7 +751,7 @@ no_thread_group:
753 751
754 write_lock_irq(&tasklist_lock); 752 write_lock_irq(&tasklist_lock);
755 spin_lock(&oldsighand->siglock); 753 spin_lock(&oldsighand->siglock);
756 spin_lock(&newsighand->siglock); 754 spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING);
757 755
758 rcu_assign_pointer(current->sighand, newsighand); 756 rcu_assign_pointer(current->sighand, newsighand);
759 recalc_sigpending(); 757 recalc_sigpending();
@@ -922,12 +920,6 @@ int prepare_binprm(struct linux_binprm *bprm)
922 int retval; 920 int retval;
923 921
924 mode = inode->i_mode; 922 mode = inode->i_mode;
925 /*
926 * Check execute perms again - if the caller has CAP_DAC_OVERRIDE,
927 * generic_permission lets a non-executable through
928 */
929 if (!(mode & 0111)) /* with at least _one_ execute bit set */
930 return -EACCES;
931 if (bprm->file->f_op == NULL) 923 if (bprm->file->f_op == NULL)
932 return -EACCES; 924 return -EACCES;
933 925
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9f43879d6d68..ca5bfb6914d2 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -251,6 +251,44 @@ static struct super_operations ext2_sops = {
251#endif 251#endif
252}; 252};
253 253
254static struct dentry *ext2_get_dentry(struct super_block *sb, void *vobjp)
255{
256 __u32 *objp = vobjp;
257 unsigned long ino = objp[0];
258 __u32 generation = objp[1];
259 struct inode *inode;
260 struct dentry *result;
261
262 if (ino < EXT2_FIRST_INO(sb) && ino != EXT2_ROOT_INO)
263 return ERR_PTR(-ESTALE);
264 if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count))
265 return ERR_PTR(-ESTALE);
266
267 /* iget isn't really right if the inode is currently unallocated!!
268 * ext2_read_inode currently does appropriate checks, but
269 * it might be "neater" to call ext2_get_inode first and check
270 * if the inode is valid.....
271 */
272 inode = iget(sb, ino);
273 if (inode == NULL)
274 return ERR_PTR(-ENOMEM);
275 if (is_bad_inode(inode) ||
276 (generation && inode->i_generation != generation)) {
277 /* we didn't find the right inode.. */
278 iput(inode);
279 return ERR_PTR(-ESTALE);
280 }
281 /* now to find a dentry.
282 * If possible, get a well-connected one
283 */
284 result = d_alloc_anon(inode);
285 if (!result) {
286 iput(inode);
287 return ERR_PTR(-ENOMEM);
288 }
289 return result;
290}
291
254/* Yes, most of these are left as NULL!! 292/* Yes, most of these are left as NULL!!
255 * A NULL value implies the default, which works with ext2-like file 293 * A NULL value implies the default, which works with ext2-like file
256 * systems, but can be improved upon. 294 * systems, but can be improved upon.
@@ -258,6 +296,7 @@ static struct super_operations ext2_sops = {
258 */ 296 */
259static struct export_operations ext2_export_ops = { 297static struct export_operations ext2_export_ops = {
260 .get_parent = ext2_get_parent, 298 .get_parent = ext2_get_parent,
299 .get_dentry = ext2_get_dentry,
261}; 300};
262 301
263static unsigned long get_sb_block(void **data) 302static unsigned long get_sb_block(void **data)
@@ -775,7 +814,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
775 if (EXT2_INODE_SIZE(sb) == 0) 814 if (EXT2_INODE_SIZE(sb) == 0)
776 goto cantfind_ext2; 815 goto cantfind_ext2;
777 sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); 816 sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
778 if (sbi->s_inodes_per_block == 0) 817 if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
779 goto cantfind_ext2; 818 goto cantfind_ext2;
780 sbi->s_itb_per_group = sbi->s_inodes_per_group / 819 sbi->s_itb_per_group = sbi->s_inodes_per_group /
781 sbi->s_inodes_per_block; 820 sbi->s_inodes_per_block;
@@ -1157,7 +1196,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
1157 struct buffer_head tmp_bh; 1196 struct buffer_head tmp_bh;
1158 struct buffer_head *bh; 1197 struct buffer_head *bh;
1159 1198
1160 mutex_lock(&inode->i_mutex); 1199 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
1161 while (towrite > 0) { 1200 while (towrite > 0) {
1162 tocopy = sb->s_blocksize - offset < towrite ? 1201 tocopy = sb->s_blocksize - offset < towrite ?
1163 sb->s_blocksize - offset : towrite; 1202 sb->s_blocksize - offset : towrite;
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 92d50b53a933..0d1e6279cbfd 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -62,9 +62,6 @@ extern int ext3_permission (struct inode *, int, struct nameidata *);
62extern int ext3_acl_chmod (struct inode *); 62extern int ext3_acl_chmod (struct inode *);
63extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); 63extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
64 64
65extern int init_ext3_acl(void);
66extern void exit_ext3_acl(void);
67
68#else /* CONFIG_EXT3_FS_POSIX_ACL */ 65#else /* CONFIG_EXT3_FS_POSIX_ACL */
69#include <linux/sched.h> 66#include <linux/sched.h>
70#define ext3_permission NULL 67#define ext3_permission NULL
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a504a40d6d29..063d994bda0b 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1269,12 +1269,12 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1269 goal = le32_to_cpu(es->s_first_data_block); 1269 goal = le32_to_cpu(es->s_first_data_block);
1270 group_no = (goal - le32_to_cpu(es->s_first_data_block)) / 1270 group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
1271 EXT3_BLOCKS_PER_GROUP(sb); 1271 EXT3_BLOCKS_PER_GROUP(sb);
1272 goal_group = group_no;
1273retry_alloc:
1272 gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); 1274 gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1273 if (!gdp) 1275 if (!gdp)
1274 goto io_error; 1276 goto io_error;
1275 1277
1276 goal_group = group_no;
1277retry:
1278 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); 1278 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1279 /* 1279 /*
1280 * if there is not enough free blocks to make a new resevation 1280 * if there is not enough free blocks to make a new resevation
@@ -1349,7 +1349,7 @@ retry:
1349 if (my_rsv) { 1349 if (my_rsv) {
1350 my_rsv = NULL; 1350 my_rsv = NULL;
1351 group_no = goal_group; 1351 group_no = goal_group;
1352 goto retry; 1352 goto retry_alloc;
1353 } 1353 }
1354 /* No space left on the device */ 1354 /* No space left on the device */
1355 *errp = -ENOSPC; 1355 *errp = -ENOSPC;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f804d5e9d60c..84be02e93652 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -925,7 +925,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
925 set_buffer_new(bh_result); 925 set_buffer_new(bh_result);
926got_it: 926got_it:
927 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 927 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
928 if (blocks_to_boundary == 0) 928 if (count > blocks_to_boundary)
929 set_buffer_boundary(bh_result); 929 set_buffer_boundary(bh_result);
930 err = count; 930 err = count;
931 /* Clean up and exit */ 931 /* Clean up and exit */
@@ -1009,11 +1009,14 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
1009 buffer_trace_init(&dummy.b_history); 1009 buffer_trace_init(&dummy.b_history);
1010 err = ext3_get_blocks_handle(handle, inode, block, 1, 1010 err = ext3_get_blocks_handle(handle, inode, block, 1,
1011 &dummy, create, 1); 1011 &dummy, create, 1);
1012 if (err == 1) { 1012 /*
1013 * ext3_get_blocks_handle() returns number of blocks
1014 * mapped. 0 in case of a HOLE.
1015 */
1016 if (err > 0) {
1017 if (err > 1)
1018 WARN_ON(1);
1013 err = 0; 1019 err = 0;
1014 } else if (err >= 0) {
1015 WARN_ON(1);
1016 err = -EIO;
1017 } 1020 }
1018 *errp = err; 1021 *errp = err;
1019 if (!err && buffer_mapped(&dummy)) { 1022 if (!err && buffer_mapped(&dummy)) {
@@ -1158,7 +1161,7 @@ retry:
1158 ret = PTR_ERR(handle); 1161 ret = PTR_ERR(handle);
1159 goto out; 1162 goto out;
1160 } 1163 }
1161 if (test_opt(inode->i_sb, NOBH)) 1164 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
1162 ret = nobh_prepare_write(page, from, to, ext3_get_block); 1165 ret = nobh_prepare_write(page, from, to, ext3_get_block);
1163 else 1166 else
1164 ret = block_prepare_write(page, from, to, ext3_get_block); 1167 ret = block_prepare_write(page, from, to, ext3_get_block);
@@ -1244,7 +1247,7 @@ static int ext3_writeback_commit_write(struct file *file, struct page *page,
1244 if (new_i_size > EXT3_I(inode)->i_disksize) 1247 if (new_i_size > EXT3_I(inode)->i_disksize)
1245 EXT3_I(inode)->i_disksize = new_i_size; 1248 EXT3_I(inode)->i_disksize = new_i_size;
1246 1249
1247 if (test_opt(inode->i_sb, NOBH)) 1250 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
1248 ret = nobh_commit_write(file, page, from, to); 1251 ret = nobh_commit_write(file, page, from, to);
1249 else 1252 else
1250 ret = generic_commit_write(file, page, from, to); 1253 ret = generic_commit_write(file, page, from, to);
@@ -1494,7 +1497,7 @@ static int ext3_writeback_writepage(struct page *page,
1494 goto out_fail; 1497 goto out_fail;
1495 } 1498 }
1496 1499
1497 if (test_opt(inode->i_sb, NOBH)) 1500 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
1498 ret = nobh_writepage(page, ext3_get_block, wbc); 1501 ret = nobh_writepage(page, ext3_get_block, wbc);
1499 else 1502 else
1500 ret = block_write_full_page(page, ext3_get_block, wbc); 1503 ret = block_write_full_page(page, ext3_get_block, wbc);
@@ -2402,14 +2405,15 @@ static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
2402 struct buffer_head *bh; 2405 struct buffer_head *bh;
2403 struct ext3_group_desc * gdp; 2406 struct ext3_group_desc * gdp;
2404 2407
2405 2408 if (!ext3_valid_inum(sb, ino)) {
2406 if ((ino != EXT3_ROOT_INO && ino != EXT3_JOURNAL_INO && 2409 /*
2407 ino != EXT3_RESIZE_INO && ino < EXT3_FIRST_INO(sb)) || 2410 * This error is already checked for in namei.c unless we are
2408 ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) { 2411 * looking at an NFS filehandle, in which case no error
2409 ext3_error(sb, "ext3_get_inode_block", 2412 * report is needed
2410 "bad inode number: %lu", ino); 2413 */
2411 return 0; 2414 return 0;
2412 } 2415 }
2416
2413 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); 2417 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2414 if (block_group >= EXT3_SB(sb)->s_groups_count) { 2418 if (block_group >= EXT3_SB(sb)->s_groups_count) {
2415 ext3_error(sb,"ext3_get_inode_block","group >= groups count"); 2419 ext3_error(sb,"ext3_get_inode_block","group >= groups count");
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index d9176dba3698..2aa7101b27cd 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1000,7 +1000,12 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
1000 if (bh) { 1000 if (bh) {
1001 unsigned long ino = le32_to_cpu(de->inode); 1001 unsigned long ino = le32_to_cpu(de->inode);
1002 brelse (bh); 1002 brelse (bh);
1003 inode = iget(dir->i_sb, ino); 1003 if (!ext3_valid_inum(dir->i_sb, ino)) {
1004 ext3_error(dir->i_sb, "ext3_lookup",
1005 "bad inode number: %lu", ino);
1006 inode = NULL;
1007 } else
1008 inode = iget(dir->i_sb, ino);
1004 1009
1005 if (!inode) 1010 if (!inode)
1006 return ERR_PTR(-EACCES); 1011 return ERR_PTR(-EACCES);
@@ -1028,7 +1033,13 @@ struct dentry *ext3_get_parent(struct dentry *child)
1028 return ERR_PTR(-ENOENT); 1033 return ERR_PTR(-ENOENT);
1029 ino = le32_to_cpu(de->inode); 1034 ino = le32_to_cpu(de->inode);
1030 brelse(bh); 1035 brelse(bh);
1031 inode = iget(child->d_inode->i_sb, ino); 1036
1037 if (!ext3_valid_inum(child->d_inode->i_sb, ino)) {
1038 ext3_error(child->d_inode->i_sb, "ext3_get_parent",
1039 "bad inode number: %lu", ino);
1040 inode = NULL;
1041 } else
1042 inode = iget(child->d_inode->i_sb, ino);
1032 1043
1033 if (!inode) 1044 if (!inode)
1034 return ERR_PTR(-EACCES); 1045 return ERR_PTR(-EACCES);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f2dd71336612..3559086eee5f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -554,6 +554,47 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
554 return 0; 554 return 0;
555} 555}
556 556
557
558static struct dentry *ext3_get_dentry(struct super_block *sb, void *vobjp)
559{
560 __u32 *objp = vobjp;
561 unsigned long ino = objp[0];
562 __u32 generation = objp[1];
563 struct inode *inode;
564 struct dentry *result;
565
566 if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO)
567 return ERR_PTR(-ESTALE);
568 if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count))
569 return ERR_PTR(-ESTALE);
570
571 /* iget isn't really right if the inode is currently unallocated!!
572 *
573 * ext3_read_inode will return a bad_inode if the inode had been
574 * deleted, so we should be safe.
575 *
576 * Currently we don't know the generation for parent directory, so
577 * a generation of 0 means "accept any"
578 */
579 inode = iget(sb, ino);
580 if (inode == NULL)
581 return ERR_PTR(-ENOMEM);
582 if (is_bad_inode(inode) ||
583 (generation && inode->i_generation != generation)) {
584 iput(inode);
585 return ERR_PTR(-ESTALE);
586 }
587 /* now to find a dentry.
588 * If possible, get a well-connected one
589 */
590 result = d_alloc_anon(inode);
591 if (!result) {
592 iput(inode);
593 return ERR_PTR(-ENOMEM);
594 }
595 return result;
596}
597
557#ifdef CONFIG_QUOTA 598#ifdef CONFIG_QUOTA
558#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") 599#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
559#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 600#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -622,6 +663,7 @@ static struct super_operations ext3_sops = {
622 663
623static struct export_operations ext3_export_ops = { 664static struct export_operations ext3_export_ops = {
624 .get_parent = ext3_get_parent, 665 .get_parent = ext3_get_parent,
666 .get_dentry = ext3_get_dentry,
625}; 667};
626 668
627enum { 669enum {
@@ -2614,7 +2656,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
2614 struct buffer_head *bh; 2656 struct buffer_head *bh;
2615 handle_t *handle = journal_current_handle(); 2657 handle_t *handle = journal_current_handle();
2616 2658
2617 mutex_lock(&inode->i_mutex); 2659 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2618 while (towrite > 0) { 2660 while (towrite > 0) {
2619 tocopy = sb->s_blocksize - offset < towrite ? 2661 tocopy = sb->s_blocksize - offset < towrite ?
2620 sb->s_blocksize - offset : towrite; 2662 sb->s_blocksize - offset : towrite;
diff --git a/fs/file.c b/fs/file.c
index 55f4e7022563..b3c6b82e6a9d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -240,13 +240,9 @@ static struct fdtable *alloc_fdtable(int nr)
240 if (!fdt) 240 if (!fdt)
241 goto out; 241 goto out;
242 242
243 nfds = 8 * L1_CACHE_BYTES; 243 nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nr + 1));
244 /* Expand to the max in easy steps */ 244 if (nfds > NR_OPEN)
245 while (nfds <= nr) { 245 nfds = NR_OPEN;
246 nfds = nfds * 2;
247 if (nfds > NR_OPEN)
248 nfds = NR_OPEN;
249 }
250 246
251 new_openset = alloc_fdset(nfds); 247 new_openset = alloc_fdset(nfds);
252 new_execset = alloc_fdset(nfds); 248 new_execset = alloc_fdset(nfds);
@@ -277,11 +273,13 @@ static struct fdtable *alloc_fdtable(int nr)
277 } while (nfds <= nr); 273 } while (nfds <= nr);
278 new_fds = alloc_fd_array(nfds); 274 new_fds = alloc_fd_array(nfds);
279 if (!new_fds) 275 if (!new_fds)
280 goto out; 276 goto out2;
281 fdt->fd = new_fds; 277 fdt->fd = new_fds;
282 fdt->max_fds = nfds; 278 fdt->max_fds = nfds;
283 fdt->free_files = NULL; 279 fdt->free_files = NULL;
284 return fdt; 280 return fdt;
281out2:
282 nfds = fdt->max_fdset;
285out: 283out:
286 if (new_openset) 284 if (new_openset)
287 free_fdset(new_openset, nfds); 285 free_fdset(new_openset, nfds);
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 29cce456c7ce..43886fa00a2a 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -246,6 +246,8 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
246 u_long page, npages, block, pblocks, nblocks, offset; 246 u_long page, npages, block, pblocks, nblocks, offset;
247 loff_t pos; 247 loff_t pos;
248 248
249 lock_kernel();
250
249 switch ((long)fp->f_pos) { 251 switch ((long)fp->f_pos) {
250 case 0: 252 case 0:
251 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) 253 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a3bce3a77253..46fe60b2da23 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -105,7 +105,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
105 105
106/* 106/*
107 * Add a connection to the control filesystem (if it exists). Caller 107 * Add a connection to the control filesystem (if it exists). Caller
108 * must host fuse_mutex 108 * must hold fuse_mutex
109 */ 109 */
110int fuse_ctl_add_conn(struct fuse_conn *fc) 110int fuse_ctl_add_conn(struct fuse_conn *fc)
111{ 111{
@@ -139,7 +139,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
139 139
140/* 140/*
141 * Remove a connection from the control filesystem (if it exists). 141 * Remove a connection from the control filesystem (if it exists).
142 * Caller must host fuse_mutex 142 * Caller must hold fuse_mutex
143 */ 143 */
144void fuse_ctl_remove_conn(struct fuse_conn *fc) 144void fuse_ctl_remove_conn(struct fuse_conn *fc)
145{ 145{
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 72a74cde6de8..409ce6a7cca4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,6 +14,33 @@
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/namei.h> 15#include <linux/namei.h>
16 16
17#if BITS_PER_LONG >= 64
18static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
19{
20 entry->d_time = time;
21}
22
23static inline u64 fuse_dentry_time(struct dentry *entry)
24{
25 return entry->d_time;
26}
27#else
28/*
29 * On 32 bit archs store the high 32 bits of time in d_fsdata
30 */
31static void fuse_dentry_settime(struct dentry *entry, u64 time)
32{
33 entry->d_time = time;
34 entry->d_fsdata = (void *) (unsigned long) (time >> 32);
35}
36
37static u64 fuse_dentry_time(struct dentry *entry)
38{
39 return (u64) entry->d_time +
40 ((u64) (unsigned long) entry->d_fsdata << 32);
41}
42#endif
43
17/* 44/*
18 * FUSE caches dentries and attributes with separate timeout. The 45 * FUSE caches dentries and attributes with separate timeout. The
19 * time in jiffies until the dentry/attributes are valid is stored in 46 * time in jiffies until the dentry/attributes are valid is stored in
@@ -23,10 +50,13 @@
23/* 50/*
24 * Calculate the time in jiffies until a dentry/attributes are valid 51 * Calculate the time in jiffies until a dentry/attributes are valid
25 */ 52 */
26static unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec) 53static u64 time_to_jiffies(unsigned long sec, unsigned long nsec)
27{ 54{
28 struct timespec ts = {sec, nsec}; 55 if (sec || nsec) {
29 return jiffies + timespec_to_jiffies(&ts); 56 struct timespec ts = {sec, nsec};
57 return get_jiffies_64() + timespec_to_jiffies(&ts);
58 } else
59 return 0;
30} 60}
31 61
32/* 62/*
@@ -35,7 +65,8 @@ static unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec)
35 */ 65 */
36static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o) 66static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o)
37{ 67{
38 entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec); 68 fuse_dentry_settime(entry,
69 time_to_jiffies(o->entry_valid, o->entry_valid_nsec));
39 if (entry->d_inode) 70 if (entry->d_inode)
40 get_fuse_inode(entry->d_inode)->i_time = 71 get_fuse_inode(entry->d_inode)->i_time =
41 time_to_jiffies(o->attr_valid, o->attr_valid_nsec); 72 time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
@@ -47,7 +78,7 @@ static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o)
47 */ 78 */
48void fuse_invalidate_attr(struct inode *inode) 79void fuse_invalidate_attr(struct inode *inode)
49{ 80{
50 get_fuse_inode(inode)->i_time = jiffies - 1; 81 get_fuse_inode(inode)->i_time = 0;
51} 82}
52 83
53/* 84/*
@@ -60,7 +91,7 @@ void fuse_invalidate_attr(struct inode *inode)
60 */ 91 */
61static void fuse_invalidate_entry_cache(struct dentry *entry) 92static void fuse_invalidate_entry_cache(struct dentry *entry)
62{ 93{
63 entry->d_time = jiffies - 1; 94 fuse_dentry_settime(entry, 0);
64} 95}
65 96
66/* 97/*
@@ -102,7 +133,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
102 133
103 if (inode && is_bad_inode(inode)) 134 if (inode && is_bad_inode(inode))
104 return 0; 135 return 0;
105 else if (time_after(jiffies, entry->d_time)) { 136 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
106 int err; 137 int err;
107 struct fuse_entry_out outarg; 138 struct fuse_entry_out outarg;
108 struct fuse_conn *fc; 139 struct fuse_conn *fc;
@@ -666,7 +697,7 @@ static int fuse_revalidate(struct dentry *entry)
666 if (!fuse_allow_task(fc, current)) 697 if (!fuse_allow_task(fc, current))
667 return -EACCES; 698 return -EACCES;
668 if (get_node_id(inode) != FUSE_ROOT_ID && 699 if (get_node_id(inode) != FUSE_ROOT_ID &&
669 time_before_eq(jiffies, fi->i_time)) 700 fi->i_time >= get_jiffies_64())
670 return 0; 701 return 0;
671 702
672 return fuse_do_getattr(inode); 703 return fuse_do_getattr(inode);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 63614ed16336..5c4fcd1dbf59 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -395,14 +395,16 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
395 struct fuse_readpages_data data; 395 struct fuse_readpages_data data;
396 int err; 396 int err;
397 397
398 err = -EIO;
398 if (is_bad_inode(inode)) 399 if (is_bad_inode(inode))
399 return -EIO; 400 goto clean_pages_up;
400 401
401 data.file = file; 402 data.file = file;
402 data.inode = inode; 403 data.inode = inode;
403 data.req = fuse_get_req(fc); 404 data.req = fuse_get_req(fc);
405 err = PTR_ERR(data.req);
404 if (IS_ERR(data.req)) 406 if (IS_ERR(data.req))
405 return PTR_ERR(data.req); 407 goto clean_pages_up;
406 408
407 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); 409 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
408 if (!err) { 410 if (!err) {
@@ -412,6 +414,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
412 fuse_put_request(fc, data.req); 414 fuse_put_request(fc, data.req);
413 } 415 }
414 return err; 416 return err;
417
418clean_pages_up:
419 put_pages_list(pages);
420 return err;
415} 421}
416 422
417static size_t fuse_send_write(struct fuse_req *req, struct file *file, 423static size_t fuse_send_write(struct fuse_req *req, struct file *file,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0dbf96621841..69c7750d55b8 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -59,7 +59,7 @@ struct fuse_inode {
59 struct fuse_req *forget_req; 59 struct fuse_req *forget_req;
60 60
61 /** Time in jiffies until the file attributes are valid */ 61 /** Time in jiffies until the file attributes are valid */
62 unsigned long i_time; 62 u64 i_time;
63}; 63};
64 64
65/** FUSE specific file data */ 65/** FUSE specific file data */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index dcaaabd3b9c4..7d25092262ae 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -51,7 +51,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
51 return NULL; 51 return NULL;
52 52
53 fi = get_fuse_inode(inode); 53 fi = get_fuse_inode(inode);
54 fi->i_time = jiffies - 1; 54 fi->i_time = 0;
55 fi->nodeid = 0; 55 fi->nodeid = 0;
56 fi->nlookup = 0; 56 fi->nlookup = 0;
57 fi->forget_req = fuse_request_alloc(); 57 fi->forget_req = fuse_request_alloc();
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6449cb697967..c3920c96dadf 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,8 +83,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
83 83
84 ret = -ENOMEM; 84 ret = -ENOMEM;
85 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 85 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
86 if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
87 goto out;
88 86
89 if (vma->vm_flags & VM_MAYSHARE && 87 if (vma->vm_flags & VM_MAYSHARE &&
90 hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), 88 hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
@@ -93,7 +91,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
93 91
94 ret = 0; 92 ret = 0;
95 hugetlb_prefault_arch_hook(vma->vm_mm); 93 hugetlb_prefault_arch_hook(vma->vm_mm);
96 if (inode->i_size < len) 94 if (vma->vm_flags & VM_WRITE && inode->i_size < len)
97 inode->i_size = len; 95 inode->i_size = len;
98out: 96out:
99 mutex_unlock(&inode->i_mutex); 97 mutex_unlock(&inode->i_mutex);
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index f2386442adee..017cb0f134d6 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -187,7 +187,7 @@ static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
187{ 187{
188 struct inotify_kernel_event *kevent; 188 struct inotify_kernel_event *kevent;
189 189
190 kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL); 190 kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
191 if (unlikely(!kevent)) 191 if (unlikely(!kevent))
192 return NULL; 192 return NULL;
193 193
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 93aa5715f224..78b1deae3fa2 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -44,6 +44,9 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
44 task->ioprio = ioprio; 44 task->ioprio = ioprio;
45 45
46 ioc = task->io_context; 46 ioc = task->io_context;
47 /* see wmb() in current_io_context() */
48 smp_read_barrier_depends();
49
47 if (ioc && ioc->set_ioprio) 50 if (ioc && ioc->set_ioprio)
48 ioc->set_ioprio(ioc, ioprio); 51 ioc->set_ioprio(ioc, ioprio);
49 52
@@ -111,9 +114,9 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
111 continue; 114 continue;
112 ret = set_task_ioprio(p, ioprio); 115 ret = set_task_ioprio(p, ioprio);
113 if (ret) 116 if (ret)
114 break; 117 goto free_uid;
115 } while_each_thread(g, p); 118 } while_each_thread(g, p);
116 119free_uid:
117 if (who) 120 if (who)
118 free_uid(user); 121 free_uid(user);
119 break; 122 break;
@@ -137,6 +140,29 @@ out:
137 return ret; 140 return ret;
138} 141}
139 142
143int ioprio_best(unsigned short aprio, unsigned short bprio)
144{
145 unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
146 unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
147
148 if (!ioprio_valid(aprio))
149 return bprio;
150 if (!ioprio_valid(bprio))
151 return aprio;
152
153 if (aclass == IOPRIO_CLASS_NONE)
154 aclass = IOPRIO_CLASS_BE;
155 if (bclass == IOPRIO_CLASS_NONE)
156 bclass = IOPRIO_CLASS_BE;
157
158 if (aclass == bclass)
159 return min(aprio, bprio);
160 if (aclass > bclass)
161 return bprio;
162 else
163 return aprio;
164}
165
140asmlinkage long sys_ioprio_get(int which, int who) 166asmlinkage long sys_ioprio_get(int which, int who)
141{ 167{
142 struct task_struct *g, *p; 168 struct task_struct *g, *p;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 0971814c38b8..42da60784311 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -261,7 +261,7 @@ void journal_commit_transaction(journal_t *journal)
261 struct buffer_head *bh = jh2bh(jh); 261 struct buffer_head *bh = jh2bh(jh);
262 262
263 jbd_lock_bh_state(bh); 263 jbd_lock_bh_state(bh);
264 kfree(jh->b_committed_data); 264 jbd_slab_free(jh->b_committed_data, bh->b_size);
265 jh->b_committed_data = NULL; 265 jh->b_committed_data = NULL;
266 jbd_unlock_bh_state(bh); 266 jbd_unlock_bh_state(bh);
267 } 267 }
@@ -745,14 +745,14 @@ restart_loop:
745 * Otherwise, we can just throw away the frozen data now. 745 * Otherwise, we can just throw away the frozen data now.
746 */ 746 */
747 if (jh->b_committed_data) { 747 if (jh->b_committed_data) {
748 kfree(jh->b_committed_data); 748 jbd_slab_free(jh->b_committed_data, bh->b_size);
749 jh->b_committed_data = NULL; 749 jh->b_committed_data = NULL;
750 if (jh->b_frozen_data) { 750 if (jh->b_frozen_data) {
751 jh->b_committed_data = jh->b_frozen_data; 751 jh->b_committed_data = jh->b_frozen_data;
752 jh->b_frozen_data = NULL; 752 jh->b_frozen_data = NULL;
753 } 753 }
754 } else if (jh->b_frozen_data) { 754 } else if (jh->b_frozen_data) {
755 kfree(jh->b_frozen_data); 755 jbd_slab_free(jh->b_frozen_data, bh->b_size);
756 jh->b_frozen_data = NULL; 756 jh->b_frozen_data = NULL;
757 } 757 }
758 758
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 8c9b28dff119..f66724ce443a 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -84,6 +84,7 @@ EXPORT_SYMBOL(journal_force_commit);
84 84
85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
86static void __journal_abort_soft (journal_t *journal, int errno); 86static void __journal_abort_soft (journal_t *journal, int errno);
87static int journal_create_jbd_slab(size_t slab_size);
87 88
88/* 89/*
89 * Helper function used to manage commit timeouts 90 * Helper function used to manage commit timeouts
@@ -328,10 +329,10 @@ repeat:
328 char *tmp; 329 char *tmp;
329 330
330 jbd_unlock_bh_state(bh_in); 331 jbd_unlock_bh_state(bh_in);
331 tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS); 332 tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS);
332 jbd_lock_bh_state(bh_in); 333 jbd_lock_bh_state(bh_in);
333 if (jh_in->b_frozen_data) { 334 if (jh_in->b_frozen_data) {
334 kfree(tmp); 335 jbd_slab_free(tmp, bh_in->b_size);
335 goto repeat; 336 goto repeat;
336 } 337 }
337 338
@@ -1069,17 +1070,17 @@ static int load_superblock(journal_t *journal)
1069int journal_load(journal_t *journal) 1070int journal_load(journal_t *journal)
1070{ 1071{
1071 int err; 1072 int err;
1073 journal_superblock_t *sb;
1072 1074
1073 err = load_superblock(journal); 1075 err = load_superblock(journal);
1074 if (err) 1076 if (err)
1075 return err; 1077 return err;
1076 1078
1079 sb = journal->j_superblock;
1077 /* If this is a V2 superblock, then we have to check the 1080 /* If this is a V2 superblock, then we have to check the
1078 * features flags on it. */ 1081 * features flags on it. */
1079 1082
1080 if (journal->j_format_version >= 2) { 1083 if (journal->j_format_version >= 2) {
1081 journal_superblock_t *sb = journal->j_superblock;
1082
1083 if ((sb->s_feature_ro_compat & 1084 if ((sb->s_feature_ro_compat &
1084 ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || 1085 ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
1085 (sb->s_feature_incompat & 1086 (sb->s_feature_incompat &
@@ -1090,6 +1091,13 @@ int journal_load(journal_t *journal)
1090 } 1091 }
1091 } 1092 }
1092 1093
1094 /*
1095 * Create a slab for this blocksize
1096 */
1097 err = journal_create_jbd_slab(cpu_to_be32(sb->s_blocksize));
1098 if (err)
1099 return err;
1100
1093 /* Let the recovery code check whether it needs to recover any 1101 /* Let the recovery code check whether it needs to recover any
1094 * data from the journal. */ 1102 * data from the journal. */
1095 if (journal_recover(journal)) 1103 if (journal_recover(journal))
@@ -1612,6 +1620,77 @@ void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
1612} 1620}
1613 1621
1614/* 1622/*
1623 * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
1624 * and allocate frozen and commit buffers from these slabs.
1625 *
1626 * The reason for doing this is to avoid SLAB_DEBUG, since it could
1627 * cause bh to cross a page boundary.
1628 */
1629
1630#define JBD_MAX_SLABS 5
1631#define JBD_SLAB_INDEX(size) (size >> 11)
1632
1633static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
1634static const char *jbd_slab_names[JBD_MAX_SLABS] = {
1635 "jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k"
1636};
1637
1638static void journal_destroy_jbd_slabs(void)
1639{
1640 int i;
1641
1642 for (i = 0; i < JBD_MAX_SLABS; i++) {
1643 if (jbd_slab[i])
1644 kmem_cache_destroy(jbd_slab[i]);
1645 jbd_slab[i] = NULL;
1646 }
1647}
1648
1649static int journal_create_jbd_slab(size_t slab_size)
1650{
1651 int i = JBD_SLAB_INDEX(slab_size);
1652
1653 BUG_ON(i >= JBD_MAX_SLABS);
1654
1655 /*
1656 * Check if we already have a slab created for this size
1657 */
1658 if (jbd_slab[i])
1659 return 0;
1660
1661 /*
1662 * Create a slab and force its alignment to be the same as the slab
1663 * size - this makes sure that allocations won't cross a page
1664 * boundary.
1665 */
1666 jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
1667 slab_size, slab_size, 0, NULL, NULL);
1668 if (!jbd_slab[i]) {
1669 printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
1670 return -ENOMEM;
1671 }
1672 return 0;
1673}
1674
1675void * jbd_slab_alloc(size_t size, gfp_t flags)
1676{
1677 int idx;
1678
1679 idx = JBD_SLAB_INDEX(size);
1680 BUG_ON(jbd_slab[idx] == NULL);
1681 return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
1682}
1683
1684void jbd_slab_free(void *ptr, size_t size)
1685{
1686 int idx;
1687
1688 idx = JBD_SLAB_INDEX(size);
1689 BUG_ON(jbd_slab[idx] == NULL);
1690 kmem_cache_free(jbd_slab[idx], ptr);
1691}
1692
1693/*
1615 * Journal_head storage management 1694 * Journal_head storage management
1616 */ 1695 */
1617static kmem_cache_t *journal_head_cache; 1696static kmem_cache_t *journal_head_cache;
@@ -1799,13 +1878,13 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
1799 printk(KERN_WARNING "%s: freeing " 1878 printk(KERN_WARNING "%s: freeing "
1800 "b_frozen_data\n", 1879 "b_frozen_data\n",
1801 __FUNCTION__); 1880 __FUNCTION__);
1802 kfree(jh->b_frozen_data); 1881 jbd_slab_free(jh->b_frozen_data, bh->b_size);
1803 } 1882 }
1804 if (jh->b_committed_data) { 1883 if (jh->b_committed_data) {
1805 printk(KERN_WARNING "%s: freeing " 1884 printk(KERN_WARNING "%s: freeing "
1806 "b_committed_data\n", 1885 "b_committed_data\n",
1807 __FUNCTION__); 1886 __FUNCTION__);
1808 kfree(jh->b_committed_data); 1887 jbd_slab_free(jh->b_committed_data, bh->b_size);
1809 } 1888 }
1810 bh->b_private = NULL; 1889 bh->b_private = NULL;
1811 jh->b_bh = NULL; /* debug, really */ 1890 jh->b_bh = NULL; /* debug, really */
@@ -1961,6 +2040,7 @@ static void journal_destroy_caches(void)
1961 journal_destroy_revoke_caches(); 2040 journal_destroy_revoke_caches();
1962 journal_destroy_journal_head_cache(); 2041 journal_destroy_journal_head_cache();
1963 journal_destroy_handle_cache(); 2042 journal_destroy_handle_cache();
2043 journal_destroy_jbd_slabs();
1964} 2044}
1965 2045
1966static int __init journal_init(void) 2046static int __init journal_init(void)
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 508b2ea91f43..f5169a96260e 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -666,8 +666,9 @@ repeat:
666 if (!frozen_buffer) { 666 if (!frozen_buffer) {
667 JBUFFER_TRACE(jh, "allocate memory for buffer"); 667 JBUFFER_TRACE(jh, "allocate memory for buffer");
668 jbd_unlock_bh_state(bh); 668 jbd_unlock_bh_state(bh);
669 frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, 669 frozen_buffer =
670 GFP_NOFS); 670 jbd_slab_alloc(jh2bh(jh)->b_size,
671 GFP_NOFS);
671 if (!frozen_buffer) { 672 if (!frozen_buffer) {
672 printk(KERN_EMERG 673 printk(KERN_EMERG
673 "%s: OOM for frozen_buffer\n", 674 "%s: OOM for frozen_buffer\n",
@@ -726,7 +727,7 @@ done:
726 727
727out: 728out:
728 if (unlikely(frozen_buffer)) /* It's usually NULL */ 729 if (unlikely(frozen_buffer)) /* It's usually NULL */
729 kfree(frozen_buffer); 730 jbd_slab_free(frozen_buffer, bh->b_size);
730 731
731 JBUFFER_TRACE(jh, "exit"); 732 JBUFFER_TRACE(jh, "exit");
732 return error; 733 return error;
@@ -879,7 +880,7 @@ int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
879 880
880repeat: 881repeat:
881 if (!jh->b_committed_data) { 882 if (!jh->b_committed_data) {
882 committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS); 883 committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
883 if (!committed_data) { 884 if (!committed_data) {
884 printk(KERN_EMERG "%s: No memory for committed data\n", 885 printk(KERN_EMERG "%s: No memory for committed data\n",
885 __FUNCTION__); 886 __FUNCTION__);
@@ -906,7 +907,7 @@ repeat:
906out: 907out:
907 journal_put_journal_head(jh); 908 journal_put_journal_head(jh);
908 if (unlikely(committed_data)) 909 if (unlikely(committed_data))
909 kfree(committed_data); 910 jbd_slab_free(committed_data, bh->b_size);
910 return err; 911 return err;
911} 912}
912 913
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 9c2077e7e081..0ae3cd10702c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -345,10 +345,8 @@ int jffs2_init_acl(struct inode *inode, struct inode *dir)
345 return rc; 345 return rc;
346} 346}
347 347
348void jffs2_clear_acl(struct inode *inode) 348void jffs2_clear_acl(struct jffs2_inode_info *f)
349{ 349{
350 struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
351
352 if (f->i_acl_access && f->i_acl_access != JFFS2_ACL_NOT_CACHED) { 350 if (f->i_acl_access && f->i_acl_access != JFFS2_ACL_NOT_CACHED) {
353 posix_acl_release(f->i_acl_access); 351 posix_acl_release(f->i_acl_access);
354 f->i_acl_access = JFFS2_ACL_NOT_CACHED; 352 f->i_acl_access = JFFS2_ACL_NOT_CACHED;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 8893bd1a6ba7..fa327dbd3171 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -30,7 +30,7 @@ struct jffs2_acl_header {
30extern int jffs2_permission(struct inode *, int, struct nameidata *); 30extern int jffs2_permission(struct inode *, int, struct nameidata *);
31extern int jffs2_acl_chmod(struct inode *); 31extern int jffs2_acl_chmod(struct inode *);
32extern int jffs2_init_acl(struct inode *, struct inode *); 32extern int jffs2_init_acl(struct inode *, struct inode *);
33extern void jffs2_clear_acl(struct inode *); 33extern void jffs2_clear_acl(struct jffs2_inode_info *);
34 34
35extern struct xattr_handler jffs2_acl_access_xattr_handler; 35extern struct xattr_handler jffs2_acl_access_xattr_handler;
36extern struct xattr_handler jffs2_acl_default_xattr_handler; 36extern struct xattr_handler jffs2_acl_default_xattr_handler;
@@ -40,6 +40,6 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
40#define jffs2_permission NULL 40#define jffs2_permission NULL
41#define jffs2_acl_chmod(inode) (0) 41#define jffs2_acl_chmod(inode) (0)
42#define jffs2_init_acl(inode,dir) (0) 42#define jffs2_init_acl(inode,dir) (0)
43#define jffs2_clear_acl(inode) 43#define jffs2_clear_acl(f)
44 44
45#endif /* CONFIG_JFFS2_FS_POSIX_ACL */ 45#endif /* CONFIG_JFFS2_FS_POSIX_ACL */
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 8310c95478e9..33f291005012 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -190,7 +190,7 @@ void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *x)
190 kmem_cache_free(tmp_dnode_info_slab, x); 190 kmem_cache_free(tmp_dnode_info_slab, x);
191} 191}
192 192
193struct jffs2_raw_node_ref *jffs2_alloc_refblock(void) 193static struct jffs2_raw_node_ref *jffs2_alloc_refblock(void)
194{ 194{
195 struct jffs2_raw_node_ref *ret; 195 struct jffs2_raw_node_ref *ret;
196 196
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index f752baa8d399..cae92c14116d 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -426,8 +426,6 @@ char *jffs2_getlink(struct jffs2_sb_info *c, struct jffs2_inode_info *f);
426/* scan.c */ 426/* scan.c */
427int jffs2_scan_medium(struct jffs2_sb_info *c); 427int jffs2_scan_medium(struct jffs2_sb_info *c);
428void jffs2_rotate_lists(struct jffs2_sb_info *c); 428void jffs2_rotate_lists(struct jffs2_sb_info *c);
429int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf,
430 uint32_t ofs, uint32_t len);
431struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino); 429struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino);
432int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); 430int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
433int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t size); 431int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t size);
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index cc1899268c43..266423b2709d 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -968,6 +968,7 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f)
968 struct jffs2_full_dirent *fd, *fds; 968 struct jffs2_full_dirent *fd, *fds;
969 int deleted; 969 int deleted;
970 970
971 jffs2_clear_acl(f);
971 jffs2_xattr_delete_inode(c, f->inocache); 972 jffs2_xattr_delete_inode(c, f->inocache);
972 down(&f->sem); 973 down(&f->sem);
973 deleted = f->inocache && !f->inocache->nlink; 974 deleted = f->inocache && !f->inocache->nlink;
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 2bfdc33752d3..e2413466ddd5 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -274,8 +274,8 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
274 return ret; 274 return ret;
275} 275}
276 276
277int jffs2_fill_scan_buf (struct jffs2_sb_info *c, void *buf, 277static int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf,
278 uint32_t ofs, uint32_t len) 278 uint32_t ofs, uint32_t len)
279{ 279{
280 int ret; 280 int ret;
281 size_t retlen; 281 size_t retlen;
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index c19bd476e8ec..e52cef526d90 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -252,6 +252,11 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
252 union jffs2_node_union *node; 252 union jffs2_node_union *node;
253 struct jffs2_eraseblock *jeb; 253 struct jffs2_eraseblock *jeb;
254 254
255 if (c->summary->sum_size == JFFS2_SUMMARY_NOSUM_SIZE) {
256 dbg_summary("Summary is disabled for this jeb! Skipping summary info!\n");
257 return 0;
258 }
259
255 node = invecs[0].iov_base; 260 node = invecs[0].iov_base;
256 jeb = &c->blocks[ofs / c->sector_size]; 261 jeb = &c->blocks[ofs / c->sector_size];
257 ofs -= jeb->offset; 262 ofs -= jeb->offset;
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 18e66dbf23b4..25bc1ae08648 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -50,9 +50,10 @@
50 * is used to write xdatum to medium. xd->version will be incremented. 50 * is used to write xdatum to medium. xd->version will be incremented.
51 * create_xattr_datum(c, xprefix, xname, xvalue, xsize) 51 * create_xattr_datum(c, xprefix, xname, xvalue, xsize)
52 * is used to create new xdatum and write to medium. 52 * is used to create new xdatum and write to medium.
53 * delete_xattr_datum(c, xd) 53 * unrefer_xattr_datum(c, xd)
54 * is used to delete a xdatum. It marks xd JFFS2_XFLAGS_DEAD, and allows 54 * is used to delete a xdatum. When nobody refers to this xdatum, JFFS2_XFLAGS_DEAD
55 * GC to reclaim those physical nodes. 55 * is set on xd->flags and it is either chained to xattr_dead_list or released immediately.
56 * In the first case, the garbage collector releases it later.
56 * -------------------------------------------------- */ 57 * -------------------------------------------------- */
57static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize) 58static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize)
58{ 59{
@@ -394,22 +395,24 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
394 return xd; 395 return xd;
395} 396}
396 397
397static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) 398static void unrefer_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
398{ 399{
399 /* must be called under down_write(xattr_sem) */ 400 /* must be called under down_write(xattr_sem) */
400 BUG_ON(atomic_read(&xd->refcnt)); 401 if (atomic_dec_and_lock(&xd->refcnt, &c->erase_completion_lock)) {
402 uint32_t xid = xd->xid, version = xd->version;
401 403
402 unload_xattr_datum(c, xd); 404 unload_xattr_datum(c, xd);
403 xd->flags |= JFFS2_XFLAGS_DEAD; 405 xd->flags |= JFFS2_XFLAGS_DEAD;
404 spin_lock(&c->erase_completion_lock); 406 if (xd->node == (void *)xd) {
405 if (xd->node == (void *)xd) { 407 BUG_ON(!(xd->flags & JFFS2_XFLAGS_INVALID));
406 BUG_ON(!(xd->flags & JFFS2_XFLAGS_INVALID)); 408 jffs2_free_xattr_datum(xd);
407 jffs2_free_xattr_datum(xd); 409 } else {
408 } else { 410 list_add(&xd->xindex, &c->xattr_dead_list);
409 list_add(&xd->xindex, &c->xattr_dead_list); 411 }
412 spin_unlock(&c->erase_completion_lock);
413
414 dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n", xid, version);
410 } 415 }
411 spin_unlock(&c->erase_completion_lock);
412 dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n", xd->xid, xd->version);
413} 416}
414 417
415/* -------- xref related functions ------------------ 418/* -------- xref related functions ------------------
@@ -580,8 +583,7 @@ static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *re
580 dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) was removed.\n", 583 dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) was removed.\n",
581 ref->ino, ref->xid, ref->xseqno); 584 ref->ino, ref->xid, ref->xseqno);
582 585
583 if (atomic_dec_and_test(&xd->refcnt)) 586 unrefer_xattr_datum(c, xd);
584 delete_xattr_datum(c, xd);
585} 587}
586 588
587void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) 589void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
@@ -1119,8 +1121,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
1119 ref->next = c->xref_dead_list; 1121 ref->next = c->xref_dead_list;
1120 c->xref_dead_list = ref; 1122 c->xref_dead_list = ref;
1121 spin_unlock(&c->erase_completion_lock); 1123 spin_unlock(&c->erase_completion_lock);
1122 if (atomic_dec_and_test(&xd->refcnt)) 1124 unrefer_xattr_datum(c, xd);
1123 delete_xattr_datum(c, xd);
1124 } else { 1125 } else {
1125 ref->ic = ic; 1126 ref->ic = ic;
1126 ref->xd = xd; 1127 ref->xd = xd;
@@ -1156,8 +1157,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
1156 down_write(&c->xattr_sem); 1157 down_write(&c->xattr_sem);
1157 if (rc) { 1158 if (rc) {
1158 JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request); 1159 JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
1159 if (atomic_dec_and_test(&xd->refcnt)) 1160 unrefer_xattr_datum(c, xd);
1160 delete_xattr_datum(c, xd);
1161 up_write(&c->xattr_sem); 1161 up_write(&c->xattr_sem);
1162 return rc; 1162 return rc;
1163 } 1163 }
@@ -1170,8 +1170,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
1170 ic->xref = ref; 1170 ic->xref = ref;
1171 } 1171 }
1172 rc = PTR_ERR(newref); 1172 rc = PTR_ERR(newref);
1173 if (atomic_dec_and_test(&xd->refcnt)) 1173 unrefer_xattr_datum(c, xd);
1174 delete_xattr_datum(c, xd);
1175 } else if (ref) { 1174 } else if (ref) {
1176 delete_xattr_ref(c, ref); 1175 delete_xattr_ref(c, ref);
1177 } 1176 }
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 43e3f566aad6..a223cf4faa9b 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -168,16 +168,15 @@ void jfs_dirty_inode(struct inode *inode)
168 set_cflag(COMMIT_Dirty, inode); 168 set_cflag(COMMIT_Dirty, inode);
169} 169}
170 170
171static int 171int jfs_get_block(struct inode *ip, sector_t lblock,
172jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks, 172 struct buffer_head *bh_result, int create)
173 struct buffer_head *bh_result, int create)
174{ 173{
175 s64 lblock64 = lblock; 174 s64 lblock64 = lblock;
176 int rc = 0; 175 int rc = 0;
177 xad_t xad; 176 xad_t xad;
178 s64 xaddr; 177 s64 xaddr;
179 int xflag; 178 int xflag;
180 s32 xlen = max_blocks; 179 s32 xlen = bh_result->b_size >> ip->i_blkbits;
181 180
182 /* 181 /*
183 * Take appropriate lock on inode 182 * Take appropriate lock on inode
@@ -188,7 +187,7 @@ jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks,
188 IREAD_LOCK(ip); 187 IREAD_LOCK(ip);
189 188
190 if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && 189 if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) &&
191 (!xtLookup(ip, lblock64, max_blocks, &xflag, &xaddr, &xlen, 0)) && 190 (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) &&
192 xaddr) { 191 xaddr) {
193 if (xflag & XAD_NOTRECORDED) { 192 if (xflag & XAD_NOTRECORDED) {
194 if (!create) 193 if (!create)
@@ -255,13 +254,6 @@ jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks,
255 return rc; 254 return rc;
256} 255}
257 256
258static int jfs_get_block(struct inode *ip, sector_t lblock,
259 struct buffer_head *bh_result, int create)
260{
261 return jfs_get_blocks(ip, lblock, bh_result->b_size >> ip->i_blkbits,
262 bh_result, create);
263}
264
265static int jfs_writepage(struct page *page, struct writeback_control *wbc) 257static int jfs_writepage(struct page *page, struct writeback_control *wbc)
266{ 258{
267 return nobh_writepage(page, jfs_get_block, wbc); 259 return nobh_writepage(page, jfs_get_block, wbc);
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index b5c7da6190dc..1fc48df670c8 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -32,6 +32,7 @@ extern void jfs_truncate_nolock(struct inode *, loff_t);
32extern void jfs_free_zero_link(struct inode *); 32extern void jfs_free_zero_link(struct inode *);
33extern struct dentry *jfs_get_parent(struct dentry *dentry); 33extern struct dentry *jfs_get_parent(struct dentry *dentry);
34extern void jfs_set_inode_flags(struct inode *); 34extern void jfs_set_inode_flags(struct inode *);
35extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
35 36
36extern const struct address_space_operations jfs_aops; 37extern const struct address_space_operations jfs_aops;
37extern struct inode_operations jfs_dir_inode_operations; 38extern struct inode_operations jfs_dir_inode_operations;
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 10c46231ce15..efbb586bed4b 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2944,7 +2944,7 @@ int jfs_sync(void *arg)
2944 * Inode is being freed 2944 * Inode is being freed
2945 */ 2945 */
2946 list_del_init(&jfs_ip->anon_inode_list); 2946 list_del_init(&jfs_ip->anon_inode_list);
2947 } else if (! !mutex_trylock(&jfs_ip->commit_mutex)) { 2947 } else if (mutex_trylock(&jfs_ip->commit_mutex)) {
2948 /* 2948 /*
2949 * inode will be removed from anonymous list 2949 * inode will be removed from anonymous list
2950 * when it is committed 2950 * when it is committed
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 09ea03f62277..295268ad231b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -165,8 +165,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
165 165
166 out3: 166 out3:
167 txEnd(tid); 167 txEnd(tid);
168 mutex_unlock(&JFS_IP(dip)->commit_mutex);
169 mutex_unlock(&JFS_IP(ip)->commit_mutex); 168 mutex_unlock(&JFS_IP(ip)->commit_mutex);
169 mutex_unlock(&JFS_IP(dip)->commit_mutex);
170 if (rc) { 170 if (rc) {
171 free_ea_wmap(ip); 171 free_ea_wmap(ip);
172 ip->i_nlink = 0; 172 ip->i_nlink = 0;
@@ -300,8 +300,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
300 300
301 out3: 301 out3:
302 txEnd(tid); 302 txEnd(tid);
303 mutex_unlock(&JFS_IP(dip)->commit_mutex);
304 mutex_unlock(&JFS_IP(ip)->commit_mutex); 303 mutex_unlock(&JFS_IP(ip)->commit_mutex);
304 mutex_unlock(&JFS_IP(dip)->commit_mutex);
305 if (rc) { 305 if (rc) {
306 free_ea_wmap(ip); 306 free_ea_wmap(ip);
307 ip->i_nlink = 0; 307 ip->i_nlink = 0;
@@ -384,8 +384,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
384 if (rc == -EIO) 384 if (rc == -EIO)
385 txAbort(tid, 1); 385 txAbort(tid, 1);
386 txEnd(tid); 386 txEnd(tid);
387 mutex_unlock(&JFS_IP(dip)->commit_mutex);
388 mutex_unlock(&JFS_IP(ip)->commit_mutex); 387 mutex_unlock(&JFS_IP(ip)->commit_mutex);
388 mutex_unlock(&JFS_IP(dip)->commit_mutex);
389 389
390 goto out2; 390 goto out2;
391 } 391 }
@@ -422,8 +422,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
422 422
423 txEnd(tid); 423 txEnd(tid);
424 424
425 mutex_unlock(&JFS_IP(dip)->commit_mutex);
426 mutex_unlock(&JFS_IP(ip)->commit_mutex); 425 mutex_unlock(&JFS_IP(ip)->commit_mutex);
426 mutex_unlock(&JFS_IP(dip)->commit_mutex);
427 427
428 /* 428 /*
429 * Truncating the directory index table is not guaranteed. It 429 * Truncating the directory index table is not guaranteed. It
@@ -503,8 +503,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
503 if (rc == -EIO) 503 if (rc == -EIO)
504 txAbort(tid, 1); /* Marks FS Dirty */ 504 txAbort(tid, 1); /* Marks FS Dirty */
505 txEnd(tid); 505 txEnd(tid);
506 mutex_unlock(&JFS_IP(dip)->commit_mutex);
507 mutex_unlock(&JFS_IP(ip)->commit_mutex); 506 mutex_unlock(&JFS_IP(ip)->commit_mutex);
507 mutex_unlock(&JFS_IP(dip)->commit_mutex);
508 IWRITE_UNLOCK(ip); 508 IWRITE_UNLOCK(ip);
509 goto out1; 509 goto out1;
510 } 510 }
@@ -527,8 +527,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
527 if ((new_size = commitZeroLink(tid, ip)) < 0) { 527 if ((new_size = commitZeroLink(tid, ip)) < 0) {
528 txAbort(tid, 1); /* Marks FS Dirty */ 528 txAbort(tid, 1); /* Marks FS Dirty */
529 txEnd(tid); 529 txEnd(tid);
530 mutex_unlock(&JFS_IP(dip)->commit_mutex);
531 mutex_unlock(&JFS_IP(ip)->commit_mutex); 530 mutex_unlock(&JFS_IP(ip)->commit_mutex);
531 mutex_unlock(&JFS_IP(dip)->commit_mutex);
532 IWRITE_UNLOCK(ip); 532 IWRITE_UNLOCK(ip);
533 rc = new_size; 533 rc = new_size;
534 goto out1; 534 goto out1;
@@ -556,9 +556,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
556 556
557 txEnd(tid); 557 txEnd(tid);
558 558
559 mutex_unlock(&JFS_IP(dip)->commit_mutex);
560 mutex_unlock(&JFS_IP(ip)->commit_mutex); 559 mutex_unlock(&JFS_IP(ip)->commit_mutex);
561 560 mutex_unlock(&JFS_IP(dip)->commit_mutex);
562 561
563 while (new_size && (rc == 0)) { 562 while (new_size && (rc == 0)) {
564 tid = txBegin(dip->i_sb, 0); 563 tid = txBegin(dip->i_sb, 0);
@@ -847,8 +846,8 @@ static int jfs_link(struct dentry *old_dentry,
847 out: 846 out:
848 txEnd(tid); 847 txEnd(tid);
849 848
850 mutex_unlock(&JFS_IP(dir)->commit_mutex);
851 mutex_unlock(&JFS_IP(ip)->commit_mutex); 849 mutex_unlock(&JFS_IP(ip)->commit_mutex);
850 mutex_unlock(&JFS_IP(dir)->commit_mutex);
852 851
853 jfs_info("jfs_link: rc:%d", rc); 852 jfs_info("jfs_link: rc:%d", rc);
854 return rc; 853 return rc;
@@ -1037,8 +1036,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1037 1036
1038 out3: 1037 out3:
1039 txEnd(tid); 1038 txEnd(tid);
1040 mutex_unlock(&JFS_IP(dip)->commit_mutex);
1041 mutex_unlock(&JFS_IP(ip)->commit_mutex); 1039 mutex_unlock(&JFS_IP(ip)->commit_mutex);
1040 mutex_unlock(&JFS_IP(dip)->commit_mutex);
1042 if (rc) { 1041 if (rc) {
1043 free_ea_wmap(ip); 1042 free_ea_wmap(ip);
1044 ip->i_nlink = 0; 1043 ip->i_nlink = 0;
@@ -1160,10 +1159,11 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1160 if (S_ISDIR(new_ip->i_mode)) { 1159 if (S_ISDIR(new_ip->i_mode)) {
1161 new_ip->i_nlink--; 1160 new_ip->i_nlink--;
1162 if (new_ip->i_nlink) { 1161 if (new_ip->i_nlink) {
1163 mutex_unlock(&JFS_IP(new_dir)->commit_mutex); 1162 mutex_unlock(&JFS_IP(new_ip)->commit_mutex);
1164 mutex_unlock(&JFS_IP(old_ip)->commit_mutex);
1165 if (old_dir != new_dir) 1163 if (old_dir != new_dir)
1166 mutex_unlock(&JFS_IP(old_dir)->commit_mutex); 1164 mutex_unlock(&JFS_IP(old_dir)->commit_mutex);
1165 mutex_unlock(&JFS_IP(old_ip)->commit_mutex);
1166 mutex_unlock(&JFS_IP(new_dir)->commit_mutex);
1167 if (!S_ISDIR(old_ip->i_mode) && new_ip) 1167 if (!S_ISDIR(old_ip->i_mode) && new_ip)
1168 IWRITE_UNLOCK(new_ip); 1168 IWRITE_UNLOCK(new_ip);
1169 jfs_error(new_ip->i_sb, 1169 jfs_error(new_ip->i_sb,
@@ -1281,13 +1281,12 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1281 1281
1282 out4: 1282 out4:
1283 txEnd(tid); 1283 txEnd(tid);
1284
1285 mutex_unlock(&JFS_IP(new_dir)->commit_mutex);
1286 mutex_unlock(&JFS_IP(old_ip)->commit_mutex);
1287 if (old_dir != new_dir)
1288 mutex_unlock(&JFS_IP(old_dir)->commit_mutex);
1289 if (new_ip) 1284 if (new_ip)
1290 mutex_unlock(&JFS_IP(new_ip)->commit_mutex); 1285 mutex_unlock(&JFS_IP(new_ip)->commit_mutex);
1286 if (old_dir != new_dir)
1287 mutex_unlock(&JFS_IP(old_dir)->commit_mutex);
1288 mutex_unlock(&JFS_IP(old_ip)->commit_mutex);
1289 mutex_unlock(&JFS_IP(new_dir)->commit_mutex);
1291 1290
1292 while (new_size && (rc == 0)) { 1291 while (new_size && (rc == 0)) {
1293 tid = txBegin(new_ip->i_sb, 0); 1292 tid = txBegin(new_ip->i_sb, 0);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 4f6cfebc82db..143bcd1d5eaa 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -26,6 +26,7 @@
26#include <linux/moduleparam.h> 26#include <linux/moduleparam.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/posix_acl.h> 28#include <linux/posix_acl.h>
29#include <linux/buffer_head.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <linux/seq_file.h> 31#include <linux/seq_file.h>
31 32
@@ -298,7 +299,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
298 break; 299 break;
299 } 300 }
300 301
301#if defined(CONFIG_QUOTA) 302#ifdef CONFIG_QUOTA
302 case Opt_quota: 303 case Opt_quota:
303 case Opt_usrquota: 304 case Opt_usrquota:
304 *flag |= JFS_USRQUOTA; 305 *flag |= JFS_USRQUOTA;
@@ -597,7 +598,7 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
597 if (sbi->flag & JFS_NOINTEGRITY) 598 if (sbi->flag & JFS_NOINTEGRITY)
598 seq_puts(seq, ",nointegrity"); 599 seq_puts(seq, ",nointegrity");
599 600
600#if defined(CONFIG_QUOTA) 601#ifdef CONFIG_QUOTA
601 if (sbi->flag & JFS_USRQUOTA) 602 if (sbi->flag & JFS_USRQUOTA)
602 seq_puts(seq, ",usrquota"); 603 seq_puts(seq, ",usrquota");
603 604
@@ -608,6 +609,113 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
608 return 0; 609 return 0;
609} 610}
610 611
612#ifdef CONFIG_QUOTA
613
614/* Read data from quotafile - avoid pagecache and such because we cannot afford
615 * acquiring the locks... As quota files are never truncated and quota code
616 * itself serializes the operations (and noone else should touch the files)
617 * we don't have to be afraid of races */
618static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data,
619 size_t len, loff_t off)
620{
621 struct inode *inode = sb_dqopt(sb)->files[type];
622 sector_t blk = off >> sb->s_blocksize_bits;
623 int err = 0;
624 int offset = off & (sb->s_blocksize - 1);
625 int tocopy;
626 size_t toread;
627 struct buffer_head tmp_bh;
628 struct buffer_head *bh;
629 loff_t i_size = i_size_read(inode);
630
631 if (off > i_size)
632 return 0;
633 if (off+len > i_size)
634 len = i_size-off;
635 toread = len;
636 while (toread > 0) {
637 tocopy = sb->s_blocksize - offset < toread ?
638 sb->s_blocksize - offset : toread;
639
640 tmp_bh.b_state = 0;
641 tmp_bh.b_size = 1 << inode->i_blkbits;
642 err = jfs_get_block(inode, blk, &tmp_bh, 0);
643 if (err)
644 return err;
645 if (!buffer_mapped(&tmp_bh)) /* A hole? */
646 memset(data, 0, tocopy);
647 else {
648 bh = sb_bread(sb, tmp_bh.b_blocknr);
649 if (!bh)
650 return -EIO;
651 memcpy(data, bh->b_data+offset, tocopy);
652 brelse(bh);
653 }
654 offset = 0;
655 toread -= tocopy;
656 data += tocopy;
657 blk++;
658 }
659 return len;
660}
661
662/* Write to quotafile */
663static ssize_t jfs_quota_write(struct super_block *sb, int type,
664 const char *data, size_t len, loff_t off)
665{
666 struct inode *inode = sb_dqopt(sb)->files[type];
667 sector_t blk = off >> sb->s_blocksize_bits;
668 int err = 0;
669 int offset = off & (sb->s_blocksize - 1);
670 int tocopy;
671 size_t towrite = len;
672 struct buffer_head tmp_bh;
673 struct buffer_head *bh;
674
675 mutex_lock(&inode->i_mutex);
676 while (towrite > 0) {
677 tocopy = sb->s_blocksize - offset < towrite ?
678 sb->s_blocksize - offset : towrite;
679
680 tmp_bh.b_state = 0;
681 tmp_bh.b_size = 1 << inode->i_blkbits;
682 err = jfs_get_block(inode, blk, &tmp_bh, 1);
683 if (err)
684 goto out;
685 if (offset || tocopy != sb->s_blocksize)
686 bh = sb_bread(sb, tmp_bh.b_blocknr);
687 else
688 bh = sb_getblk(sb, tmp_bh.b_blocknr);
689 if (!bh) {
690 err = -EIO;
691 goto out;
692 }
693 lock_buffer(bh);
694 memcpy(bh->b_data+offset, data, tocopy);
695 flush_dcache_page(bh->b_page);
696 set_buffer_uptodate(bh);
697 mark_buffer_dirty(bh);
698 unlock_buffer(bh);
699 brelse(bh);
700 offset = 0;
701 towrite -= tocopy;
702 data += tocopy;
703 blk++;
704 }
705out:
706 if (len == towrite)
707 return err;
708 if (inode->i_size < off+len-towrite)
709 i_size_write(inode, off+len-towrite);
710 inode->i_version++;
711 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
712 mark_inode_dirty(inode);
713 mutex_unlock(&inode->i_mutex);
714 return len - towrite;
715}
716
717#endif
718
611static struct super_operations jfs_super_operations = { 719static struct super_operations jfs_super_operations = {
612 .alloc_inode = jfs_alloc_inode, 720 .alloc_inode = jfs_alloc_inode,
613 .destroy_inode = jfs_destroy_inode, 721 .destroy_inode = jfs_destroy_inode,
@@ -621,7 +729,11 @@ static struct super_operations jfs_super_operations = {
621 .unlockfs = jfs_unlockfs, 729 .unlockfs = jfs_unlockfs,
622 .statfs = jfs_statfs, 730 .statfs = jfs_statfs,
623 .remount_fs = jfs_remount, 731 .remount_fs = jfs_remount,
624 .show_options = jfs_show_options 732 .show_options = jfs_show_options,
733#ifdef CONFIG_QUOTA
734 .quota_read = jfs_quota_read,
735 .quota_write = jfs_quota_write,
736#endif
625}; 737};
626 738
627static struct export_operations jfs_export_operations = { 739static struct export_operations jfs_export_operations = {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 5980c45998cc..89ba0df14c22 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -454,7 +454,7 @@ static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *ho
454 fl->fl_ops = &nlmclnt_lock_ops; 454 fl->fl_ops = &nlmclnt_lock_ops;
455} 455}
456 456
457static void do_vfs_lock(struct file_lock *fl) 457static int do_vfs_lock(struct file_lock *fl)
458{ 458{
459 int res = 0; 459 int res = 0;
460 switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { 460 switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
@@ -467,9 +467,7 @@ static void do_vfs_lock(struct file_lock *fl)
467 default: 467 default:
468 BUG(); 468 BUG();
469 } 469 }
470 if (res < 0) 470 return res;
471 printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n",
472 __FUNCTION__);
473} 471}
474 472
475/* 473/*
@@ -498,6 +496,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
498 struct nlm_host *host = req->a_host; 496 struct nlm_host *host = req->a_host;
499 struct nlm_res *resp = &req->a_res; 497 struct nlm_res *resp = &req->a_res;
500 struct nlm_wait *block = NULL; 498 struct nlm_wait *block = NULL;
499 unsigned char fl_flags = fl->fl_flags;
501 int status = -ENOLCK; 500 int status = -ENOLCK;
502 501
503 if (!host->h_monitored && nsm_monitor(host) < 0) { 502 if (!host->h_monitored && nsm_monitor(host) < 0) {
@@ -505,6 +504,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
505 host->h_name); 504 host->h_name);
506 goto out; 505 goto out;
507 } 506 }
507 fl->fl_flags |= FL_ACCESS;
508 status = do_vfs_lock(fl);
509 if (status < 0)
510 goto out;
508 511
509 block = nlmclnt_prepare_block(host, fl); 512 block = nlmclnt_prepare_block(host, fl);
510again: 513again:
@@ -539,9 +542,10 @@ again:
539 up_read(&host->h_rwsem); 542 up_read(&host->h_rwsem);
540 goto again; 543 goto again;
541 } 544 }
542 fl->fl_flags |= FL_SLEEP;
543 /* Ensure the resulting lock will get added to granted list */ 545 /* Ensure the resulting lock will get added to granted list */
544 do_vfs_lock(fl); 546 fl->fl_flags = fl_flags | FL_SLEEP;
547 if (do_vfs_lock(fl) < 0)
548 printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__);
545 up_read(&host->h_rwsem); 549 up_read(&host->h_rwsem);
546 } 550 }
547 status = nlm_stat_to_errno(resp->status); 551 status = nlm_stat_to_errno(resp->status);
@@ -552,6 +556,7 @@ out_unblock:
552 nlmclnt_cancel(host, req->a_args.block, fl); 556 nlmclnt_cancel(host, req->a_args.block, fl);
553out: 557out:
554 nlm_release_call(req); 558 nlm_release_call(req);
559 fl->fl_flags = fl_flags;
555 return status; 560 return status;
556} 561}
557 562
@@ -606,15 +611,19 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
606{ 611{
607 struct nlm_host *host = req->a_host; 612 struct nlm_host *host = req->a_host;
608 struct nlm_res *resp = &req->a_res; 613 struct nlm_res *resp = &req->a_res;
609 int status; 614 int status = 0;
610 615
611 /* 616 /*
612 * Note: the server is supposed to either grant us the unlock 617 * Note: the server is supposed to either grant us the unlock
613 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either 618 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
614 * case, we want to unlock. 619 * case, we want to unlock.
615 */ 620 */
621 fl->fl_flags |= FL_EXISTS;
616 down_read(&host->h_rwsem); 622 down_read(&host->h_rwsem);
617 do_vfs_lock(fl); 623 if (do_vfs_lock(fl) == -ENOENT) {
624 up_read(&host->h_rwsem);
625 goto out;
626 }
618 up_read(&host->h_rwsem); 627 up_read(&host->h_rwsem);
619 628
620 if (req->a_flags & RPC_TASK_ASYNC) 629 if (req->a_flags & RPC_TASK_ASYNC)
@@ -624,7 +633,6 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
624 if (status < 0) 633 if (status < 0)
625 goto out; 634 goto out;
626 635
627 status = 0;
628 if (resp->status == NLM_LCK_GRANTED) 636 if (resp->status == NLM_LCK_GRANTED)
629 goto out; 637 goto out;
630 638
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index baf5ae513481..c9d419703cf3 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -638,9 +638,6 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
638 if (task->tk_status < 0) { 638 if (task->tk_status < 0) {
639 /* RPC error: Re-insert for retransmission */ 639 /* RPC error: Re-insert for retransmission */
640 timeout = 10 * HZ; 640 timeout = 10 * HZ;
641 } else if (block->b_done) {
642 /* Block already removed, kill it for real */
643 timeout = 0;
644 } else { 641 } else {
645 /* Call was successful, now wait for client callback */ 642 /* Call was successful, now wait for client callback */
646 timeout = 60 * HZ; 643 timeout = 60 * HZ;
@@ -709,13 +706,10 @@ nlmsvc_retry_blocked(void)
709 break; 706 break;
710 if (time_after(block->b_when,jiffies)) 707 if (time_after(block->b_when,jiffies))
711 break; 708 break;
712 dprintk("nlmsvc_retry_blocked(%p, when=%ld, done=%d)\n", 709 dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
713 block, block->b_when, block->b_done); 710 block, block->b_when);
714 kref_get(&block->b_count); 711 kref_get(&block->b_count);
715 if (block->b_done) 712 nlmsvc_grant_blocked(block);
716 nlmsvc_unlink_block(block);
717 else
718 nlmsvc_grant_blocked(block);
719 nlmsvc_release_block(block); 713 nlmsvc_release_block(block);
720 } 714 }
721 715
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 2a4df9b3779a..01b4db9e5466 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -237,19 +237,22 @@ static int
237nlm_traverse_files(struct nlm_host *host, int action) 237nlm_traverse_files(struct nlm_host *host, int action)
238{ 238{
239 struct nlm_file *file, **fp; 239 struct nlm_file *file, **fp;
240 int i; 240 int i, ret = 0;
241 241
242 mutex_lock(&nlm_file_mutex); 242 mutex_lock(&nlm_file_mutex);
243 for (i = 0; i < FILE_NRHASH; i++) { 243 for (i = 0; i < FILE_NRHASH; i++) {
244 fp = nlm_files + i; 244 fp = nlm_files + i;
245 while ((file = *fp) != NULL) { 245 while ((file = *fp) != NULL) {
246 file->f_count++;
247 mutex_unlock(&nlm_file_mutex);
248
246 /* Traverse locks, blocks and shares of this file 249 /* Traverse locks, blocks and shares of this file
247 * and update file->f_locks count */ 250 * and update file->f_locks count */
248 if (nlm_inspect_file(host, file, action)) { 251 if (nlm_inspect_file(host, file, action))
249 mutex_unlock(&nlm_file_mutex); 252 ret = 1;
250 return 1;
251 }
252 253
254 mutex_lock(&nlm_file_mutex);
255 file->f_count--;
253 /* No more references to this file. Let go of it. */ 256 /* No more references to this file. Let go of it. */
254 if (!file->f_blocks && !file->f_locks 257 if (!file->f_blocks && !file->f_locks
255 && !file->f_shares && !file->f_count) { 258 && !file->f_shares && !file->f_count) {
@@ -262,7 +265,7 @@ nlm_traverse_files(struct nlm_host *host, int action)
262 } 265 }
263 } 266 }
264 mutex_unlock(&nlm_file_mutex); 267 mutex_unlock(&nlm_file_mutex);
265 return 0; 268 return ret;
266} 269}
267 270
268/* 271/*
diff --git a/fs/locks.c b/fs/locks.c
index 1ad29c9b6252..d7c53392cac1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -725,6 +725,10 @@ next_task:
725/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks 725/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
726 * at the head of the list, but that's secret knowledge known only to 726 * at the head of the list, but that's secret knowledge known only to
727 * flock_lock_file and posix_lock_file. 727 * flock_lock_file and posix_lock_file.
728 *
729 * Note that if called with an FL_EXISTS argument, the caller may determine
730 * whether or not a lock was successfully freed by testing the return
731 * value for -ENOENT.
728 */ 732 */
729static int flock_lock_file(struct file *filp, struct file_lock *request) 733static int flock_lock_file(struct file *filp, struct file_lock *request)
730{ 734{
@@ -735,6 +739,8 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
735 int found = 0; 739 int found = 0;
736 740
737 lock_kernel(); 741 lock_kernel();
742 if (request->fl_flags & FL_ACCESS)
743 goto find_conflict;
738 for_each_lock(inode, before) { 744 for_each_lock(inode, before) {
739 struct file_lock *fl = *before; 745 struct file_lock *fl = *before;
740 if (IS_POSIX(fl)) 746 if (IS_POSIX(fl))
@@ -750,8 +756,11 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
750 break; 756 break;
751 } 757 }
752 758
753 if (request->fl_type == F_UNLCK) 759 if (request->fl_type == F_UNLCK) {
760 if ((request->fl_flags & FL_EXISTS) && !found)
761 error = -ENOENT;
754 goto out; 762 goto out;
763 }
755 764
756 error = -ENOMEM; 765 error = -ENOMEM;
757 new_fl = locks_alloc_lock(); 766 new_fl = locks_alloc_lock();
@@ -764,6 +773,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
764 if (found) 773 if (found)
765 cond_resched(); 774 cond_resched();
766 775
776find_conflict:
767 for_each_lock(inode, before) { 777 for_each_lock(inode, before) {
768 struct file_lock *fl = *before; 778 struct file_lock *fl = *before;
769 if (IS_POSIX(fl)) 779 if (IS_POSIX(fl))
@@ -777,6 +787,8 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
777 locks_insert_block(fl, request); 787 locks_insert_block(fl, request);
778 goto out; 788 goto out;
779 } 789 }
790 if (request->fl_flags & FL_ACCESS)
791 goto out;
780 locks_copy_lock(new_fl, request); 792 locks_copy_lock(new_fl, request);
781 locks_insert_lock(&inode->i_flock, new_fl); 793 locks_insert_lock(&inode->i_flock, new_fl);
782 new_fl = NULL; 794 new_fl = NULL;
@@ -948,8 +960,11 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
948 960
949 error = 0; 961 error = 0;
950 if (!added) { 962 if (!added) {
951 if (request->fl_type == F_UNLCK) 963 if (request->fl_type == F_UNLCK) {
964 if (request->fl_flags & FL_EXISTS)
965 error = -ENOENT;
952 goto out; 966 goto out;
967 }
953 968
954 if (!new_fl) { 969 if (!new_fl) {
955 error = -ENOLCK; 970 error = -ENOLCK;
@@ -996,6 +1011,10 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
996 * Add a POSIX style lock to a file. 1011 * Add a POSIX style lock to a file.
997 * We merge adjacent & overlapping locks whenever possible. 1012 * We merge adjacent & overlapping locks whenever possible.
998 * POSIX locks are sorted by owner task, then by starting address 1013 * POSIX locks are sorted by owner task, then by starting address
1014 *
1015 * Note that if called with an FL_EXISTS argument, the caller may determine
1016 * whether or not a lock was successfully freed by testing the return
1017 * value for -ENOENT.
999 */ 1018 */
1000int posix_lock_file(struct file *filp, struct file_lock *fl) 1019int posix_lock_file(struct file *filp, struct file_lock *fl)
1001{ 1020{
@@ -1402,8 +1421,9 @@ static int __setlease(struct file *filp, long arg, struct file_lock **flp)
1402 if (!leases_enable) 1421 if (!leases_enable)
1403 goto out; 1422 goto out;
1404 1423
1405 error = lease_alloc(filp, arg, &fl); 1424 error = -ENOMEM;
1406 if (error) 1425 fl = locks_alloc_lock();
1426 if (fl == NULL)
1407 goto out; 1427 goto out;
1408 1428
1409 locks_copy_lock(fl, lease); 1429 locks_copy_lock(fl, lease);
@@ -1411,6 +1431,7 @@ static int __setlease(struct file *filp, long arg, struct file_lock **flp)
1411 locks_insert_lock(before, fl); 1431 locks_insert_lock(before, fl);
1412 1432
1413 *flp = fl; 1433 *flp = fl;
1434 error = 0;
1414out: 1435out:
1415 return error; 1436 return error;
1416} 1437}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 9ea91c5eeb7b..330ff9fc7cf0 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -204,6 +204,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
204 /* 204 /*
205 * Allocate the buffer map to keep the superblock small. 205 * Allocate the buffer map to keep the superblock small.
206 */ 206 */
207 if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
208 goto out_illegal_sb;
207 i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh); 209 i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh);
208 map = kmalloc(i, GFP_KERNEL); 210 map = kmalloc(i, GFP_KERNEL);
209 if (!map) 211 if (!map)
@@ -263,7 +265,7 @@ out_no_root:
263 265
264out_no_bitmap: 266out_no_bitmap:
265 printk("MINIX-fs: bad superblock or unable to read bitmaps\n"); 267 printk("MINIX-fs: bad superblock or unable to read bitmaps\n");
266 out_freemap: 268out_freemap:
267 for (i = 0; i < sbi->s_imap_blocks; i++) 269 for (i = 0; i < sbi->s_imap_blocks; i++)
268 brelse(sbi->s_imap[i]); 270 brelse(sbi->s_imap[i]);
269 for (i = 0; i < sbi->s_zmap_blocks; i++) 271 for (i = 0; i < sbi->s_zmap_blocks; i++)
@@ -276,11 +278,16 @@ out_no_map:
276 printk("MINIX-fs: can't allocate map\n"); 278 printk("MINIX-fs: can't allocate map\n");
277 goto out_release; 279 goto out_release;
278 280
281out_illegal_sb:
282 if (!silent)
283 printk("MINIX-fs: bad superblock\n");
284 goto out_release;
285
279out_no_fs: 286out_no_fs:
280 if (!silent) 287 if (!silent)
281 printk("VFS: Can't find a Minix or Minix V2 filesystem " 288 printk("VFS: Can't find a Minix or Minix V2 filesystem "
282 "on device %s\n", s->s_id); 289 "on device %s\n", s->s_id);
283 out_release: 290out_release:
284 brelse(bh); 291 brelse(bh);
285 goto out; 292 goto out;
286 293
@@ -290,7 +297,7 @@ out_bad_hblock:
290 297
291out_bad_sb: 298out_bad_sb:
292 printk("MINIX-fs: unable to read superblock\n"); 299 printk("MINIX-fs: unable to read superblock\n");
293 out: 300out:
294 s->s_fs_info = NULL; 301 s->s_fs_info = NULL;
295 kfree(sbi); 302 kfree(sbi);
296 return -EINVAL; 303 return -EINVAL;
diff --git a/fs/namei.c b/fs/namei.c
index c784e8bb57a3..432d6bc6fab0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -159,7 +159,7 @@ char * getname(const char __user * filename)
159#ifdef CONFIG_AUDITSYSCALL 159#ifdef CONFIG_AUDITSYSCALL
160void putname(const char *name) 160void putname(const char *name)
161{ 161{
162 if (unlikely(current->audit_context)) 162 if (unlikely(!audit_dummy_context()))
163 audit_putname(name); 163 audit_putname(name);
164 else 164 else
165 __putname(name); 165 __putname(name);
@@ -227,10 +227,10 @@ int generic_permission(struct inode *inode, int mask,
227 227
228int permission(struct inode *inode, int mask, struct nameidata *nd) 228int permission(struct inode *inode, int mask, struct nameidata *nd)
229{ 229{
230 umode_t mode = inode->i_mode;
230 int retval, submask; 231 int retval, submask;
231 232
232 if (mask & MAY_WRITE) { 233 if (mask & MAY_WRITE) {
233 umode_t mode = inode->i_mode;
234 234
235 /* 235 /*
236 * Nobody gets write access to a read-only fs. 236 * Nobody gets write access to a read-only fs.
@@ -247,6 +247,13 @@ int permission(struct inode *inode, int mask, struct nameidata *nd)
247 } 247 }
248 248
249 249
250 /*
251 * MAY_EXEC on regular files requires special handling: We override
252 * filesystem execute permissions if the mode bits aren't set.
253 */
254 if ((mask & MAY_EXEC) && S_ISREG(mode) && !(mode & S_IXUGO))
255 return -EACCES;
256
250 /* Ordinary permission routines do not understand MAY_APPEND. */ 257 /* Ordinary permission routines do not understand MAY_APPEND. */
251 submask = mask & ~MAY_APPEND; 258 submask = mask & ~MAY_APPEND;
252 if (inode->i_op && inode->i_op->permission) 259 if (inode->i_op && inode->i_op->permission)
@@ -1125,7 +1132,7 @@ static int fastcall do_path_lookup(int dfd, const char *name,
1125 retval = link_path_walk(name, nd); 1132 retval = link_path_walk(name, nd);
1126out: 1133out:
1127 if (likely(retval == 0)) { 1134 if (likely(retval == 0)) {
1128 if (unlikely(current->audit_context && nd && nd->dentry && 1135 if (unlikely(!audit_dummy_context() && nd && nd->dentry &&
1129 nd->dentry->d_inode)) 1136 nd->dentry->d_inode))
1130 audit_inode(name, nd->dentry->d_inode); 1137 audit_inode(name, nd->dentry->d_inode);
1131 } 1138 }
@@ -1357,7 +1364,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1357 return -ENOENT; 1364 return -ENOENT;
1358 1365
1359 BUG_ON(victim->d_parent->d_inode != dir); 1366 BUG_ON(victim->d_parent->d_inode != dir);
1360 audit_inode_child(victim->d_name.name, victim->d_inode, dir->i_ino); 1367 audit_inode_child(victim->d_name.name, victim->d_inode, dir);
1361 1368
1362 error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); 1369 error = permission(dir,MAY_WRITE | MAY_EXEC, NULL);
1363 if (error) 1370 if (error)
@@ -1423,7 +1430,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1423 struct dentry *p; 1430 struct dentry *p;
1424 1431
1425 if (p1 == p2) { 1432 if (p1 == p2) {
1426 mutex_lock(&p1->d_inode->i_mutex); 1433 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1427 return NULL; 1434 return NULL;
1428 } 1435 }
1429 1436
@@ -1431,22 +1438,22 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1431 1438
1432 for (p = p1; p->d_parent != p; p = p->d_parent) { 1439 for (p = p1; p->d_parent != p; p = p->d_parent) {
1433 if (p->d_parent == p2) { 1440 if (p->d_parent == p2) {
1434 mutex_lock(&p2->d_inode->i_mutex); 1441 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
1435 mutex_lock(&p1->d_inode->i_mutex); 1442 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
1436 return p; 1443 return p;
1437 } 1444 }
1438 } 1445 }
1439 1446
1440 for (p = p2; p->d_parent != p; p = p->d_parent) { 1447 for (p = p2; p->d_parent != p; p = p->d_parent) {
1441 if (p->d_parent == p1) { 1448 if (p->d_parent == p1) {
1442 mutex_lock(&p1->d_inode->i_mutex); 1449 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1443 mutex_lock(&p2->d_inode->i_mutex); 1450 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1444 return p; 1451 return p;
1445 } 1452 }
1446 } 1453 }
1447 1454
1448 mutex_lock(&p1->d_inode->i_mutex); 1455 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1449 mutex_lock(&p2->d_inode->i_mutex); 1456 mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1450 return NULL; 1457 return NULL;
1451} 1458}
1452 1459
@@ -1659,6 +1666,7 @@ do_last:
1659 * It already exists. 1666 * It already exists.
1660 */ 1667 */
1661 mutex_unlock(&dir->d_inode->i_mutex); 1668 mutex_unlock(&dir->d_inode->i_mutex);
1669 audit_inode_update(path.dentry->d_inode);
1662 1670
1663 error = -EEXIST; 1671 error = -EEXIST;
1664 if (flag & O_EXCL) 1672 if (flag & O_EXCL)
@@ -1669,6 +1677,7 @@ do_last:
1669 if (flag & O_NOFOLLOW) 1677 if (flag & O_NOFOLLOW)
1670 goto exit_dput; 1678 goto exit_dput;
1671 } 1679 }
1680
1672 error = -ENOENT; 1681 error = -ENOENT;
1673 if (!path.dentry->d_inode) 1682 if (!path.dentry->d_inode)
1674 goto exit_dput; 1683 goto exit_dput;
@@ -1712,8 +1721,14 @@ do_link:
1712 if (error) 1721 if (error)
1713 goto exit_dput; 1722 goto exit_dput;
1714 error = __do_follow_link(&path, nd); 1723 error = __do_follow_link(&path, nd);
1715 if (error) 1724 if (error) {
1725 /* Does someone understand code flow here? Or it is only
1726 * me so stupid? Anathema to whoever designed this non-sense
1727 * with "intent.open".
1728 */
1729 release_open_intent(nd);
1716 return error; 1730 return error;
1731 }
1717 nd->flags &= ~LOOKUP_PARENT; 1732 nd->flags &= ~LOOKUP_PARENT;
1718 if (nd->last_type == LAST_BIND) 1733 if (nd->last_type == LAST_BIND)
1719 goto ok; 1734 goto ok;
@@ -1751,7 +1766,7 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir)
1751{ 1766{
1752 struct dentry *dentry = ERR_PTR(-EEXIST); 1767 struct dentry *dentry = ERR_PTR(-EEXIST);
1753 1768
1754 mutex_lock(&nd->dentry->d_inode->i_mutex); 1769 mutex_lock_nested(&nd->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
1755 /* 1770 /*
1756 * Yucky last component or no last component at all? 1771 * Yucky last component or no last component at all?
1757 * (foo/., foo/.., /////) 1772 * (foo/., foo/.., /////)
@@ -1759,6 +1774,8 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir)
1759 if (nd->last_type != LAST_NORM) 1774 if (nd->last_type != LAST_NORM)
1760 goto fail; 1775 goto fail;
1761 nd->flags &= ~LOOKUP_PARENT; 1776 nd->flags &= ~LOOKUP_PARENT;
1777 nd->flags |= LOOKUP_CREATE;
1778 nd->intent.open.flags = O_EXCL;
1762 1779
1763 /* 1780 /*
1764 * Do the final lookup. 1781 * Do the final lookup.
@@ -2008,7 +2025,7 @@ static long do_rmdir(int dfd, const char __user *pathname)
2008 error = -EBUSY; 2025 error = -EBUSY;
2009 goto exit1; 2026 goto exit1;
2010 } 2027 }
2011 mutex_lock(&nd.dentry->d_inode->i_mutex); 2028 mutex_lock_nested(&nd.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2012 dentry = lookup_hash(&nd); 2029 dentry = lookup_hash(&nd);
2013 error = PTR_ERR(dentry); 2030 error = PTR_ERR(dentry);
2014 if (!IS_ERR(dentry)) { 2031 if (!IS_ERR(dentry)) {
@@ -2082,7 +2099,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
2082 error = -EISDIR; 2099 error = -EISDIR;
2083 if (nd.last_type != LAST_NORM) 2100 if (nd.last_type != LAST_NORM)
2084 goto exit1; 2101 goto exit1;
2085 mutex_lock(&nd.dentry->d_inode->i_mutex); 2102 mutex_lock_nested(&nd.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2086 dentry = lookup_hash(&nd); 2103 dentry = lookup_hash(&nd);
2087 error = PTR_ERR(dentry); 2104 error = PTR_ERR(dentry);
2088 if (!IS_ERR(dentry)) { 2105 if (!IS_ERR(dentry)) {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3ddda6f7ecc2..e7ffb4deb3e5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -690,7 +690,9 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
690 goto out_force; 690 goto out_force;
691 /* This is an open(2) */ 691 /* This is an open(2) */
692 if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) != 0 && 692 if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) != 0 &&
693 !(server->flags & NFS_MOUNT_NOCTO)) 693 !(server->flags & NFS_MOUNT_NOCTO) &&
694 (S_ISREG(inode->i_mode) ||
695 S_ISDIR(inode->i_mode)))
694 goto out_force; 696 goto out_force;
695 } 697 }
696 return nfs_revalidate_inode(server, inode); 698 return nfs_revalidate_inode(server, inode);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4cdd1b499e35..76ca1cbc38f9 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -67,25 +67,19 @@ struct nfs_direct_req {
67 struct kref kref; /* release manager */ 67 struct kref kref; /* release manager */
68 68
69 /* I/O parameters */ 69 /* I/O parameters */
70 struct list_head list, /* nfs_read/write_data structs */
71 rewrite_list; /* saved nfs_write_data structs */
72 struct nfs_open_context *ctx; /* file open context info */ 70 struct nfs_open_context *ctx; /* file open context info */
73 struct kiocb * iocb; /* controlling i/o request */ 71 struct kiocb * iocb; /* controlling i/o request */
74 struct inode * inode; /* target file of i/o */ 72 struct inode * inode; /* target file of i/o */
75 unsigned long user_addr; /* location of user's buffer */
76 size_t user_count; /* total bytes to move */
77 loff_t pos; /* starting offset in file */
78 struct page ** pages; /* pages in our buffer */
79 unsigned int npages; /* count of pages */
80 73
81 /* completion state */ 74 /* completion state */
75 atomic_t io_count; /* i/os we're waiting for */
82 spinlock_t lock; /* protect completion state */ 76 spinlock_t lock; /* protect completion state */
83 int outstanding; /* i/os we're waiting for */
84 ssize_t count, /* bytes actually processed */ 77 ssize_t count, /* bytes actually processed */
85 error; /* any reported error */ 78 error; /* any reported error */
86 struct completion completion; /* wait for i/o completion */ 79 struct completion completion; /* wait for i/o completion */
87 80
88 /* commit state */ 81 /* commit state */
82 struct list_head rewrite_list; /* saved nfs_write_data structs */
89 struct nfs_write_data * commit_data; /* special write_data for commits */ 83 struct nfs_write_data * commit_data; /* special write_data for commits */
90 int flags; 84 int flags;
91#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ 85#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
@@ -93,8 +87,18 @@ struct nfs_direct_req {
93 struct nfs_writeverf verf; /* unstable write verifier */ 87 struct nfs_writeverf verf; /* unstable write verifier */
94}; 88};
95 89
96static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
97static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); 90static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
91static const struct rpc_call_ops nfs_write_direct_ops;
92
93static inline void get_dreq(struct nfs_direct_req *dreq)
94{
95 atomic_inc(&dreq->io_count);
96}
97
98static inline int put_dreq(struct nfs_direct_req *dreq)
99{
100 return atomic_dec_and_test(&dreq->io_count);
101}
98 102
99/** 103/**
100 * nfs_direct_IO - NFS address space operation for direct I/O 104 * nfs_direct_IO - NFS address space operation for direct I/O
@@ -118,50 +122,21 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_
118 return -EINVAL; 122 return -EINVAL;
119} 123}
120 124
121static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty) 125static void nfs_direct_dirty_pages(struct page **pages, int npages)
122{ 126{
123 int i; 127 int i;
124 for (i = 0; i < npages; i++) { 128 for (i = 0; i < npages; i++) {
125 struct page *page = pages[i]; 129 struct page *page = pages[i];
126 if (do_dirty && !PageCompound(page)) 130 if (!PageCompound(page))
127 set_page_dirty_lock(page); 131 set_page_dirty_lock(page);
128 page_cache_release(page);
129 } 132 }
130 kfree(pages);
131} 133}
132 134
133static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages) 135static void nfs_direct_release_pages(struct page **pages, int npages)
134{ 136{
135 int result = -ENOMEM; 137 int i;
136 unsigned long page_count; 138 for (i = 0; i < npages; i++)
137 size_t array_size; 139 page_cache_release(pages[i]);
138
139 page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
140 page_count -= user_addr >> PAGE_SHIFT;
141
142 array_size = (page_count * sizeof(struct page *));
143 *pages = kmalloc(array_size, GFP_KERNEL);
144 if (*pages) {
145 down_read(&current->mm->mmap_sem);
146 result = get_user_pages(current, current->mm, user_addr,
147 page_count, (rw == READ), 0,
148 *pages, NULL);
149 up_read(&current->mm->mmap_sem);
150 if (result != page_count) {
151 /*
152 * If we got fewer pages than expected from
153 * get_user_pages(), the user buffer runs off the
154 * end of a mapping; return EFAULT.
155 */
156 if (result >= 0) {
157 nfs_free_user_pages(*pages, result, 0);
158 result = -EFAULT;
159 } else
160 kfree(*pages);
161 *pages = NULL;
162 }
163 }
164 return result;
165} 140}
166 141
167static inline struct nfs_direct_req *nfs_direct_req_alloc(void) 142static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
@@ -173,13 +148,13 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
173 return NULL; 148 return NULL;
174 149
175 kref_init(&dreq->kref); 150 kref_init(&dreq->kref);
151 kref_get(&dreq->kref);
176 init_completion(&dreq->completion); 152 init_completion(&dreq->completion);
177 INIT_LIST_HEAD(&dreq->list);
178 INIT_LIST_HEAD(&dreq->rewrite_list); 153 INIT_LIST_HEAD(&dreq->rewrite_list);
179 dreq->iocb = NULL; 154 dreq->iocb = NULL;
180 dreq->ctx = NULL; 155 dreq->ctx = NULL;
181 spin_lock_init(&dreq->lock); 156 spin_lock_init(&dreq->lock);
182 dreq->outstanding = 0; 157 atomic_set(&dreq->io_count, 0);
183 dreq->count = 0; 158 dreq->count = 0;
184 dreq->error = 0; 159 dreq->error = 0;
185 dreq->flags = 0; 160 dreq->flags = 0;
@@ -220,18 +195,11 @@ out:
220} 195}
221 196
222/* 197/*
223 * We must hold a reference to all the pages in this direct read request 198 * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
224 * until the RPCs complete. This could be long *after* we are woken up in 199 * the iocb is still valid here if this is a synchronous request.
225 * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
226 *
227 * In addition, synchronous I/O uses a stack-allocated iocb. Thus we
228 * can't trust the iocb is still valid here if this is a synchronous
229 * request. If the waiter is woken prematurely, the iocb is long gone.
230 */ 200 */
231static void nfs_direct_complete(struct nfs_direct_req *dreq) 201static void nfs_direct_complete(struct nfs_direct_req *dreq)
232{ 202{
233 nfs_free_user_pages(dreq->pages, dreq->npages, 1);
234
235 if (dreq->iocb) { 203 if (dreq->iocb) {
236 long res = (long) dreq->error; 204 long res = (long) dreq->error;
237 if (!res) 205 if (!res)
@@ -244,48 +212,10 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
244} 212}
245 213
246/* 214/*
247 * Note we also set the number of requests we have in the dreq when we are 215 * We must hold a reference to all the pages in this direct read request
248 * done. This prevents races with I/O completion so we will always wait 216 * until the RPCs complete. This could be long *after* we are woken up in
249 * until all requests have been dispatched and completed. 217 * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
250 */ 218 */
251static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
252{
253 struct list_head *list;
254 struct nfs_direct_req *dreq;
255 unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
256
257 dreq = nfs_direct_req_alloc();
258 if (!dreq)
259 return NULL;
260
261 list = &dreq->list;
262 for(;;) {
263 struct nfs_read_data *data = nfs_readdata_alloc(rpages);
264
265 if (unlikely(!data)) {
266 while (!list_empty(list)) {
267 data = list_entry(list->next,
268 struct nfs_read_data, pages);
269 list_del(&data->pages);
270 nfs_readdata_free(data);
271 }
272 kref_put(&dreq->kref, nfs_direct_req_release);
273 return NULL;
274 }
275
276 INIT_LIST_HEAD(&data->pages);
277 list_add(&data->pages, list);
278
279 data->req = (struct nfs_page *) dreq;
280 dreq->outstanding++;
281 if (nbytes <= rsize)
282 break;
283 nbytes -= rsize;
284 }
285 kref_get(&dreq->kref);
286 return dreq;
287}
288
289static void nfs_direct_read_result(struct rpc_task *task, void *calldata) 219static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
290{ 220{
291 struct nfs_read_data *data = calldata; 221 struct nfs_read_data *data = calldata;
@@ -294,6 +224,9 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
294 if (nfs_readpage_result(task, data) != 0) 224 if (nfs_readpage_result(task, data) != 0)
295 return; 225 return;
296 226
227 nfs_direct_dirty_pages(data->pagevec, data->npages);
228 nfs_direct_release_pages(data->pagevec, data->npages);
229
297 spin_lock(&dreq->lock); 230 spin_lock(&dreq->lock);
298 231
299 if (likely(task->tk_status >= 0)) 232 if (likely(task->tk_status >= 0))
@@ -301,13 +234,10 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
301 else 234 else
302 dreq->error = task->tk_status; 235 dreq->error = task->tk_status;
303 236
304 if (--dreq->outstanding) {
305 spin_unlock(&dreq->lock);
306 return;
307 }
308
309 spin_unlock(&dreq->lock); 237 spin_unlock(&dreq->lock);
310 nfs_direct_complete(dreq); 238
239 if (put_dreq(dreq))
240 nfs_direct_complete(dreq);
311} 241}
312 242
313static const struct rpc_call_ops nfs_read_direct_ops = { 243static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -316,41 +246,56 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
316}; 246};
317 247
318/* 248/*
319 * For each nfs_read_data struct that was allocated on the list, dispatch 249 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
320 * an NFS READ operation 250 * operation. If nfs_readdata_alloc() or get_user_pages() fails,
251 * bail and stop sending more reads. Read length accounting is
252 * handled automatically by nfs_direct_read_result(). Otherwise, if
253 * no requests have been sent, just return an error.
321 */ 254 */
322static void nfs_direct_read_schedule(struct nfs_direct_req *dreq) 255static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
323{ 256{
324 struct nfs_open_context *ctx = dreq->ctx; 257 struct nfs_open_context *ctx = dreq->ctx;
325 struct inode *inode = ctx->dentry->d_inode; 258 struct inode *inode = ctx->dentry->d_inode;
326 struct list_head *list = &dreq->list;
327 struct page **pages = dreq->pages;
328 size_t count = dreq->user_count;
329 loff_t pos = dreq->pos;
330 size_t rsize = NFS_SERVER(inode)->rsize; 259 size_t rsize = NFS_SERVER(inode)->rsize;
331 unsigned int curpage, pgbase; 260 unsigned int pgbase;
261 int result;
262 ssize_t started = 0;
263
264 get_dreq(dreq);
332 265
333 curpage = 0;
334 pgbase = dreq->user_addr & ~PAGE_MASK;
335 do { 266 do {
336 struct nfs_read_data *data; 267 struct nfs_read_data *data;
337 size_t bytes; 268 size_t bytes;
338 269
339 bytes = rsize; 270 pgbase = user_addr & ~PAGE_MASK;
340 if (count < rsize) 271 bytes = min(rsize,count);
341 bytes = count;
342 272
343 BUG_ON(list_empty(list)); 273 result = -ENOMEM;
344 data = list_entry(list->next, struct nfs_read_data, pages); 274 data = nfs_readdata_alloc(pgbase + bytes);
345 list_del_init(&data->pages); 275 if (unlikely(!data))
276 break;
277
278 down_read(&current->mm->mmap_sem);
279 result = get_user_pages(current, current->mm, user_addr,
280 data->npages, 1, 0, data->pagevec, NULL);
281 up_read(&current->mm->mmap_sem);
282 if (unlikely(result < data->npages)) {
283 if (result > 0)
284 nfs_direct_release_pages(data->pagevec, result);
285 nfs_readdata_release(data);
286 break;
287 }
288
289 get_dreq(dreq);
346 290
291 data->req = (struct nfs_page *) dreq;
347 data->inode = inode; 292 data->inode = inode;
348 data->cred = ctx->cred; 293 data->cred = ctx->cred;
349 data->args.fh = NFS_FH(inode); 294 data->args.fh = NFS_FH(inode);
350 data->args.context = ctx; 295 data->args.context = ctx;
351 data->args.offset = pos; 296 data->args.offset = pos;
352 data->args.pgbase = pgbase; 297 data->args.pgbase = pgbase;
353 data->args.pages = &pages[curpage]; 298 data->args.pages = data->pagevec;
354 data->args.count = bytes; 299 data->args.count = bytes;
355 data->res.fattr = &data->fattr; 300 data->res.fattr = &data->fattr;
356 data->res.eof = 0; 301 data->res.eof = 0;
@@ -373,33 +318,37 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
373 bytes, 318 bytes,
374 (unsigned long long)data->args.offset); 319 (unsigned long long)data->args.offset);
375 320
321 started += bytes;
322 user_addr += bytes;
376 pos += bytes; 323 pos += bytes;
324 /* FIXME: Remove this unnecessary math from final patch */
377 pgbase += bytes; 325 pgbase += bytes;
378 curpage += pgbase >> PAGE_SHIFT;
379 pgbase &= ~PAGE_MASK; 326 pgbase &= ~PAGE_MASK;
327 BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
380 328
381 count -= bytes; 329 count -= bytes;
382 } while (count != 0); 330 } while (count != 0);
383 BUG_ON(!list_empty(list)); 331
332 if (put_dreq(dreq))
333 nfs_direct_complete(dreq);
334
335 if (started)
336 return 0;
337 return result < 0 ? (ssize_t) result : -EFAULT;
384} 338}
385 339
386static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages) 340static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
387{ 341{
388 ssize_t result; 342 ssize_t result = 0;
389 sigset_t oldset; 343 sigset_t oldset;
390 struct inode *inode = iocb->ki_filp->f_mapping->host; 344 struct inode *inode = iocb->ki_filp->f_mapping->host;
391 struct rpc_clnt *clnt = NFS_CLIENT(inode); 345 struct rpc_clnt *clnt = NFS_CLIENT(inode);
392 struct nfs_direct_req *dreq; 346 struct nfs_direct_req *dreq;
393 347
394 dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize); 348 dreq = nfs_direct_req_alloc();
395 if (!dreq) 349 if (!dreq)
396 return -ENOMEM; 350 return -ENOMEM;
397 351
398 dreq->user_addr = user_addr;
399 dreq->user_count = count;
400 dreq->pos = pos;
401 dreq->pages = pages;
402 dreq->npages = nr_pages;
403 dreq->inode = inode; 352 dreq->inode = inode;
404 dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); 353 dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
405 if (!is_sync_kiocb(iocb)) 354 if (!is_sync_kiocb(iocb))
@@ -407,8 +356,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
407 356
408 nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count); 357 nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
409 rpc_clnt_sigmask(clnt, &oldset); 358 rpc_clnt_sigmask(clnt, &oldset);
410 nfs_direct_read_schedule(dreq); 359 result = nfs_direct_read_schedule(dreq, user_addr, count, pos);
411 result = nfs_direct_wait(dreq); 360 if (!result)
361 result = nfs_direct_wait(dreq);
412 rpc_clnt_sigunmask(clnt, &oldset); 362 rpc_clnt_sigunmask(clnt, &oldset);
413 363
414 return result; 364 return result;
@@ -416,10 +366,10 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
416 366
417static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) 367static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
418{ 368{
419 list_splice_init(&dreq->rewrite_list, &dreq->list); 369 while (!list_empty(&dreq->rewrite_list)) {
420 while (!list_empty(&dreq->list)) { 370 struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
421 struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
422 list_del(&data->pages); 371 list_del(&data->pages);
372 nfs_direct_release_pages(data->pagevec, data->npages);
423 nfs_writedata_release(data); 373 nfs_writedata_release(data);
424 } 374 }
425} 375}
@@ -427,14 +377,51 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
427#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 377#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
428static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 378static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
429{ 379{
430 struct list_head *pos; 380 struct inode *inode = dreq->inode;
381 struct list_head *p;
382 struct nfs_write_data *data;
431 383
432 list_splice_init(&dreq->rewrite_list, &dreq->list);
433 list_for_each(pos, &dreq->list)
434 dreq->outstanding++;
435 dreq->count = 0; 384 dreq->count = 0;
385 get_dreq(dreq);
386
387 list_for_each(p, &dreq->rewrite_list) {
388 data = list_entry(p, struct nfs_write_data, pages);
389
390 get_dreq(dreq);
436 391
437 nfs_direct_write_schedule(dreq, FLUSH_STABLE); 392 /*
393 * Reset data->res.
394 */
395 nfs_fattr_init(&data->fattr);
396 data->res.count = data->args.count;
397 memset(&data->verf, 0, sizeof(data->verf));
398
399 /*
400 * Reuse data->task; data->args should not have changed
401 * since the original request was sent.
402 */
403 rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
404 &nfs_write_direct_ops, data);
405 NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
406
407 data->task.tk_priority = RPC_PRIORITY_NORMAL;
408 data->task.tk_cookie = (unsigned long) inode;
409
410 /*
411 * We're called via an RPC callback, so BKL is already held.
412 */
413 rpc_execute(&data->task);
414
415 dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
416 data->task.tk_pid,
417 inode->i_sb->s_id,
418 (long long)NFS_FILEID(inode),
419 data->args.count,
420 (unsigned long long)data->args.offset);
421 }
422
423 if (put_dreq(dreq))
424 nfs_direct_write_complete(dreq, inode);
438} 425}
439 426
440static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) 427static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
@@ -471,8 +458,8 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
471 data->cred = dreq->ctx->cred; 458 data->cred = dreq->ctx->cred;
472 459
473 data->args.fh = NFS_FH(data->inode); 460 data->args.fh = NFS_FH(data->inode);
474 data->args.offset = dreq->pos; 461 data->args.offset = 0;
475 data->args.count = dreq->user_count; 462 data->args.count = 0;
476 data->res.count = 0; 463 data->res.count = 0;
477 data->res.fattr = &data->fattr; 464 data->res.fattr = &data->fattr;
478 data->res.verf = &data->verf; 465 data->res.verf = &data->verf;
@@ -516,7 +503,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
516 503
517static void nfs_alloc_commit_data(struct nfs_direct_req *dreq) 504static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
518{ 505{
519 dreq->commit_data = nfs_commit_alloc(0); 506 dreq->commit_data = nfs_commit_alloc();
520 if (dreq->commit_data != NULL) 507 if (dreq->commit_data != NULL)
521 dreq->commit_data->req = (struct nfs_page *) dreq; 508 dreq->commit_data->req = (struct nfs_page *) dreq;
522} 509}
@@ -534,47 +521,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
534} 521}
535#endif 522#endif
536 523
537static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
538{
539 struct list_head *list;
540 struct nfs_direct_req *dreq;
541 unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
542
543 dreq = nfs_direct_req_alloc();
544 if (!dreq)
545 return NULL;
546
547 list = &dreq->list;
548 for(;;) {
549 struct nfs_write_data *data = nfs_writedata_alloc(wpages);
550
551 if (unlikely(!data)) {
552 while (!list_empty(list)) {
553 data = list_entry(list->next,
554 struct nfs_write_data, pages);
555 list_del(&data->pages);
556 nfs_writedata_free(data);
557 }
558 kref_put(&dreq->kref, nfs_direct_req_release);
559 return NULL;
560 }
561
562 INIT_LIST_HEAD(&data->pages);
563 list_add(&data->pages, list);
564
565 data->req = (struct nfs_page *) dreq;
566 dreq->outstanding++;
567 if (nbytes <= wsize)
568 break;
569 nbytes -= wsize;
570 }
571
572 nfs_alloc_commit_data(dreq);
573
574 kref_get(&dreq->kref);
575 return dreq;
576}
577
578static void nfs_direct_write_result(struct rpc_task *task, void *calldata) 524static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
579{ 525{
580 struct nfs_write_data *data = calldata; 526 struct nfs_write_data *data = calldata;
@@ -604,8 +550,6 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
604 } 550 }
605 } 551 }
606 } 552 }
607 /* In case we have to resend */
608 data->args.stable = NFS_FILE_SYNC;
609 553
610 spin_unlock(&dreq->lock); 554 spin_unlock(&dreq->lock);
611} 555}
@@ -619,14 +563,8 @@ static void nfs_direct_write_release(void *calldata)
619 struct nfs_write_data *data = calldata; 563 struct nfs_write_data *data = calldata;
620 struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; 564 struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
621 565
622 spin_lock(&dreq->lock); 566 if (put_dreq(dreq))
623 if (--dreq->outstanding) { 567 nfs_direct_write_complete(dreq, data->inode);
624 spin_unlock(&dreq->lock);
625 return;
626 }
627 spin_unlock(&dreq->lock);
628
629 nfs_direct_write_complete(dreq, data->inode);
630} 568}
631 569
632static const struct rpc_call_ops nfs_write_direct_ops = { 570static const struct rpc_call_ops nfs_write_direct_ops = {
@@ -635,41 +573,58 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
635}; 573};
636 574
637/* 575/*
638 * For each nfs_write_data struct that was allocated on the list, dispatch 576 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
639 * an NFS WRITE operation 577 * operation. If nfs_writedata_alloc() or get_user_pages() fails,
578 * bail and stop sending more writes. Write length accounting is
579 * handled automatically by nfs_direct_write_result(). Otherwise, if
580 * no requests have been sent, just return an error.
640 */ 581 */
641static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync) 582static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
642{ 583{
643 struct nfs_open_context *ctx = dreq->ctx; 584 struct nfs_open_context *ctx = dreq->ctx;
644 struct inode *inode = ctx->dentry->d_inode; 585 struct inode *inode = ctx->dentry->d_inode;
645 struct list_head *list = &dreq->list;
646 struct page **pages = dreq->pages;
647 size_t count = dreq->user_count;
648 loff_t pos = dreq->pos;
649 size_t wsize = NFS_SERVER(inode)->wsize; 586 size_t wsize = NFS_SERVER(inode)->wsize;
650 unsigned int curpage, pgbase; 587 unsigned int pgbase;
588 int result;
589 ssize_t started = 0;
590
591 get_dreq(dreq);
651 592
652 curpage = 0;
653 pgbase = dreq->user_addr & ~PAGE_MASK;
654 do { 593 do {
655 struct nfs_write_data *data; 594 struct nfs_write_data *data;
656 size_t bytes; 595 size_t bytes;
657 596
658 bytes = wsize; 597 pgbase = user_addr & ~PAGE_MASK;
659 if (count < wsize) 598 bytes = min(wsize,count);
660 bytes = count; 599
600 result = -ENOMEM;
601 data = nfs_writedata_alloc(pgbase + bytes);
602 if (unlikely(!data))
603 break;
604
605 down_read(&current->mm->mmap_sem);
606 result = get_user_pages(current, current->mm, user_addr,
607 data->npages, 0, 0, data->pagevec, NULL);
608 up_read(&current->mm->mmap_sem);
609 if (unlikely(result < data->npages)) {
610 if (result > 0)
611 nfs_direct_release_pages(data->pagevec, result);
612 nfs_writedata_release(data);
613 break;
614 }
615
616 get_dreq(dreq);
661 617
662 BUG_ON(list_empty(list));
663 data = list_entry(list->next, struct nfs_write_data, pages);
664 list_move_tail(&data->pages, &dreq->rewrite_list); 618 list_move_tail(&data->pages, &dreq->rewrite_list);
665 619
620 data->req = (struct nfs_page *) dreq;
666 data->inode = inode; 621 data->inode = inode;
667 data->cred = ctx->cred; 622 data->cred = ctx->cred;
668 data->args.fh = NFS_FH(inode); 623 data->args.fh = NFS_FH(inode);
669 data->args.context = ctx; 624 data->args.context = ctx;
670 data->args.offset = pos; 625 data->args.offset = pos;
671 data->args.pgbase = pgbase; 626 data->args.pgbase = pgbase;
672 data->args.pages = &pages[curpage]; 627 data->args.pages = data->pagevec;
673 data->args.count = bytes; 628 data->args.count = bytes;
674 data->res.fattr = &data->fattr; 629 data->res.fattr = &data->fattr;
675 data->res.count = bytes; 630 data->res.count = bytes;
@@ -693,19 +648,29 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
693 bytes, 648 bytes,
694 (unsigned long long)data->args.offset); 649 (unsigned long long)data->args.offset);
695 650
651 started += bytes;
652 user_addr += bytes;
696 pos += bytes; 653 pos += bytes;
654
655 /* FIXME: Remove this useless math from the final patch */
697 pgbase += bytes; 656 pgbase += bytes;
698 curpage += pgbase >> PAGE_SHIFT;
699 pgbase &= ~PAGE_MASK; 657 pgbase &= ~PAGE_MASK;
658 BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
700 659
701 count -= bytes; 660 count -= bytes;
702 } while (count != 0); 661 } while (count != 0);
703 BUG_ON(!list_empty(list)); 662
663 if (put_dreq(dreq))
664 nfs_direct_write_complete(dreq, inode);
665
666 if (started)
667 return 0;
668 return result < 0 ? (ssize_t) result : -EFAULT;
704} 669}
705 670
706static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages) 671static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
707{ 672{
708 ssize_t result; 673 ssize_t result = 0;
709 sigset_t oldset; 674 sigset_t oldset;
710 struct inode *inode = iocb->ki_filp->f_mapping->host; 675 struct inode *inode = iocb->ki_filp->f_mapping->host;
711 struct rpc_clnt *clnt = NFS_CLIENT(inode); 676 struct rpc_clnt *clnt = NFS_CLIENT(inode);
@@ -713,17 +678,14 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
713 size_t wsize = NFS_SERVER(inode)->wsize; 678 size_t wsize = NFS_SERVER(inode)->wsize;
714 int sync = 0; 679 int sync = 0;
715 680
716 dreq = nfs_direct_write_alloc(count, wsize); 681 dreq = nfs_direct_req_alloc();
717 if (!dreq) 682 if (!dreq)
718 return -ENOMEM; 683 return -ENOMEM;
684 nfs_alloc_commit_data(dreq);
685
719 if (dreq->commit_data == NULL || count < wsize) 686 if (dreq->commit_data == NULL || count < wsize)
720 sync = FLUSH_STABLE; 687 sync = FLUSH_STABLE;
721 688
722 dreq->user_addr = user_addr;
723 dreq->user_count = count;
724 dreq->pos = pos;
725 dreq->pages = pages;
726 dreq->npages = nr_pages;
727 dreq->inode = inode; 689 dreq->inode = inode;
728 dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); 690 dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
729 if (!is_sync_kiocb(iocb)) 691 if (!is_sync_kiocb(iocb))
@@ -734,8 +696,9 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
734 nfs_begin_data_update(inode); 696 nfs_begin_data_update(inode);
735 697
736 rpc_clnt_sigmask(clnt, &oldset); 698 rpc_clnt_sigmask(clnt, &oldset);
737 nfs_direct_write_schedule(dreq, sync); 699 result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
738 result = nfs_direct_wait(dreq); 700 if (!result)
701 result = nfs_direct_wait(dreq);
739 rpc_clnt_sigunmask(clnt, &oldset); 702 rpc_clnt_sigunmask(clnt, &oldset);
740 703
741 return result; 704 return result;
@@ -765,8 +728,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
765ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) 728ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
766{ 729{
767 ssize_t retval = -EINVAL; 730 ssize_t retval = -EINVAL;
768 int page_count;
769 struct page **pages;
770 struct file *file = iocb->ki_filp; 731 struct file *file = iocb->ki_filp;
771 struct address_space *mapping = file->f_mapping; 732 struct address_space *mapping = file->f_mapping;
772 733
@@ -788,14 +749,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count,
788 if (retval) 749 if (retval)
789 goto out; 750 goto out;
790 751
791 retval = nfs_get_user_pages(READ, (unsigned long) buf, 752 retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos);
792 count, &pages);
793 if (retval < 0)
794 goto out;
795 page_count = retval;
796
797 retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
798 pages, page_count);
799 if (retval > 0) 753 if (retval > 0)
800 iocb->ki_pos = pos + retval; 754 iocb->ki_pos = pos + retval;
801 755
@@ -831,8 +785,6 @@ out:
831ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) 785ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
832{ 786{
833 ssize_t retval; 787 ssize_t retval;
834 int page_count;
835 struct page **pages;
836 struct file *file = iocb->ki_filp; 788 struct file *file = iocb->ki_filp;
837 struct address_space *mapping = file->f_mapping; 789 struct address_space *mapping = file->f_mapping;
838 790
@@ -860,14 +812,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t
860 if (retval) 812 if (retval)
861 goto out; 813 goto out;
862 814
863 retval = nfs_get_user_pages(WRITE, (unsigned long) buf, 815 retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
864 count, &pages);
865 if (retval < 0)
866 goto out;
867 page_count = retval;
868
869 retval = nfs_direct_write(iocb, (unsigned long) buf, count,
870 pos, pages, page_count);
871 816
872 /* 817 /*
873 * XXX: nfs_end_data_update() already ensures this file's 818 * XXX: nfs_end_data_update() already ensures this file's
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cc2b874ad5a4..48e892880d5b 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -312,7 +312,13 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
312 312
313static int nfs_release_page(struct page *page, gfp_t gfp) 313static int nfs_release_page(struct page *page, gfp_t gfp)
314{ 314{
315 return !nfs_wb_page(page->mapping->host, page); 315 if (gfp & __GFP_FS)
316 return !nfs_wb_page(page->mapping->host, page);
317 else
318 /*
319 * Avoid deadlock on nfs_wait_on_request().
320 */
321 return 0;
316} 322}
317 323
318const struct address_space_operations nfs_file_aops = { 324const struct address_space_operations nfs_file_aops = {
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index b81e7ed3c902..07a5dd57646e 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -130,9 +130,7 @@ nfs_idmap_delete(struct nfs4_client *clp)
130 130
131 if (!idmap) 131 if (!idmap)
132 return; 132 return;
133 dput(idmap->idmap_dentry); 133 rpc_unlink(idmap->idmap_dentry);
134 idmap->idmap_dentry = NULL;
135 rpc_unlink(idmap->idmap_path);
136 clp->cl_idmap = NULL; 134 clp->cl_idmap = NULL;
137 kfree(idmap); 135 kfree(idmap);
138} 136}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 19b98ca468eb..86b3169c8cac 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -51,7 +51,7 @@ char *nfs_path(const char *base, const struct dentry *dentry,
51 namelen = dentry->d_name.len; 51 namelen = dentry->d_name.len;
52 buflen -= namelen + 1; 52 buflen -= namelen + 1;
53 if (buflen < 0) 53 if (buflen < 0)
54 goto Elong; 54 goto Elong_unlock;
55 end -= namelen; 55 end -= namelen;
56 memcpy(end, dentry->d_name.name, namelen); 56 memcpy(end, dentry->d_name.name, namelen);
57 *--end = '/'; 57 *--end = '/';
@@ -68,6 +68,8 @@ char *nfs_path(const char *base, const struct dentry *dentry,
68 end -= namelen; 68 end -= namelen;
69 memcpy(end, base, namelen); 69 memcpy(end, base, namelen);
70 return end; 70 return end;
71Elong_unlock:
72 spin_unlock(&dcache_lock);
71Elong: 73Elong:
72 return ERR_PTR(-ENAMETOOLONG); 74 return ERR_PTR(-ENAMETOOLONG);
73} 75}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b4916b092194..153898e1331f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2668,7 +2668,7 @@ out:
2668 nfs4_set_cached_acl(inode, acl); 2668 nfs4_set_cached_acl(inode, acl);
2669} 2669}
2670 2670
2671static inline ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) 2671static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
2672{ 2672{
2673 struct page *pages[NFS4ACL_MAXPAGES]; 2673 struct page *pages[NFS4ACL_MAXPAGES];
2674 struct nfs_getaclargs args = { 2674 struct nfs_getaclargs args = {
@@ -2721,6 +2721,19 @@ out_free:
2721 return ret; 2721 return ret;
2722} 2722}
2723 2723
2724static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
2725{
2726 struct nfs4_exception exception = { };
2727 ssize_t ret;
2728 do {
2729 ret = __nfs4_get_acl_uncached(inode, buf, buflen);
2730 if (ret >= 0)
2731 break;
2732 ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception);
2733 } while (exception.retry);
2734 return ret;
2735}
2736
2724static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) 2737static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
2725{ 2738{
2726 struct nfs_server *server = NFS_SERVER(inode); 2739 struct nfs_server *server = NFS_SERVER(inode);
@@ -2737,7 +2750,7 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
2737 return nfs4_get_acl_uncached(inode, buf, buflen); 2750 return nfs4_get_acl_uncached(inode, buf, buflen);
2738} 2751}
2739 2752
2740static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) 2753static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
2741{ 2754{
2742 struct nfs_server *server = NFS_SERVER(inode); 2755 struct nfs_server *server = NFS_SERVER(inode);
2743 struct page *pages[NFS4ACL_MAXPAGES]; 2756 struct page *pages[NFS4ACL_MAXPAGES];
@@ -2763,6 +2776,18 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
2763 return ret; 2776 return ret;
2764} 2777}
2765 2778
2779static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
2780{
2781 struct nfs4_exception exception = { };
2782 int err;
2783 do {
2784 err = nfs4_handle_exception(NFS_SERVER(inode),
2785 __nfs4_proc_set_acl(inode, buf, buflen),
2786 &exception);
2787 } while (exception.retry);
2788 return err;
2789}
2790
2766static int 2791static int
2767nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) 2792nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
2768{ 2793{
@@ -3144,9 +3169,6 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
3144 default: 3169 default:
3145 BUG(); 3170 BUG();
3146 } 3171 }
3147 if (res < 0)
3148 printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n",
3149 __FUNCTION__);
3150 return res; 3172 return res;
3151} 3173}
3152 3174
@@ -3258,8 +3280,6 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3258 return ERR_PTR(-ENOMEM); 3280 return ERR_PTR(-ENOMEM);
3259 } 3281 }
3260 3282
3261 /* Unlock _before_ we do the RPC call */
3262 do_vfs_lock(fl->fl_file, fl);
3263 return rpc_run_task(NFS_CLIENT(lsp->ls_state->inode), RPC_TASK_ASYNC, &nfs4_locku_ops, data); 3283 return rpc_run_task(NFS_CLIENT(lsp->ls_state->inode), RPC_TASK_ASYNC, &nfs4_locku_ops, data);
3264} 3284}
3265 3285
@@ -3270,30 +3290,28 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
3270 struct rpc_task *task; 3290 struct rpc_task *task;
3271 int status = 0; 3291 int status = 0;
3272 3292
3273 /* Is this a delegated lock? */
3274 if (test_bit(NFS_DELEGATED_STATE, &state->flags))
3275 goto out_unlock;
3276 /* Is this open_owner holding any locks on the server? */
3277 if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
3278 goto out_unlock;
3279
3280 status = nfs4_set_lock_state(state, request); 3293 status = nfs4_set_lock_state(state, request);
3294 /* Unlock _before_ we do the RPC call */
3295 request->fl_flags |= FL_EXISTS;
3296 if (do_vfs_lock(request->fl_file, request) == -ENOENT)
3297 goto out;
3281 if (status != 0) 3298 if (status != 0)
3282 goto out_unlock; 3299 goto out;
3300 /* Is this a delegated lock? */
3301 if (test_bit(NFS_DELEGATED_STATE, &state->flags))
3302 goto out;
3283 lsp = request->fl_u.nfs4_fl.owner; 3303 lsp = request->fl_u.nfs4_fl.owner;
3284 status = -ENOMEM;
3285 seqid = nfs_alloc_seqid(&lsp->ls_seqid); 3304 seqid = nfs_alloc_seqid(&lsp->ls_seqid);
3305 status = -ENOMEM;
3286 if (seqid == NULL) 3306 if (seqid == NULL)
3287 goto out_unlock; 3307 goto out;
3288 task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid); 3308 task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid);
3289 status = PTR_ERR(task); 3309 status = PTR_ERR(task);
3290 if (IS_ERR(task)) 3310 if (IS_ERR(task))
3291 goto out_unlock; 3311 goto out;
3292 status = nfs4_wait_for_completion_rpc_task(task); 3312 status = nfs4_wait_for_completion_rpc_task(task);
3293 rpc_release_task(task); 3313 rpc_release_task(task);
3294 return status; 3314out:
3295out_unlock:
3296 do_vfs_lock(request->fl_file, request);
3297 return status; 3315 return status;
3298} 3316}
3299 3317
@@ -3461,10 +3479,10 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
3461 struct nfs4_exception exception = { }; 3479 struct nfs4_exception exception = { };
3462 int err; 3480 int err;
3463 3481
3464 /* Cache the lock if possible... */
3465 if (test_bit(NFS_DELEGATED_STATE, &state->flags))
3466 return 0;
3467 do { 3482 do {
3483 /* Cache the lock if possible... */
3484 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
3485 return 0;
3468 err = _nfs4_do_setlk(state, F_SETLK, request, 1); 3486 err = _nfs4_do_setlk(state, F_SETLK, request, 1);
3469 if (err != -NFS4ERR_DELAY) 3487 if (err != -NFS4ERR_DELAY)
3470 break; 3488 break;
@@ -3483,6 +3501,8 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
3483 if (err != 0) 3501 if (err != 0)
3484 return err; 3502 return err;
3485 do { 3503 do {
3504 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
3505 return 0;
3486 err = _nfs4_do_setlk(state, F_SETLK, request, 0); 3506 err = _nfs4_do_setlk(state, F_SETLK, request, 0);
3487 if (err != -NFS4ERR_DELAY) 3507 if (err != -NFS4ERR_DELAY)
3488 break; 3508 break;
@@ -3494,29 +3514,42 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
3494static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 3514static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
3495{ 3515{
3496 struct nfs4_client *clp = state->owner->so_client; 3516 struct nfs4_client *clp = state->owner->so_client;
3517 unsigned char fl_flags = request->fl_flags;
3497 int status; 3518 int status;
3498 3519
3499 /* Is this a delegated open? */ 3520 /* Is this a delegated open? */
3500 if (NFS_I(state->inode)->delegation_state != 0) {
3501 /* Yes: cache locks! */
3502 status = do_vfs_lock(request->fl_file, request);
3503 /* ...but avoid races with delegation recall... */
3504 if (status < 0 || test_bit(NFS_DELEGATED_STATE, &state->flags))
3505 return status;
3506 }
3507 down_read(&clp->cl_sem);
3508 status = nfs4_set_lock_state(state, request); 3521 status = nfs4_set_lock_state(state, request);
3509 if (status != 0) 3522 if (status != 0)
3510 goto out; 3523 goto out;
3524 request->fl_flags |= FL_ACCESS;
3525 status = do_vfs_lock(request->fl_file, request);
3526 if (status < 0)
3527 goto out;
3528 down_read(&clp->cl_sem);
3529 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
3530 struct nfs_inode *nfsi = NFS_I(state->inode);
3531 /* Yes: cache locks! */
3532 down_read(&nfsi->rwsem);
3533 /* ...but avoid races with delegation recall... */
3534 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
3535 request->fl_flags = fl_flags & ~FL_SLEEP;
3536 status = do_vfs_lock(request->fl_file, request);
3537 up_read(&nfsi->rwsem);
3538 goto out_unlock;
3539 }
3540 up_read(&nfsi->rwsem);
3541 }
3511 status = _nfs4_do_setlk(state, cmd, request, 0); 3542 status = _nfs4_do_setlk(state, cmd, request, 0);
3512 if (status != 0) 3543 if (status != 0)
3513 goto out; 3544 goto out_unlock;
3514 /* Note: we always want to sleep here! */ 3545 /* Note: we always want to sleep here! */
3515 request->fl_flags |= FL_SLEEP; 3546 request->fl_flags = fl_flags | FL_SLEEP;
3516 if (do_vfs_lock(request->fl_file, request) < 0) 3547 if (do_vfs_lock(request->fl_file, request) < 0)
3517 printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__); 3548 printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__);
3518out: 3549out_unlock:
3519 up_read(&clp->cl_sem); 3550 up_read(&clp->cl_sem);
3551out:
3552 request->fl_flags = fl_flags;
3520 return status; 3553 return status;
3521} 3554}
3522 3555
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1750d996f49f..730ec8fb31c6 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3355,7 +3355,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3355 struct kvec *iov = rcvbuf->head; 3355 struct kvec *iov = rcvbuf->head;
3356 unsigned int nr, pglen = rcvbuf->page_len; 3356 unsigned int nr, pglen = rcvbuf->page_len;
3357 uint32_t *end, *entry, *p, *kaddr; 3357 uint32_t *end, *entry, *p, *kaddr;
3358 uint32_t len, attrlen; 3358 uint32_t len, attrlen, xlen;
3359 int hdrlen, recvd, status; 3359 int hdrlen, recvd, status;
3360 3360
3361 status = decode_op_hdr(xdr, OP_READDIR); 3361 status = decode_op_hdr(xdr, OP_READDIR);
@@ -3377,10 +3377,10 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3377 3377
3378 BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE); 3378 BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
3379 kaddr = p = (uint32_t *) kmap_atomic(page, KM_USER0); 3379 kaddr = p = (uint32_t *) kmap_atomic(page, KM_USER0);
3380 end = (uint32_t *) ((char *)p + pglen + readdir->pgbase); 3380 end = p + ((pglen + readdir->pgbase) >> 2);
3381 entry = p; 3381 entry = p;
3382 for (nr = 0; *p++; nr++) { 3382 for (nr = 0; *p++; nr++) {
3383 if (p + 3 > end) 3383 if (end - p < 3)
3384 goto short_pkt; 3384 goto short_pkt;
3385 dprintk("cookie = %Lu, ", *((unsigned long long *)p)); 3385 dprintk("cookie = %Lu, ", *((unsigned long long *)p));
3386 p += 2; /* cookie */ 3386 p += 2; /* cookie */
@@ -3389,18 +3389,19 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3389 printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len); 3389 printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len);
3390 goto err_unmap; 3390 goto err_unmap;
3391 } 3391 }
3392 dprintk("filename = %*s\n", len, (char *)p); 3392 xlen = XDR_QUADLEN(len);
3393 p += XDR_QUADLEN(len); 3393 if (end - p < xlen + 1)
3394 if (p + 1 > end)
3395 goto short_pkt; 3394 goto short_pkt;
3395 dprintk("filename = %*s\n", len, (char *)p);
3396 p += xlen;
3396 len = ntohl(*p++); /* bitmap length */ 3397 len = ntohl(*p++); /* bitmap length */
3397 p += len; 3398 if (end - p < len + 1)
3398 if (p + 1 > end)
3399 goto short_pkt; 3399 goto short_pkt;
3400 p += len;
3400 attrlen = XDR_QUADLEN(ntohl(*p++)); 3401 attrlen = XDR_QUADLEN(ntohl(*p++));
3401 p += attrlen; /* attributes */ 3402 if (end - p < attrlen + 2)
3402 if (p + 2 > end)
3403 goto short_pkt; 3403 goto short_pkt;
3404 p += attrlen; /* attributes */
3404 entry = p; 3405 entry = p;
3405 } 3406 }
3406 if (!nr && (entry[0] != 0 || entry[1] == 0)) 3407 if (!nr && (entry[0] != 0 || entry[1] == 0))
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 52bf634260a1..7a9ee00e0c61 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -43,13 +43,15 @@ static mempool_t *nfs_rdata_mempool;
43 43
44#define MIN_POOL_READ (32) 44#define MIN_POOL_READ (32)
45 45
46struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) 46struct nfs_read_data *nfs_readdata_alloc(size_t len)
47{ 47{
48 unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
48 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); 49 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
49 50
50 if (p) { 51 if (p) {
51 memset(p, 0, sizeof(*p)); 52 memset(p, 0, sizeof(*p));
52 INIT_LIST_HEAD(&p->pages); 53 INIT_LIST_HEAD(&p->pages);
54 p->npages = pagecount;
53 if (pagecount <= ARRAY_SIZE(p->page_array)) 55 if (pagecount <= ARRAY_SIZE(p->page_array))
54 p->pagevec = p->page_array; 56 p->pagevec = p->page_array;
55 else { 57 else {
@@ -63,7 +65,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
63 return p; 65 return p;
64} 66}
65 67
66void nfs_readdata_free(struct nfs_read_data *p) 68static void nfs_readdata_free(struct nfs_read_data *p)
67{ 69{
68 if (p && (p->pagevec != &p->page_array[0])) 70 if (p && (p->pagevec != &p->page_array[0]))
69 kfree(p->pagevec); 71 kfree(p->pagevec);
@@ -116,10 +118,17 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
116 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; 118 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
117 base &= ~PAGE_CACHE_MASK; 119 base &= ~PAGE_CACHE_MASK;
118 pglen = PAGE_CACHE_SIZE - base; 120 pglen = PAGE_CACHE_SIZE - base;
119 if (pglen < remainder) 121 for (;;) {
122 if (remainder <= pglen) {
123 memclear_highpage_flush(*pages, base, remainder);
124 break;
125 }
120 memclear_highpage_flush(*pages, base, pglen); 126 memclear_highpage_flush(*pages, base, pglen);
121 else 127 pages++;
122 memclear_highpage_flush(*pages, base, remainder); 128 remainder -= pglen;
129 pglen = PAGE_CACHE_SIZE;
130 base = 0;
131 }
123} 132}
124 133
125/* 134/*
@@ -133,7 +142,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
133 int result; 142 int result;
134 struct nfs_read_data *rdata; 143 struct nfs_read_data *rdata;
135 144
136 rdata = nfs_readdata_alloc(1); 145 rdata = nfs_readdata_alloc(count);
137 if (!rdata) 146 if (!rdata)
138 return -ENOMEM; 147 return -ENOMEM;
139 148
@@ -329,25 +338,25 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
329 struct nfs_page *req = nfs_list_entry(head->next); 338 struct nfs_page *req = nfs_list_entry(head->next);
330 struct page *page = req->wb_page; 339 struct page *page = req->wb_page;
331 struct nfs_read_data *data; 340 struct nfs_read_data *data;
332 unsigned int rsize = NFS_SERVER(inode)->rsize; 341 size_t rsize = NFS_SERVER(inode)->rsize, nbytes;
333 unsigned int nbytes, offset; 342 unsigned int offset;
334 int requests = 0; 343 int requests = 0;
335 LIST_HEAD(list); 344 LIST_HEAD(list);
336 345
337 nfs_list_remove_request(req); 346 nfs_list_remove_request(req);
338 347
339 nbytes = req->wb_bytes; 348 nbytes = req->wb_bytes;
340 for(;;) { 349 do {
341 data = nfs_readdata_alloc(1); 350 size_t len = min(nbytes,rsize);
351
352 data = nfs_readdata_alloc(len);
342 if (!data) 353 if (!data)
343 goto out_bad; 354 goto out_bad;
344 INIT_LIST_HEAD(&data->pages); 355 INIT_LIST_HEAD(&data->pages);
345 list_add(&data->pages, &list); 356 list_add(&data->pages, &list);
346 requests++; 357 requests++;
347 if (nbytes <= rsize) 358 nbytes -= len;
348 break; 359 } while(nbytes != 0);
349 nbytes -= rsize;
350 }
351 atomic_set(&req->wb_complete, requests); 360 atomic_set(&req->wb_complete, requests);
352 361
353 ClearPageError(page); 362 ClearPageError(page);
@@ -395,7 +404,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode)
395 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 404 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
396 return nfs_pagein_multi(head, inode); 405 return nfs_pagein_multi(head, inode);
397 406
398 data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages); 407 data = nfs_readdata_alloc(NFS_SERVER(inode)->rsize);
399 if (!data) 408 if (!data)
400 goto out_bad; 409 goto out_bad;
401 410
@@ -476,6 +485,8 @@ static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
476 unsigned int base = data->args.pgbase; 485 unsigned int base = data->args.pgbase;
477 struct page **pages; 486 struct page **pages;
478 487
488 if (data->res.eof)
489 count = data->args.count;
479 if (unlikely(count == 0)) 490 if (unlikely(count == 0))
480 return; 491 return;
481 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; 492 pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
@@ -483,11 +494,7 @@ static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
483 count += base; 494 count += base;
484 for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) 495 for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
485 SetPageUptodate(*pages); 496 SetPageUptodate(*pages);
486 /* 497 if (count != 0)
487 * Was this an eof or a short read? If the latter, don't mark the page
488 * as uptodate yet.
489 */
490 if (count > 0 && (data->res.eof || data->args.count == data->res.count))
491 SetPageUptodate(*pages); 498 SetPageUptodate(*pages);
492} 499}
493 500
@@ -502,6 +509,8 @@ static void nfs_readpage_set_pages_error(struct nfs_read_data *data)
502 count += base; 509 count += base;
503 for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) 510 for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
504 SetPageError(*pages); 511 SetPageError(*pages);
512 if (count != 0)
513 SetPageError(*pages);
505} 514}
506 515
507/* 516/*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index bca5734ca9fb..8ab3cf10d792 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -90,22 +90,13 @@ static mempool_t *nfs_commit_mempool;
90 90
91static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); 91static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
92 92
93struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount) 93struct nfs_write_data *nfs_commit_alloc(void)
94{ 94{
95 struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS); 95 struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
96 96
97 if (p) { 97 if (p) {
98 memset(p, 0, sizeof(*p)); 98 memset(p, 0, sizeof(*p));
99 INIT_LIST_HEAD(&p->pages); 99 INIT_LIST_HEAD(&p->pages);
100 if (pagecount <= ARRAY_SIZE(p->page_array))
101 p->pagevec = p->page_array;
102 else {
103 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
104 if (!p->pagevec) {
105 mempool_free(p, nfs_commit_mempool);
106 p = NULL;
107 }
108 }
109 } 100 }
110 return p; 101 return p;
111} 102}
@@ -117,13 +108,15 @@ void nfs_commit_free(struct nfs_write_data *p)
117 mempool_free(p, nfs_commit_mempool); 108 mempool_free(p, nfs_commit_mempool);
118} 109}
119 110
120struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) 111struct nfs_write_data *nfs_writedata_alloc(size_t len)
121{ 112{
113 unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
122 struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); 114 struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
123 115
124 if (p) { 116 if (p) {
125 memset(p, 0, sizeof(*p)); 117 memset(p, 0, sizeof(*p));
126 INIT_LIST_HEAD(&p->pages); 118 INIT_LIST_HEAD(&p->pages);
119 p->npages = pagecount;
127 if (pagecount <= ARRAY_SIZE(p->page_array)) 120 if (pagecount <= ARRAY_SIZE(p->page_array))
128 p->pagevec = p->page_array; 121 p->pagevec = p->page_array;
129 else { 122 else {
@@ -137,7 +130,7 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
137 return p; 130 return p;
138} 131}
139 132
140void nfs_writedata_free(struct nfs_write_data *p) 133static void nfs_writedata_free(struct nfs_write_data *p)
141{ 134{
142 if (p && (p->pagevec != &p->page_array[0])) 135 if (p && (p->pagevec != &p->page_array[0]))
143 kfree(p->pagevec); 136 kfree(p->pagevec);
@@ -208,7 +201,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
208 int result, written = 0; 201 int result, written = 0;
209 struct nfs_write_data *wdata; 202 struct nfs_write_data *wdata;
210 203
211 wdata = nfs_writedata_alloc(1); 204 wdata = nfs_writedata_alloc(wsize);
212 if (!wdata) 205 if (!wdata)
213 return -ENOMEM; 206 return -ENOMEM;
214 207
@@ -578,7 +571,7 @@ static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, un
578 return ret; 571 return ret;
579} 572}
580 573
581static void nfs_cancel_requests(struct list_head *head) 574static void nfs_cancel_dirty_list(struct list_head *head)
582{ 575{
583 struct nfs_page *req; 576 struct nfs_page *req;
584 while(!list_empty(head)) { 577 while(!list_empty(head)) {
@@ -589,6 +582,19 @@ static void nfs_cancel_requests(struct list_head *head)
589 } 582 }
590} 583}
591 584
585static void nfs_cancel_commit_list(struct list_head *head)
586{
587 struct nfs_page *req;
588
589 while(!list_empty(head)) {
590 req = nfs_list_entry(head->next);
591 nfs_list_remove_request(req);
592 nfs_inode_remove_request(req);
593 nfs_clear_page_writeback(req);
594 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
595 }
596}
597
592/* 598/*
593 * nfs_scan_dirty - Scan an inode for dirty requests 599 * nfs_scan_dirty - Scan an inode for dirty requests
594 * @inode: NFS inode to scan 600 * @inode: NFS inode to scan
@@ -986,24 +992,24 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
986 struct nfs_page *req = nfs_list_entry(head->next); 992 struct nfs_page *req = nfs_list_entry(head->next);
987 struct page *page = req->wb_page; 993 struct page *page = req->wb_page;
988 struct nfs_write_data *data; 994 struct nfs_write_data *data;
989 unsigned int wsize = NFS_SERVER(inode)->wsize; 995 size_t wsize = NFS_SERVER(inode)->wsize, nbytes;
990 unsigned int nbytes, offset; 996 unsigned int offset;
991 int requests = 0; 997 int requests = 0;
992 LIST_HEAD(list); 998 LIST_HEAD(list);
993 999
994 nfs_list_remove_request(req); 1000 nfs_list_remove_request(req);
995 1001
996 nbytes = req->wb_bytes; 1002 nbytes = req->wb_bytes;
997 for (;;) { 1003 do {
998 data = nfs_writedata_alloc(1); 1004 size_t len = min(nbytes, wsize);
1005
1006 data = nfs_writedata_alloc(len);
999 if (!data) 1007 if (!data)
1000 goto out_bad; 1008 goto out_bad;
1001 list_add(&data->pages, &list); 1009 list_add(&data->pages, &list);
1002 requests++; 1010 requests++;
1003 if (nbytes <= wsize) 1011 nbytes -= len;
1004 break; 1012 } while (nbytes != 0);
1005 nbytes -= wsize;
1006 }
1007 atomic_set(&req->wb_complete, requests); 1013 atomic_set(&req->wb_complete, requests);
1008 1014
1009 ClearPageError(page); 1015 ClearPageError(page);
@@ -1057,7 +1063,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
1057 struct nfs_write_data *data; 1063 struct nfs_write_data *data;
1058 unsigned int count; 1064 unsigned int count;
1059 1065
1060 data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages); 1066 data = nfs_writedata_alloc(NFS_SERVER(inode)->wsize);
1061 if (!data) 1067 if (!data)
1062 goto out_bad; 1068 goto out_bad;
1063 1069
@@ -1365,7 +1371,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1365 struct nfs_write_data *data; 1371 struct nfs_write_data *data;
1366 struct nfs_page *req; 1372 struct nfs_page *req;
1367 1373
1368 data = nfs_commit_alloc(NFS_SERVER(inode)->wpages); 1374 data = nfs_commit_alloc();
1369 1375
1370 if (!data) 1376 if (!data)
1371 goto out_bad; 1377 goto out_bad;
@@ -1381,6 +1387,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1381 nfs_list_remove_request(req); 1387 nfs_list_remove_request(req);
1382 nfs_mark_request_commit(req); 1388 nfs_mark_request_commit(req);
1383 nfs_clear_page_writeback(req); 1389 nfs_clear_page_writeback(req);
1390 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1384 } 1391 }
1385 return -ENOMEM; 1392 return -ENOMEM;
1386} 1393}
@@ -1499,7 +1506,7 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
1499 if (pages != 0) { 1506 if (pages != 0) {
1500 spin_unlock(&nfsi->req_lock); 1507 spin_unlock(&nfsi->req_lock);
1501 if (how & FLUSH_INVALIDATE) 1508 if (how & FLUSH_INVALIDATE)
1502 nfs_cancel_requests(&head); 1509 nfs_cancel_dirty_list(&head);
1503 else 1510 else
1504 ret = nfs_flush_list(inode, &head, pages, how); 1511 ret = nfs_flush_list(inode, &head, pages, how);
1505 spin_lock(&nfsi->req_lock); 1512 spin_lock(&nfsi->req_lock);
@@ -1512,7 +1519,7 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
1512 break; 1519 break;
1513 if (how & FLUSH_INVALIDATE) { 1520 if (how & FLUSH_INVALIDATE) {
1514 spin_unlock(&nfsi->req_lock); 1521 spin_unlock(&nfsi->req_lock);
1515 nfs_cancel_requests(&head); 1522 nfs_cancel_commit_list(&head);
1516 spin_lock(&nfsi->req_lock); 1523 spin_lock(&nfsi->req_lock);
1517 continue; 1524 continue;
1518 } 1525 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b0e095ea0c03..ee4eff27aedc 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -721,6 +721,12 @@ nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
721 return nfs_ok; 721 return nfs_ok;
722} 722}
723 723
724static inline void nfsd4_increment_op_stats(u32 opnum)
725{
726 if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP)
727 nfsdstats.nfs4_opcount[opnum]++;
728}
729
724 730
725/* 731/*
726 * COMPOUND call. 732 * COMPOUND call.
@@ -930,6 +936,8 @@ encode_op:
930 /* XXX Ugh, we need to get rid of this kind of special case: */ 936 /* XXX Ugh, we need to get rid of this kind of special case: */
931 if (op->opnum == OP_READ && op->u.read.rd_filp) 937 if (op->opnum == OP_READ && op->u.read.rd_filp)
932 fput(op->u.read.rd_filp); 938 fput(op->u.read.rd_filp);
939
940 nfsd4_increment_op_stats(op->opnum);
933 } 941 }
934 942
935out: 943out:
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ecc439d2565f..501d83884530 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -187,6 +187,11 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
187 goto out; 187 goto out;
188 } 188 }
189 189
190 /* Set user creds for this exportpoint */
191 error = nfserrno(nfsd_setuser(rqstp, exp));
192 if (error)
193 goto out;
194
190 /* 195 /*
191 * Look up the dentry using the NFS file handle. 196 * Look up the dentry using the NFS file handle.
192 */ 197 */
@@ -241,16 +246,17 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
241 dprintk("nfsd: fh_verify - just checking\n"); 246 dprintk("nfsd: fh_verify - just checking\n");
242 dentry = fhp->fh_dentry; 247 dentry = fhp->fh_dentry;
243 exp = fhp->fh_export; 248 exp = fhp->fh_export;
249 /* Set user creds for this exportpoint; necessary even
250 * in the "just checking" case because this may be a
251 * filehandle that was created by fh_compose, and that
252 * is about to be used in another nfsv4 compound
253 * operation */
254 error = nfserrno(nfsd_setuser(rqstp, exp));
255 if (error)
256 goto out;
244 } 257 }
245 cache_get(&exp->h); 258 cache_get(&exp->h);
246 259
247 /* Set user creds for this exportpoint; necessary even in the "just
248 * checking" case because this may be a filehandle that was created by
249 * fh_compose, and that is about to be used in another nfsv4 compound
250 * operation */
251 error = nfserrno(nfsd_setuser(rqstp, exp));
252 if (error)
253 goto out;
254 260
255 error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); 261 error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type);
256 if (error) 262 if (error)
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 57265d563804..71944cddf680 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -72,6 +72,16 @@ static int nfsd_proc_show(struct seq_file *seq, void *v)
72 /* show my rpc info */ 72 /* show my rpc info */
73 svc_seq_show(seq, &nfsd_svcstats); 73 svc_seq_show(seq, &nfsd_svcstats);
74 74
75#ifdef CONFIG_NFSD_V4
76 /* Show count for individual nfsv4 operations */
77 /* Writing operation numbers 0 1 2 also for maintaining uniformity */
78 seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1);
79 for (i = 0; i <= LAST_NFS4_OP; i++)
80 seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]);
81
82 seq_putc(seq, '\n');
83#endif
84
75 return 0; 85 return 0;
76} 86}
77 87
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4c86b7e1d1eb..d313f356e66a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -367,6 +367,12 @@ static void ntfs_destroy_extent_inode(ntfs_inode *ni)
367 kmem_cache_free(ntfs_inode_cache, ni); 367 kmem_cache_free(ntfs_inode_cache, ni);
368} 368}
369 369
370/*
371 * The attribute runlist lock has separate locking rules from the
372 * normal runlist lock, so split the two lock-classes:
373 */
374static struct lock_class_key attr_list_rl_lock_class;
375
370/** 376/**
371 * __ntfs_init_inode - initialize ntfs specific part of an inode 377 * __ntfs_init_inode - initialize ntfs specific part of an inode
372 * @sb: super block of mounted volume 378 * @sb: super block of mounted volume
@@ -394,6 +400,8 @@ void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
394 ni->attr_list_size = 0; 400 ni->attr_list_size = 0;
395 ni->attr_list = NULL; 401 ni->attr_list = NULL;
396 ntfs_init_runlist(&ni->attr_list_rl); 402 ntfs_init_runlist(&ni->attr_list_rl);
403 lockdep_set_class(&ni->attr_list_rl.lock,
404 &attr_list_rl_lock_class);
397 ni->itype.index.bmp_ino = NULL; 405 ni->itype.index.bmp_ino = NULL;
398 ni->itype.index.block_size = 0; 406 ni->itype.index.block_size = 0;
399 ni->itype.index.vcn_size = 0; 407 ni->itype.index.vcn_size = 0;
@@ -405,6 +413,13 @@ void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
405 ni->ext.base_ntfs_ino = NULL; 413 ni->ext.base_ntfs_ino = NULL;
406} 414}
407 415
416/*
417 * Extent inodes get MFT-mapped in a nested way, while the base inode
418 * is still mapped. Teach this nesting to the lock validator by creating
419 * a separate class for nested inode's mrec_lock's:
420 */
421static struct lock_class_key extent_inode_mrec_lock_key;
422
408inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, 423inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
409 unsigned long mft_no) 424 unsigned long mft_no)
410{ 425{
@@ -413,6 +428,7 @@ inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
413 ntfs_debug("Entering."); 428 ntfs_debug("Entering.");
414 if (likely(ni != NULL)) { 429 if (likely(ni != NULL)) {
415 __ntfs_init_inode(sb, ni); 430 __ntfs_init_inode(sb, ni);
431 lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key);
416 ni->mft_no = mft_no; 432 ni->mft_no = mft_no;
417 ni->type = AT_UNUSED; 433 ni->type = AT_UNUSED;
418 ni->name = NULL; 434 ni->name = NULL;
@@ -1722,6 +1738,15 @@ err_out:
1722 return err; 1738 return err;
1723} 1739}
1724 1740
1741/*
1742 * The MFT inode has special locking, so teach the lock validator
1743 * about this by splitting off the locking rules of the MFT from
1744 * the locking rules of other inodes. The MFT inode can never be
1745 * accessed from the VFS side (or even internally), only by the
1746 * map_mft functions.
1747 */
1748static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key;
1749
1725/** 1750/**
1726 * ntfs_read_inode_mount - special read_inode for mount time use only 1751 * ntfs_read_inode_mount - special read_inode for mount time use only
1727 * @vi: inode to read 1752 * @vi: inode to read
@@ -2148,6 +2173,14 @@ int ntfs_read_inode_mount(struct inode *vi)
2148 ntfs_attr_put_search_ctx(ctx); 2173 ntfs_attr_put_search_ctx(ctx);
2149 ntfs_debug("Done."); 2174 ntfs_debug("Done.");
2150 ntfs_free(m); 2175 ntfs_free(m);
2176
2177 /*
2178 * Split the locking rules of the MFT inode from the
2179 * locking rules of other inodes:
2180 */
2181 lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
2182 lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
2183
2151 return 0; 2184 return 0;
2152 2185
2153em_put_err_out: 2186em_put_err_out:
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0e14acea3f8b..74e0ee8fce72 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1724,6 +1724,14 @@ upcase_failed:
1724 return FALSE; 1724 return FALSE;
1725} 1725}
1726 1726
1727/*
1728 * The lcn and mft bitmap inodes are NTFS-internal inodes with
1729 * their own special locking rules:
1730 */
1731static struct lock_class_key
1732 lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key,
1733 mftbmp_runlist_lock_key, mftbmp_mrec_lock_key;
1734
1727/** 1735/**
1728 * load_system_files - open the system files using normal functions 1736 * load_system_files - open the system files using normal functions
1729 * @vol: ntfs super block describing device whose system files to load 1737 * @vol: ntfs super block describing device whose system files to load
@@ -1780,6 +1788,10 @@ static BOOL load_system_files(ntfs_volume *vol)
1780 ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute."); 1788 ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute.");
1781 goto iput_mirr_err_out; 1789 goto iput_mirr_err_out;
1782 } 1790 }
1791 lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock,
1792 &mftbmp_runlist_lock_key);
1793 lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock,
1794 &mftbmp_mrec_lock_key);
1783 /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */ 1795 /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */
1784 if (!load_and_init_upcase(vol)) 1796 if (!load_and_init_upcase(vol))
1785 goto iput_mftbmp_err_out; 1797 goto iput_mftbmp_err_out;
@@ -1802,6 +1814,11 @@ static BOOL load_system_files(ntfs_volume *vol)
1802 iput(vol->lcnbmp_ino); 1814 iput(vol->lcnbmp_ino);
1803 goto bitmap_failed; 1815 goto bitmap_failed;
1804 } 1816 }
1817 lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock,
1818 &lcnbmp_runlist_lock_key);
1819 lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock,
1820 &lcnbmp_mrec_lock_key);
1821
1805 NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino)); 1822 NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino));
1806 if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) { 1823 if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) {
1807 iput(vol->lcnbmp_ino); 1824 iput(vol->lcnbmp_ino);
@@ -2743,6 +2760,17 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2743 struct inode *tmp_ino; 2760 struct inode *tmp_ino;
2744 int blocksize, result; 2761 int blocksize, result;
2745 2762
2763 /*
2764 * We do a pretty difficult piece of bootstrap by reading the
2765 * MFT (and other metadata) from disk into memory. We'll only
2766 * release this metadata during umount, so the locking patterns
2767 * observed during bootstrap do not count. So turn off the
2768 * observation of locking patterns (strictly for this context
2769 * only) while mounting NTFS. [The validator is still active
2770 * otherwise, even for this context: it will for example record
2771 * lock class registrations.]
2772 */
2773 lockdep_off();
2746 ntfs_debug("Entering."); 2774 ntfs_debug("Entering.");
2747#ifndef NTFS_RW 2775#ifndef NTFS_RW
2748 sb->s_flags |= MS_RDONLY; 2776 sb->s_flags |= MS_RDONLY;
@@ -2754,6 +2782,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2754 if (!silent) 2782 if (!silent)
2755 ntfs_error(sb, "Allocation of NTFS volume structure " 2783 ntfs_error(sb, "Allocation of NTFS volume structure "
2756 "failed. Aborting mount..."); 2784 "failed. Aborting mount...");
2785 lockdep_on();
2757 return -ENOMEM; 2786 return -ENOMEM;
2758 } 2787 }
2759 /* Initialize ntfs_volume structure. */ 2788 /* Initialize ntfs_volume structure. */
@@ -2940,6 +2969,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
2940 mutex_unlock(&ntfs_lock); 2969 mutex_unlock(&ntfs_lock);
2941 sb->s_export_op = &ntfs_export_ops; 2970 sb->s_export_op = &ntfs_export_ops;
2942 lock_kernel(); 2971 lock_kernel();
2972 lockdep_on();
2943 return 0; 2973 return 0;
2944 } 2974 }
2945 ntfs_error(sb, "Failed to allocate root directory."); 2975 ntfs_error(sb, "Failed to allocate root directory.");
@@ -3059,6 +3089,7 @@ err_out_now:
3059 sb->s_fs_info = NULL; 3089 sb->s_fs_info = NULL;
3060 kfree(vol); 3090 kfree(vol);
3061 ntfs_debug("Failed, returning -EINVAL."); 3091 ntfs_debug("Failed, returning -EINVAL.");
3092 lockdep_on();
3062 return -EINVAL; 3093 return -EINVAL;
3063} 3094}
3064 3095
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 1b8346dd0572..9503240ef0e5 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2375,7 +2375,6 @@ leave:
2375 mlog(0, "returning %d\n", ret); 2375 mlog(0, "returning %d\n", ret);
2376 return ret; 2376 return ret;
2377} 2377}
2378EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
2379 2378
2380int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) 2379int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
2381{ 2380{
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index b0c3134f4f70..37be4b2e0d4a 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -155,7 +155,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
155 else 155 else
156 status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); 156 status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
157 157
158 if (status != DLM_NORMAL) 158 if (status != DLM_NORMAL && (status != DLM_CANCELGRANT || !master_node))
159 goto leave; 159 goto leave;
160 160
161 /* By now this has been masked out of cancel requests. */ 161 /* By now this has been masked out of cancel requests. */
@@ -183,8 +183,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
183 spin_lock(&lock->spinlock); 183 spin_lock(&lock->spinlock);
184 /* if the master told us the lock was already granted, 184 /* if the master told us the lock was already granted,
185 * let the ast handle all of these actions */ 185 * let the ast handle all of these actions */
186 if (status == DLM_NORMAL && 186 if (status == DLM_CANCELGRANT) {
187 lksb->status == DLM_CANCELGRANT) {
188 actions &= ~(DLM_UNLOCK_REMOVE_LOCK| 187 actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
189 DLM_UNLOCK_REGRANT_LOCK| 188 DLM_UNLOCK_REGRANT_LOCK|
190 DLM_UNLOCK_CLEAR_CONVERT_TYPE); 189 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
@@ -349,14 +348,9 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
349 vec, veclen, owner, &status); 348 vec, veclen, owner, &status);
350 if (tmpret >= 0) { 349 if (tmpret >= 0) {
351 // successfully sent and received 350 // successfully sent and received
352 if (status == DLM_CANCELGRANT) 351 if (status == DLM_FORWARD)
353 ret = DLM_NORMAL;
354 else if (status == DLM_FORWARD) {
355 mlog(0, "master was in-progress. retry\n"); 352 mlog(0, "master was in-progress. retry\n");
356 ret = DLM_FORWARD; 353 ret = status;
357 } else
358 ret = status;
359 lksb->status = status;
360 } else { 354 } else {
361 mlog_errno(tmpret); 355 mlog_errno(tmpret);
362 if (dlm_is_host_down(tmpret)) { 356 if (dlm_is_host_down(tmpret)) {
@@ -372,7 +366,6 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
372 /* something bad. this will BUG in ocfs2 */ 366 /* something bad. this will BUG in ocfs2 */
373 ret = dlm_err_to_dlm_status(tmpret); 367 ret = dlm_err_to_dlm_status(tmpret);
374 } 368 }
375 lksb->status = ret;
376 } 369 }
377 370
378 return ret; 371 return ret;
@@ -483,6 +476,10 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
483 476
484 /* lock was found on queue */ 477 /* lock was found on queue */
485 lksb = lock->lksb; 478 lksb = lock->lksb;
479 if (flags & (LKM_VALBLK|LKM_PUT_LVB) &&
480 lock->ml.type != LKM_EXMODE)
481 flags &= ~(LKM_VALBLK|LKM_PUT_LVB);
482
486 /* unlockast only called on originating node */ 483 /* unlockast only called on originating node */
487 if (flags & LKM_PUT_LVB) { 484 if (flags & LKM_PUT_LVB) {
488 lksb->flags |= DLM_LKSB_PUT_LVB; 485 lksb->flags |= DLM_LKSB_PUT_LVB;
@@ -507,11 +504,8 @@ not_found:
507 "cookie=%u:%llu\n", 504 "cookie=%u:%llu\n",
508 dlm_get_lock_cookie_node(unlock->cookie), 505 dlm_get_lock_cookie_node(unlock->cookie),
509 dlm_get_lock_cookie_seq(unlock->cookie)); 506 dlm_get_lock_cookie_seq(unlock->cookie));
510 else { 507 else
511 /* send the lksb->status back to the other node */
512 status = lksb->status;
513 dlm_lock_put(lock); 508 dlm_lock_put(lock);
514 }
515 509
516leave: 510leave:
517 if (res) 511 if (res)
@@ -533,26 +527,22 @@ static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
533 527
534 if (dlm_lock_on_list(&res->blocked, lock)) { 528 if (dlm_lock_on_list(&res->blocked, lock)) {
535 /* cancel this outright */ 529 /* cancel this outright */
536 lksb->status = DLM_NORMAL;
537 status = DLM_NORMAL; 530 status = DLM_NORMAL;
538 *actions = (DLM_UNLOCK_CALL_AST | 531 *actions = (DLM_UNLOCK_CALL_AST |
539 DLM_UNLOCK_REMOVE_LOCK); 532 DLM_UNLOCK_REMOVE_LOCK);
540 } else if (dlm_lock_on_list(&res->converting, lock)) { 533 } else if (dlm_lock_on_list(&res->converting, lock)) {
541 /* cancel the request, put back on granted */ 534 /* cancel the request, put back on granted */
542 lksb->status = DLM_NORMAL;
543 status = DLM_NORMAL; 535 status = DLM_NORMAL;
544 *actions = (DLM_UNLOCK_CALL_AST | 536 *actions = (DLM_UNLOCK_CALL_AST |
545 DLM_UNLOCK_REMOVE_LOCK | 537 DLM_UNLOCK_REMOVE_LOCK |
546 DLM_UNLOCK_REGRANT_LOCK | 538 DLM_UNLOCK_REGRANT_LOCK |
547 DLM_UNLOCK_CLEAR_CONVERT_TYPE); 539 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
548 } else if (dlm_lock_on_list(&res->granted, lock)) { 540 } else if (dlm_lock_on_list(&res->granted, lock)) {
549 /* too late, already granted. DLM_CANCELGRANT */ 541 /* too late, already granted. */
550 lksb->status = DLM_CANCELGRANT; 542 status = DLM_CANCELGRANT;
551 status = DLM_NORMAL;
552 *actions = DLM_UNLOCK_CALL_AST; 543 *actions = DLM_UNLOCK_CALL_AST;
553 } else { 544 } else {
554 mlog(ML_ERROR, "lock to cancel is not on any list!\n"); 545 mlog(ML_ERROR, "lock to cancel is not on any list!\n");
555 lksb->status = DLM_IVLOCKID;
556 status = DLM_IVLOCKID; 546 status = DLM_IVLOCKID;
557 *actions = 0; 547 *actions = 0;
558 } 548 }
@@ -569,13 +559,11 @@ static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
569 559
570 /* unlock request */ 560 /* unlock request */
571 if (!dlm_lock_on_list(&res->granted, lock)) { 561 if (!dlm_lock_on_list(&res->granted, lock)) {
572 lksb->status = DLM_DENIED;
573 status = DLM_DENIED; 562 status = DLM_DENIED;
574 dlm_error(status); 563 dlm_error(status);
575 *actions = 0; 564 *actions = 0;
576 } else { 565 } else {
577 /* unlock granted lock */ 566 /* unlock granted lock */
578 lksb->status = DLM_NORMAL;
579 status = DLM_NORMAL; 567 status = DLM_NORMAL;
580 *actions = (DLM_UNLOCK_FREE_LOCK | 568 *actions = (DLM_UNLOCK_FREE_LOCK |
581 DLM_UNLOCK_CALL_AST | 569 DLM_UNLOCK_CALL_AST |
@@ -632,6 +620,8 @@ retry:
632 620
633 spin_lock(&res->spinlock); 621 spin_lock(&res->spinlock);
634 is_master = (res->owner == dlm->node_num); 622 is_master = (res->owner == dlm->node_num);
623 if (flags & LKM_VALBLK && lock->ml.type != LKM_EXMODE)
624 flags &= ~LKM_VALBLK;
635 spin_unlock(&res->spinlock); 625 spin_unlock(&res->spinlock);
636 626
637 if (is_master) { 627 if (is_master) {
@@ -665,7 +655,7 @@ retry:
665 } 655 }
666 656
667 if (call_ast) { 657 if (call_ast) {
668 mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status); 658 mlog(0, "calling unlockast(%p, %d)\n", data, status);
669 if (is_master) { 659 if (is_master) {
670 /* it is possible that there is one last bast 660 /* it is possible that there is one last bast
671 * pending. make sure it is flushed, then 661 * pending. make sure it is flushed, then
@@ -677,9 +667,12 @@ retry:
677 wait_event(dlm->ast_wq, 667 wait_event(dlm->ast_wq,
678 dlm_lock_basts_flushed(dlm, lock)); 668 dlm_lock_basts_flushed(dlm, lock));
679 } 669 }
680 (*unlockast)(data, lksb->status); 670 (*unlockast)(data, status);
681 } 671 }
682 672
673 if (status == DLM_CANCELGRANT)
674 status = DLM_NORMAL;
675
683 if (status == DLM_NORMAL) { 676 if (status == DLM_NORMAL) {
684 mlog(0, "kicking the thread\n"); 677 mlog(0, "kicking the thread\n");
685 dlm_kick_thread(dlm, res); 678 dlm_kick_thread(dlm, res);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 0d1973ea32b0..1f17a4d08287 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -840,6 +840,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
840 840
841 mlog(0, "Allocating %u clusters for a new window.\n", 841 mlog(0, "Allocating %u clusters for a new window.\n",
842 ocfs2_local_alloc_window_bits(osb)); 842 ocfs2_local_alloc_window_bits(osb));
843
844 /* Instruct the allocation code to try the most recently used
845 * cluster group. We'll re-record the group used this pass
846 * below. */
847 ac->ac_last_group = osb->la_last_gd;
848
843 /* we used the generic suballoc reserve function, but we set 849 /* we used the generic suballoc reserve function, but we set
844 * everything up nicely, so there's no reason why we can't use 850 * everything up nicely, so there's no reason why we can't use
845 * the more specific cluster api to claim bits. */ 851 * the more specific cluster api to claim bits. */
@@ -852,6 +858,8 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
852 goto bail; 858 goto bail;
853 } 859 }
854 860
861 osb->la_last_gd = ac->ac_last_group;
862
855 la->la_bm_off = cpu_to_le32(cluster_off); 863 la->la_bm_off = cpu_to_le32(cluster_off);
856 alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); 864 alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
857 /* just in case... In the future when we find space ourselves, 865 /* just in case... In the future when we find space ourselves,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index cd4a6f253d13..0462a7f4e21b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -197,7 +197,6 @@ struct ocfs2_super
197 struct ocfs2_node_map recovery_map; 197 struct ocfs2_node_map recovery_map;
198 struct ocfs2_node_map umount_map; 198 struct ocfs2_node_map umount_map;
199 199
200 u32 num_clusters;
201 u64 root_blkno; 200 u64 root_blkno;
202 u64 system_dir_blkno; 201 u64 system_dir_blkno;
203 u64 bitmap_blkno; 202 u64 bitmap_blkno;
@@ -237,6 +236,7 @@ struct ocfs2_super
237 236
238 enum ocfs2_local_alloc_state local_alloc_state; 237 enum ocfs2_local_alloc_state local_alloc_state;
239 struct buffer_head *local_alloc_bh; 238 struct buffer_head *local_alloc_bh;
239 u64 la_last_gd;
240 240
241 /* Next two fields are for local node slot recovery during 241 /* Next two fields are for local node slot recovery during
242 * mount. */ 242 * mount. */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 195523090c87..9d91e66f51a9 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -70,12 +70,6 @@ static int ocfs2_block_group_search(struct inode *inode,
70 struct buffer_head *group_bh, 70 struct buffer_head *group_bh,
71 u32 bits_wanted, u32 min_bits, 71 u32 bits_wanted, u32 min_bits,
72 u16 *bit_off, u16 *bits_found); 72 u16 *bit_off, u16 *bits_found);
73static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
74 u32 bits_wanted,
75 u32 min_bits,
76 u16 *bit_off,
77 unsigned int *num_bits,
78 u64 *bg_blkno);
79static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 73static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
80 struct ocfs2_alloc_context *ac, 74 struct ocfs2_alloc_context *ac,
81 u32 bits_wanted, 75 u32 bits_wanted,
@@ -85,11 +79,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
85 u64 *bg_blkno); 79 u64 *bg_blkno);
86static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 80static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
87 int nr); 81 int nr);
88static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
89 struct buffer_head *bg_bh,
90 unsigned int bits_wanted,
91 u16 *bit_off,
92 u16 *bits_found);
93static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, 82static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
94 struct inode *alloc_inode, 83 struct inode *alloc_inode,
95 struct ocfs2_group_desc *bg, 84 struct ocfs2_group_desc *bg,
@@ -143,6 +132,64 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
143 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); 132 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
144} 133}
145 134
135/* somewhat more expensive than our other checks, so use sparingly. */
136static int ocfs2_check_group_descriptor(struct super_block *sb,
137 struct ocfs2_dinode *di,
138 struct ocfs2_group_desc *gd)
139{
140 unsigned int max_bits;
141
142 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
143 OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
144 return -EIO;
145 }
146
147 if (di->i_blkno != gd->bg_parent_dinode) {
148 ocfs2_error(sb, "Group descriptor # %llu has bad parent "
149 "pointer (%llu, expected %llu)",
150 (unsigned long long)le64_to_cpu(gd->bg_blkno),
151 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
152 (unsigned long long)le64_to_cpu(di->i_blkno));
153 return -EIO;
154 }
155
156 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
157 if (le16_to_cpu(gd->bg_bits) > max_bits) {
158 ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
159 (unsigned long long)le64_to_cpu(gd->bg_blkno),
160 le16_to_cpu(gd->bg_bits));
161 return -EIO;
162 }
163
164 if (le16_to_cpu(gd->bg_chain) >=
165 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
166 ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
167 (unsigned long long)le64_to_cpu(gd->bg_blkno),
168 le16_to_cpu(gd->bg_chain));
169 return -EIO;
170 }
171
172 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
173 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
174 "claims that %u are free",
175 (unsigned long long)le64_to_cpu(gd->bg_blkno),
176 le16_to_cpu(gd->bg_bits),
177 le16_to_cpu(gd->bg_free_bits_count));
178 return -EIO;
179 }
180
181 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
182 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
183 "max bitmap bits of %u",
184 (unsigned long long)le64_to_cpu(gd->bg_blkno),
185 le16_to_cpu(gd->bg_bits),
186 8 * le16_to_cpu(gd->bg_size));
187 return -EIO;
188 }
189
190 return 0;
191}
192
146static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, 193static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
147 struct inode *alloc_inode, 194 struct inode *alloc_inode,
148 struct buffer_head *bg_bh, 195 struct buffer_head *bg_bh,
@@ -663,6 +710,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
663static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, 710static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
664 struct buffer_head *bg_bh, 711 struct buffer_head *bg_bh,
665 unsigned int bits_wanted, 712 unsigned int bits_wanted,
713 unsigned int total_bits,
666 u16 *bit_off, 714 u16 *bit_off,
667 u16 *bits_found) 715 u16 *bits_found)
668{ 716{
@@ -679,10 +727,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
679 found = start = best_offset = best_size = 0; 727 found = start = best_offset = best_size = 0;
680 bitmap = bg->bg_bitmap; 728 bitmap = bg->bg_bitmap;
681 729
682 while((offset = ocfs2_find_next_zero_bit(bitmap, 730 while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
683 le16_to_cpu(bg->bg_bits), 731 if (offset == total_bits)
684 start)) != -1) {
685 if (offset == le16_to_cpu(bg->bg_bits))
686 break; 732 break;
687 733
688 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { 734 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
@@ -911,14 +957,35 @@ static int ocfs2_cluster_group_search(struct inode *inode,
911{ 957{
912 int search = -ENOSPC; 958 int search = -ENOSPC;
913 int ret; 959 int ret;
914 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; 960 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
915 u16 tmp_off, tmp_found; 961 u16 tmp_off, tmp_found;
962 unsigned int max_bits, gd_cluster_off;
916 963
917 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 964 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
918 965
919 if (bg->bg_free_bits_count) { 966 if (gd->bg_free_bits_count) {
967 max_bits = le16_to_cpu(gd->bg_bits);
968
969 /* Tail groups in cluster bitmaps which aren't cpg
970 * aligned are prone to partial extension by a failed
971 * fs resize. If the file system resize never got to
972 * update the dinode cluster count, then we don't want
973 * to trust any clusters past it, regardless of what
974 * the group descriptor says. */
975 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
976 le64_to_cpu(gd->bg_blkno));
977 if ((gd_cluster_off + max_bits) >
978 OCFS2_I(inode)->ip_clusters) {
979 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
980 mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
981 (unsigned long long)le64_to_cpu(gd->bg_blkno),
982 le16_to_cpu(gd->bg_bits),
983 OCFS2_I(inode)->ip_clusters, max_bits);
984 }
985
920 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 986 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
921 group_bh, bits_wanted, 987 group_bh, bits_wanted,
988 max_bits,
922 &tmp_off, &tmp_found); 989 &tmp_off, &tmp_found);
923 if (ret) 990 if (ret)
924 return ret; 991 return ret;
@@ -951,17 +1018,109 @@ static int ocfs2_block_group_search(struct inode *inode,
951 if (bg->bg_free_bits_count) 1018 if (bg->bg_free_bits_count)
952 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1019 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
953 group_bh, bits_wanted, 1020 group_bh, bits_wanted,
1021 le16_to_cpu(bg->bg_bits),
954 bit_off, bits_found); 1022 bit_off, bits_found);
955 1023
956 return ret; 1024 return ret;
957} 1025}
958 1026
1027static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1028 struct ocfs2_journal_handle *handle,
1029 struct buffer_head *di_bh,
1030 u32 num_bits,
1031 u16 chain)
1032{
1033 int ret;
1034 u32 tmp_used;
1035 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1036 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1037
1038 ret = ocfs2_journal_access(handle, inode, di_bh,
1039 OCFS2_JOURNAL_ACCESS_WRITE);
1040 if (ret < 0) {
1041 mlog_errno(ret);
1042 goto out;
1043 }
1044
1045 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1046 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1047 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1048
1049 ret = ocfs2_journal_dirty(handle, di_bh);
1050 if (ret < 0)
1051 mlog_errno(ret);
1052
1053out:
1054 return ret;
1055}
1056
1057static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1058 u32 bits_wanted,
1059 u32 min_bits,
1060 u16 *bit_off,
1061 unsigned int *num_bits,
1062 u64 gd_blkno,
1063 u16 *bits_left)
1064{
1065 int ret;
1066 u16 found;
1067 struct buffer_head *group_bh = NULL;
1068 struct ocfs2_group_desc *gd;
1069 struct inode *alloc_inode = ac->ac_inode;
1070 struct ocfs2_journal_handle *handle = ac->ac_handle;
1071
1072 ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
1073 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1074 if (ret < 0) {
1075 mlog_errno(ret);
1076 return ret;
1077 }
1078
1079 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1080 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
1081 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
1082 ret = -EIO;
1083 goto out;
1084 }
1085
1086 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1087 bit_off, &found);
1088 if (ret < 0) {
1089 if (ret != -ENOSPC)
1090 mlog_errno(ret);
1091 goto out;
1092 }
1093
1094 *num_bits = found;
1095
1096 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1097 *num_bits,
1098 le16_to_cpu(gd->bg_chain));
1099 if (ret < 0) {
1100 mlog_errno(ret);
1101 goto out;
1102 }
1103
1104 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1105 *bit_off, *num_bits);
1106 if (ret < 0)
1107 mlog_errno(ret);
1108
1109 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1110
1111out:
1112 brelse(group_bh);
1113
1114 return ret;
1115}
1116
959static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, 1117static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
960 u32 bits_wanted, 1118 u32 bits_wanted,
961 u32 min_bits, 1119 u32 min_bits,
962 u16 *bit_off, 1120 u16 *bit_off,
963 unsigned int *num_bits, 1121 unsigned int *num_bits,
964 u64 *bg_blkno) 1122 u64 *bg_blkno,
1123 u16 *bits_left)
965{ 1124{
966 int status; 1125 int status;
967 u16 chain, tmp_bits; 1126 u16 chain, tmp_bits;
@@ -988,9 +1147,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
988 goto bail; 1147 goto bail;
989 } 1148 }
990 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1149 bg = (struct ocfs2_group_desc *) group_bh->b_data;
991 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 1150 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
992 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); 1151 if (status) {
993 status = -EIO; 1152 mlog_errno(status);
994 goto bail; 1153 goto bail;
995 } 1154 }
996 1155
@@ -1018,9 +1177,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1018 goto bail; 1177 goto bail;
1019 } 1178 }
1020 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1179 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1021 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 1180 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1022 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); 1181 if (status) {
1023 status = -EIO; 1182 mlog_errno(status);
1024 goto bail; 1183 goto bail;
1025 } 1184 }
1026 } 1185 }
@@ -1099,6 +1258,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1099 (unsigned long long)fe->i_blkno); 1258 (unsigned long long)fe->i_blkno);
1100 1259
1101 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1260 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1261 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1102bail: 1262bail:
1103 if (group_bh) 1263 if (group_bh)
1104 brelse(group_bh); 1264 brelse(group_bh);
@@ -1120,6 +1280,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1120{ 1280{
1121 int status; 1281 int status;
1122 u16 victim, i; 1282 u16 victim, i;
1283 u16 bits_left = 0;
1284 u64 hint_blkno = ac->ac_last_group;
1123 struct ocfs2_chain_list *cl; 1285 struct ocfs2_chain_list *cl;
1124 struct ocfs2_dinode *fe; 1286 struct ocfs2_dinode *fe;
1125 1287
@@ -1146,6 +1308,28 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1146 goto bail; 1308 goto bail;
1147 } 1309 }
1148 1310
1311 if (hint_blkno) {
1312 /* Attempt to short-circuit the usual search mechanism
1313 * by jumping straight to the most recently used
1314 * allocation group. This helps us maintain some
1315 * contiguousness across allocations. */
1316 status = ocfs2_search_one_group(ac, bits_wanted, min_bits,
1317 bit_off, num_bits,
1318 hint_blkno, &bits_left);
1319 if (!status) {
1320 /* Be careful to update *bg_blkno here as the
1321 * caller is expecting it to be filled in, and
1322 * ocfs2_search_one_group() won't do that for
1323 * us. */
1324 *bg_blkno = hint_blkno;
1325 goto set_hint;
1326 }
1327 if (status < 0 && status != -ENOSPC) {
1328 mlog_errno(status);
1329 goto bail;
1330 }
1331 }
1332
1149 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; 1333 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1150 1334
1151 victim = ocfs2_find_victim_chain(cl); 1335 victim = ocfs2_find_victim_chain(cl);
@@ -1153,9 +1337,9 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1153 ac->ac_allow_chain_relink = 1; 1337 ac->ac_allow_chain_relink = 1;
1154 1338
1155 status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off, 1339 status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1156 num_bits, bg_blkno); 1340 num_bits, bg_blkno, &bits_left);
1157 if (!status) 1341 if (!status)
1158 goto bail; 1342 goto set_hint;
1159 if (status < 0 && status != -ENOSPC) { 1343 if (status < 0 && status != -ENOSPC) {
1160 mlog_errno(status); 1344 mlog_errno(status);
1161 goto bail; 1345 goto bail;
@@ -1177,8 +1361,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1177 1361
1178 ac->ac_chain = i; 1362 ac->ac_chain = i;
1179 status = ocfs2_search_chain(ac, bits_wanted, min_bits, 1363 status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1180 bit_off, num_bits, 1364 bit_off, num_bits, bg_blkno,
1181 bg_blkno); 1365 &bits_left);
1182 if (!status) 1366 if (!status)
1183 break; 1367 break;
1184 if (status < 0 && status != -ENOSPC) { 1368 if (status < 0 && status != -ENOSPC) {
@@ -1186,8 +1370,19 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1186 goto bail; 1370 goto bail;
1187 } 1371 }
1188 } 1372 }
1189bail:
1190 1373
1374set_hint:
1375 if (status != -ENOSPC) {
1376 /* If the next search of this group is not likely to
1377 * yield a suitable extent, then we reset the last
1378 * group hint so as to not waste a disk read */
1379 if (bits_left < min_bits)
1380 ac->ac_last_group = 0;
1381 else
1382 ac->ac_last_group = *bg_blkno;
1383 }
1384
1385bail:
1191 mlog_exit(status); 1386 mlog_exit(status);
1192 return status; 1387 return status;
1193} 1388}
@@ -1341,7 +1536,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1341{ 1536{
1342 int status; 1537 int status;
1343 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 1538 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1344 u64 bg_blkno; 1539 u64 bg_blkno = 0;
1345 u16 bg_bit_off; 1540 u16 bg_bit_off;
1346 1541
1347 mlog_entry_void(); 1542 mlog_entry_void();
@@ -1494,9 +1689,9 @@ static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1494 } 1689 }
1495 1690
1496 group = (struct ocfs2_group_desc *) group_bh->b_data; 1691 group = (struct ocfs2_group_desc *) group_bh->b_data;
1497 if (!OCFS2_IS_VALID_GROUP_DESC(group)) { 1692 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
1498 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group); 1693 if (status) {
1499 status = -EIO; 1694 mlog_errno(status);
1500 goto bail; 1695 goto bail;
1501 } 1696 }
1502 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); 1697 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a76c82a7ceac..c787838d1052 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -49,6 +49,8 @@ struct ocfs2_alloc_context {
49 u16 ac_chain; 49 u16 ac_chain;
50 int ac_allow_chain_relink; 50 int ac_allow_chain_relink;
51 group_search_t *ac_group_search; 51 group_search_t *ac_group_search;
52
53 u64 ac_last_group;
52}; 54};
53 55
54void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 56void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 382706a67ffd..d17e33e66a1e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1442,8 +1442,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
1442 1442
1443 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 1443 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
1444 1444
1445 /* We don't have a cluster lock on the bitmap here because
1446 * we're only interested in static information and the extra
1447 * complexity at mount time isn't worth it. Don't pass the
1448 * inode in to the read function though as we don't want it to
1449 * be put in the cache. */
1445 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, 1450 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
1446 inode); 1451 NULL);
1447 iput(inode); 1452 iput(inode);
1448 if (status < 0) { 1453 if (status < 0) {
1449 mlog_errno(status); 1454 mlog_errno(status);
@@ -1452,7 +1457,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1452 1457
1453 di = (struct ocfs2_dinode *) bitmap_bh->b_data; 1458 di = (struct ocfs2_dinode *) bitmap_bh->b_data;
1454 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); 1459 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
1455 osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
1456 brelse(bitmap_bh); 1460 brelse(bitmap_bh);
1457 mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n", 1461 mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
1458 (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg); 1462 (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
index c9a478099281..e478f1941831 100644
--- a/fs/partitions/Kconfig
+++ b/fs/partitions/Kconfig
@@ -99,7 +99,7 @@ config IBM_PARTITION
99 99
100config MAC_PARTITION 100config MAC_PARTITION
101 bool "Macintosh partition map support" if PARTITION_ADVANCED 101 bool "Macintosh partition map support" if PARTITION_ADVANCED
102 default y if MAC 102 default y if (MAC || PPC_PMAC)
103 help 103 help
104 Say Y here if you would like to use hard disks under Linux which 104 Say Y here if you would like to use hard disks under Linux which
105 were partitioned on a Macintosh. 105 were partitioned on a Macintosh.
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 839634026eb5..51c6a748df49 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -339,6 +339,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
339 p->start_sect = start; 339 p->start_sect = start;
340 p->nr_sects = len; 340 p->nr_sects = len;
341 p->partno = part; 341 p->partno = part;
342 p->policy = disk->policy;
342 343
343 if (isdigit(disk->kobj.name[strlen(disk->kobj.name)-1])) 344 if (isdigit(disk->kobj.name[strlen(disk->kobj.name)-1]))
344 snprintf(p->kobj.name,KOBJ_NAME_LEN,"%sp%d",disk->kobj.name,part); 345 snprintf(p->kobj.name,KOBJ_NAME_LEN,"%sp%d",disk->kobj.name,part);
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index abe91ca03edf..0a5927c806ca 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -74,7 +74,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
74 spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); 74 spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect);
75 for (i = 0; i < 8; i++, p++) { 75 for (i = 0; i < 8; i++, p++) {
76 unsigned long st_sector; 76 unsigned long st_sector;
77 int num_sectors; 77 unsigned int num_sectors;
78 78
79 st_sector = be32_to_cpu(p->start_cylinder) * spc; 79 st_sector = be32_to_cpu(p->start_cylinder) * spc;
80 num_sectors = be32_to_cpu(p->num_sectors); 80 num_sectors = be32_to_cpu(p->num_sectors);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7495d3e20775..0b615d62a159 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -74,6 +74,7 @@
74#include <linux/times.h> 74#include <linux/times.h>
75#include <linux/cpuset.h> 75#include <linux/cpuset.h>
76#include <linux/rcupdate.h> 76#include <linux/rcupdate.h>
77#include <linux/delayacct.h>
77 78
78#include <asm/uaccess.h> 79#include <asm/uaccess.h>
79#include <asm/pgtable.h> 80#include <asm/pgtable.h>
@@ -411,7 +412,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
411 412
412 res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ 413 res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
413%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ 414%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
414%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", 415%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %llu\n",
415 task->pid, 416 task->pid,
416 tcomm, 417 tcomm,
417 state, 418 state,
@@ -455,7 +456,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
455 task->exit_signal, 456 task->exit_signal,
456 task_cpu(task), 457 task_cpu(task),
457 task->rt_priority, 458 task->rt_priority,
458 task->policy); 459 task->policy,
460 (unsigned long long)delayacct_blkio_ticks(task));
459 if(mm) 461 if(mm)
460 mmput(mm); 462 mmput(mm);
461 return res; 463 return res;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 243a94af0427..fe8d55fb17cc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -551,6 +551,27 @@ static int proc_fd_access_allowed(struct inode *inode)
551 return allowed; 551 return allowed;
552} 552}
553 553
554static int proc_setattr(struct dentry *dentry, struct iattr *attr)
555{
556 int error;
557 struct inode *inode = dentry->d_inode;
558
559 if (attr->ia_valid & ATTR_MODE)
560 return -EPERM;
561
562 error = inode_change_ok(inode, attr);
563 if (!error) {
564 error = security_inode_setattr(dentry, attr);
565 if (!error)
566 error = inode_setattr(inode, attr);
567 }
568 return error;
569}
570
571static struct inode_operations proc_def_inode_operations = {
572 .setattr = proc_setattr,
573};
574
554extern struct seq_operations mounts_op; 575extern struct seq_operations mounts_op;
555struct proc_mounts { 576struct proc_mounts {
556 struct seq_file m; 577 struct seq_file m;
@@ -1111,7 +1132,8 @@ out:
1111 1132
1112static struct inode_operations proc_pid_link_inode_operations = { 1133static struct inode_operations proc_pid_link_inode_operations = {
1113 .readlink = proc_pid_readlink, 1134 .readlink = proc_pid_readlink,
1114 .follow_link = proc_pid_follow_link 1135 .follow_link = proc_pid_follow_link,
1136 .setattr = proc_setattr,
1115}; 1137};
1116 1138
1117static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) 1139static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
@@ -1285,6 +1307,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1285 ei = PROC_I(inode); 1307 ei = PROC_I(inode);
1286 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1308 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1287 inode->i_ino = fake_ino(task->pid, ino); 1309 inode->i_ino = fake_ino(task->pid, ino);
1310 inode->i_op = &proc_def_inode_operations;
1288 1311
1289 /* 1312 /*
1290 * grab the reference to task. 1313 * grab the reference to task.
@@ -1339,6 +1362,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1339 inode->i_uid = 0; 1362 inode->i_uid = 0;
1340 inode->i_gid = 0; 1363 inode->i_gid = 0;
1341 } 1364 }
1365 inode->i_mode &= ~(S_ISUID | S_ISGID);
1342 security_task_to_inode(task, inode); 1366 security_task_to_inode(task, inode);
1343 put_task_struct(task); 1367 put_task_struct(task);
1344 return 1; 1368 return 1;
@@ -1389,6 +1413,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1389 inode->i_uid = 0; 1413 inode->i_uid = 0;
1390 inode->i_gid = 0; 1414 inode->i_gid = 0;
1391 } 1415 }
1416 inode->i_mode &= ~(S_ISUID | S_ISGID);
1392 security_task_to_inode(task, inode); 1417 security_task_to_inode(task, inode);
1393 put_task_struct(task); 1418 put_task_struct(task);
1394 return 1; 1419 return 1;
@@ -1527,11 +1552,13 @@ static struct file_operations proc_task_operations = {
1527 */ 1552 */
1528static struct inode_operations proc_fd_inode_operations = { 1553static struct inode_operations proc_fd_inode_operations = {
1529 .lookup = proc_lookupfd, 1554 .lookup = proc_lookupfd,
1555 .setattr = proc_setattr,
1530}; 1556};
1531 1557
1532static struct inode_operations proc_task_inode_operations = { 1558static struct inode_operations proc_task_inode_operations = {
1533 .lookup = proc_task_lookup, 1559 .lookup = proc_task_lookup,
1534 .getattr = proc_task_getattr, 1560 .getattr = proc_task_getattr,
1561 .setattr = proc_setattr,
1535}; 1562};
1536 1563
1537#ifdef CONFIG_SECURITY 1564#ifdef CONFIG_SECURITY
@@ -1845,11 +1872,13 @@ static struct file_operations proc_tid_base_operations = {
1845static struct inode_operations proc_tgid_base_inode_operations = { 1872static struct inode_operations proc_tgid_base_inode_operations = {
1846 .lookup = proc_tgid_base_lookup, 1873 .lookup = proc_tgid_base_lookup,
1847 .getattr = pid_getattr, 1874 .getattr = pid_getattr,
1875 .setattr = proc_setattr,
1848}; 1876};
1849 1877
1850static struct inode_operations proc_tid_base_inode_operations = { 1878static struct inode_operations proc_tid_base_inode_operations = {
1851 .lookup = proc_tid_base_lookup, 1879 .lookup = proc_tid_base_lookup,
1852 .getattr = pid_getattr, 1880 .getattr = pid_getattr,
1881 .setattr = proc_setattr,
1853}; 1882};
1854 1883
1855#ifdef CONFIG_SECURITY 1884#ifdef CONFIG_SECURITY
@@ -1892,11 +1921,13 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir,
1892static struct inode_operations proc_tgid_attr_inode_operations = { 1921static struct inode_operations proc_tgid_attr_inode_operations = {
1893 .lookup = proc_tgid_attr_lookup, 1922 .lookup = proc_tgid_attr_lookup,
1894 .getattr = pid_getattr, 1923 .getattr = pid_getattr,
1924 .setattr = proc_setattr,
1895}; 1925};
1896 1926
1897static struct inode_operations proc_tid_attr_inode_operations = { 1927static struct inode_operations proc_tid_attr_inode_operations = {
1898 .lookup = proc_tid_attr_lookup, 1928 .lookup = proc_tid_attr_lookup,
1899 .getattr = pid_getattr, 1929 .getattr = pid_getattr,
1930 .setattr = proc_setattr,
1900}; 1931};
1901#endif 1932#endif
1902 1933
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6dcef089e18e..49dfb2ab783e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -192,7 +192,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
192{ 192{
193 struct inode * root_inode; 193 struct inode * root_inode;
194 194
195 s->s_flags |= MS_NODIRATIME; 195 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
196 s->s_blocksize = 1024; 196 s->s_blocksize = 1024;
197 s->s_blocksize_bits = 10; 197 s->s_blocksize_bits = 10;
198 s->s_magic = PROC_SUPER_MAGIC; 198 s->s_magic = PROC_SUPER_MAGIC;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 036d14d83627..6a984f64edd7 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -42,8 +42,6 @@ const struct file_operations proc_kcore_operations = {
42#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) 42#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
43#endif 43#endif
44 44
45#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
46
47/* An ELF note in memory */ 45/* An ELF note in memory */
48struct memelfnote 46struct memelfnote
49{ 47{
@@ -384,7 +382,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
384 */ 382 */
385 if (n) { 383 if (n) {
386 if (clear_user(buffer + tsz - n, 384 if (clear_user(buffer + tsz - n,
387 tsz - n)) 385 n))
388 return -EFAULT; 386 return -EFAULT;
389 } 387 }
390 } else { 388 } else {
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 9f2cfc30f9cf..942156225447 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -169,7 +169,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
169 "Mapped: %8lu kB\n" 169 "Mapped: %8lu kB\n"
170 "Slab: %8lu kB\n" 170 "Slab: %8lu kB\n"
171 "PageTables: %8lu kB\n" 171 "PageTables: %8lu kB\n"
172 "NFS Unstable: %8lu kB\n" 172 "NFS_Unstable: %8lu kB\n"
173 "Bounce: %8lu kB\n" 173 "Bounce: %8lu kB\n"
174 "CommitLimit: %8lu kB\n" 174 "CommitLimit: %8lu kB\n"
175 "Committed_AS: %8lu kB\n" 175 "Committed_AS: %8lu kB\n"
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index af69f28277b6..4616ed50ffcd 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -107,7 +107,7 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount *
107{ 107{
108 struct vm_list_struct *vml; 108 struct vm_list_struct *vml;
109 struct vm_area_struct *vma; 109 struct vm_area_struct *vma;
110 struct task_struct *task = proc_task(inode); 110 struct task_struct *task = get_proc_task(inode);
111 struct mm_struct *mm = get_task_mm(task); 111 struct mm_struct *mm = get_task_mm(task);
112 int result = -ENOENT; 112 int result = -ENOENT;
113 113
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 99fffc9e1bfd..677139b48e00 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -283,9 +283,9 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
283 283
284/*****************************************************************************/ 284/*****************************************************************************/
285/* 285/*
286 * set up a mapping 286 * set up a mapping for shared memory segments
287 */ 287 */
288int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) 288int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
289{ 289{
290 return 0; 290 return vma->vm_flags & VM_SHARED ? 0 : -ENOSYS;
291} 291}
diff --git a/fs/read_write.c b/fs/read_write.c
index 5bc0e9234f9d..d4cb3183c99c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -436,7 +436,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
436 return seg; 436 return seg;
437} 437}
438 438
439EXPORT_SYMBOL(iov_shorten); 439EXPORT_UNUSED_SYMBOL(iov_shorten); /* June 2006 */
440 440
441/* A write operation does a read from user space and vice versa */ 441/* A write operation does a read from user space and vice versa */
442#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) 442#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 752cea12e30f..1627edd50810 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -48,8 +48,8 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
48 return 0; 48 return 0;
49 } 49 }
50 50
51 reiserfs_write_lock(inode->i_sb);
52 mutex_lock(&inode->i_mutex); 51 mutex_lock(&inode->i_mutex);
52 reiserfs_write_lock(inode->i_sb);
53 /* freeing preallocation only involves relogging blocks that 53 /* freeing preallocation only involves relogging blocks that
54 * are already in the current transaction. preallocation gets 54 * are already in the current transaction. preallocation gets
55 * freed at the end of each transaction, so it is impossible for 55 * freed at the end of each transaction, so it is impossible for
@@ -860,8 +860,12 @@ static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_han
860 // this sets the proper flags for O_SYNC to trigger a commit 860 // this sets the proper flags for O_SYNC to trigger a commit
861 mark_inode_dirty(inode); 861 mark_inode_dirty(inode);
862 reiserfs_write_unlock(inode->i_sb); 862 reiserfs_write_unlock(inode->i_sb);
863 } else 863 } else {
864 reiserfs_write_lock(inode->i_sb);
865 reiserfs_update_inode_transaction(inode);
864 mark_inode_dirty(inode); 866 mark_inode_dirty(inode);
867 reiserfs_write_unlock(inode->i_sb);
868 }
865 869
866 sd_update = 1; 870 sd_update = 1;
867 } 871 }
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 12dfdcfbee3d..52f1e2136546 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -39,14 +39,10 @@ void reiserfs_delete_inode(struct inode *inode)
39 39
40 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ 40 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
41 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ 41 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
42 mutex_lock(&inode->i_mutex);
43
44 reiserfs_delete_xattrs(inode); 42 reiserfs_delete_xattrs(inode);
45 43
46 if (journal_begin(&th, inode->i_sb, jbegin_count)) { 44 if (journal_begin(&th, inode->i_sb, jbegin_count))
47 mutex_unlock(&inode->i_mutex);
48 goto out; 45 goto out;
49 }
50 reiserfs_update_inode_transaction(inode); 46 reiserfs_update_inode_transaction(inode);
51 47
52 err = reiserfs_delete_object(&th, inode); 48 err = reiserfs_delete_object(&th, inode);
@@ -57,12 +53,8 @@ void reiserfs_delete_inode(struct inode *inode)
57 if (!err) 53 if (!err)
58 DQUOT_FREE_INODE(inode); 54 DQUOT_FREE_INODE(inode);
59 55
60 if (journal_end(&th, inode->i_sb, jbegin_count)) { 56 if (journal_end(&th, inode->i_sb, jbegin_count))
61 mutex_unlock(&inode->i_mutex);
62 goto out; 57 goto out;
63 }
64
65 mutex_unlock(&inode->i_mutex);
66 58
67 /* check return value from reiserfs_delete_object after 59 /* check return value from reiserfs_delete_object after
68 * ending the transaction 60 * ending the transaction
@@ -2348,6 +2340,7 @@ static int reiserfs_write_full_page(struct page *page,
2348 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; 2340 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2349 int error = 0; 2341 int error = 0;
2350 unsigned long block; 2342 unsigned long block;
2343 sector_t last_block;
2351 struct buffer_head *head, *bh; 2344 struct buffer_head *head, *bh;
2352 int partial = 0; 2345 int partial = 0;
2353 int nr = 0; 2346 int nr = 0;
@@ -2395,10 +2388,19 @@ static int reiserfs_write_full_page(struct page *page,
2395 } 2388 }
2396 bh = head; 2389 bh = head;
2397 block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); 2390 block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
2391 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
2398 /* first map all the buffers, logging any direct items we find */ 2392 /* first map all the buffers, logging any direct items we find */
2399 do { 2393 do {
2400 if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) || 2394 if (block > last_block) {
2401 (buffer_mapped(bh) 2395 /*
2396 * This can happen when the block size is less than
2397 * the page size. The corresponding bytes in the page
2398 * were zero filled above
2399 */
2400 clear_buffer_dirty(bh);
2401 set_buffer_uptodate(bh);
2402 } else if ((checked || buffer_dirty(bh)) &&
2403 (!buffer_mapped(bh) || (buffer_mapped(bh)
2402 && bh->b_blocknr == 2404 && bh->b_blocknr ==
2403 0))) { 2405 0))) {
2404 /* not mapped yet, or it points to a direct item, search 2406 /* not mapped yet, or it points to a direct item, search
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 745c88100895..a986b5e1e288 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -116,12 +116,12 @@ static int reiserfs_unpack(struct inode *inode, struct file *filp)
116 if (REISERFS_I(inode)->i_flags & i_nopack_mask) { 116 if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
117 return 0; 117 return 0;
118 } 118 }
119 reiserfs_write_lock(inode->i_sb);
120 119
121 /* we need to make sure nobody is changing the file size beneath 120 /* we need to make sure nobody is changing the file size beneath
122 ** us 121 ** us
123 */ 122 */
124 mutex_lock(&inode->i_mutex); 123 mutex_lock(&inode->i_mutex);
124 reiserfs_write_lock(inode->i_sb);
125 125
126 write_from = inode->i_size & (blocksize - 1); 126 write_from = inode->i_size & (blocksize - 1);
127 /* if we are on a block boundary, we are already unpacked. */ 127 /* if we are on a block boundary, we are already unpacked. */
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 5d8a8cfebc70..c533ec1bcaec 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -492,9 +492,17 @@ static void add_file(struct super_block *sb, char *name,
492 492
493int reiserfs_proc_info_init(struct super_block *sb) 493int reiserfs_proc_info_init(struct super_block *sb)
494{ 494{
495 char b[BDEVNAME_SIZE];
496 char *s;
497
498 /* Some block devices use /'s */
499 strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE);
500 s = strchr(b, '/');
501 if (s)
502 *s = '!';
503
495 spin_lock_init(&__PINFO(sb).lock); 504 spin_lock_init(&__PINFO(sb).lock);
496 REISERFS_SB(sb)->procdir = 505 REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root);
497 proc_mkdir(reiserfs_bdevname(sb), proc_info_root);
498 if (REISERFS_SB(sb)->procdir) { 506 if (REISERFS_SB(sb)->procdir) {
499 REISERFS_SB(sb)->procdir->owner = THIS_MODULE; 507 REISERFS_SB(sb)->procdir->owner = THIS_MODULE;
500 REISERFS_SB(sb)->procdir->data = sb; 508 REISERFS_SB(sb)->procdir->data = sb;
@@ -508,13 +516,22 @@ int reiserfs_proc_info_init(struct super_block *sb)
508 return 0; 516 return 0;
509 } 517 }
510 reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s", 518 reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s",
511 proc_info_root_name, reiserfs_bdevname(sb)); 519 proc_info_root_name, b);
512 return 1; 520 return 1;
513} 521}
514 522
515int reiserfs_proc_info_done(struct super_block *sb) 523int reiserfs_proc_info_done(struct super_block *sb)
516{ 524{
517 struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; 525 struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
526 char b[BDEVNAME_SIZE];
527 char *s;
528
529 /* Some block devices use /'s */
530 strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE);
531 s = strchr(b, '/');
532 if (s)
533 *s = '!';
534
518 if (de) { 535 if (de) {
519 remove_proc_entry("journal", de); 536 remove_proc_entry("journal", de);
520 remove_proc_entry("oidmap", de); 537 remove_proc_entry("oidmap", de);
@@ -528,7 +545,7 @@ int reiserfs_proc_info_done(struct super_block *sb)
528 __PINFO(sb).exiting = 1; 545 __PINFO(sb).exiting = 1;
529 spin_unlock(&__PINFO(sb).lock); 546 spin_unlock(&__PINFO(sb).lock);
530 if (proc_info_root) { 547 if (proc_info_root) {
531 remove_proc_entry(reiserfs_bdevname(sb), proc_info_root); 548 remove_proc_entry(b, proc_info_root);
532 REISERFS_SB(sb)->procdir = NULL; 549 REISERFS_SB(sb)->procdir = NULL;
533 } 550 }
534 return 0; 551 return 0;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 28eb3c886034..5567328f1041 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2203,7 +2203,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
2203 size_t towrite = len; 2203 size_t towrite = len;
2204 struct buffer_head tmp_bh, *bh; 2204 struct buffer_head tmp_bh, *bh;
2205 2205
2206 mutex_lock(&inode->i_mutex); 2206 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2207 while (towrite > 0) { 2207 while (towrite > 0) {
2208 tocopy = sb->s_blocksize - offset < towrite ? 2208 tocopy = sb->s_blocksize - offset < towrite ?
2209 sb->s_blocksize - offset : towrite; 2209 sb->s_blocksize - offset : towrite;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 39fedaa88a0c..d935fb9394e3 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -424,7 +424,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf)
424 int res = -ENOTDIR; 424 int res = -ENOTDIR;
425 if (!file->f_op || !file->f_op->readdir) 425 if (!file->f_op || !file->f_op->readdir)
426 goto out; 426 goto out;
427 mutex_lock(&inode->i_mutex); 427 mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR);
428// down(&inode->i_zombie); 428// down(&inode->i_zombie);
429 res = -ENOENT; 429 res = -ENOENT;
430 if (!IS_DEADDIR(inode)) { 430 if (!IS_DEADDIR(inode)) {
diff --git a/fs/splice.c b/fs/splice.c
index 05fd2787be98..684bca3d3a10 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1307,6 +1307,85 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1307} 1307}
1308 1308
1309/* 1309/*
1310 * Make sure there's data to read. Wait for input if we can, otherwise
1311 * return an appropriate error.
1312 */
1313static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1314{
1315 int ret;
1316
1317 /*
1318 * Check ->nrbufs without the inode lock first. This function
1319 * is speculative anyways, so missing one is ok.
1320 */
1321 if (pipe->nrbufs)
1322 return 0;
1323
1324 ret = 0;
1325 mutex_lock(&pipe->inode->i_mutex);
1326
1327 while (!pipe->nrbufs) {
1328 if (signal_pending(current)) {
1329 ret = -ERESTARTSYS;
1330 break;
1331 }
1332 if (!pipe->writers)
1333 break;
1334 if (!pipe->waiting_writers) {
1335 if (flags & SPLICE_F_NONBLOCK) {
1336 ret = -EAGAIN;
1337 break;
1338 }
1339 }
1340 pipe_wait(pipe);
1341 }
1342
1343 mutex_unlock(&pipe->inode->i_mutex);
1344 return ret;
1345}
1346
1347/*
1348 * Make sure there's writeable room. Wait for room if we can, otherwise
1349 * return an appropriate error.
1350 */
1351static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1352{
1353 int ret;
1354
1355 /*
1356 * Check ->nrbufs without the inode lock first. This function
1357 * is speculative anyways, so missing one is ok.
1358 */
1359 if (pipe->nrbufs < PIPE_BUFFERS)
1360 return 0;
1361
1362 ret = 0;
1363 mutex_lock(&pipe->inode->i_mutex);
1364
1365 while (pipe->nrbufs >= PIPE_BUFFERS) {
1366 if (!pipe->readers) {
1367 send_sig(SIGPIPE, current, 0);
1368 ret = -EPIPE;
1369 break;
1370 }
1371 if (flags & SPLICE_F_NONBLOCK) {
1372 ret = -EAGAIN;
1373 break;
1374 }
1375 if (signal_pending(current)) {
1376 ret = -ERESTARTSYS;
1377 break;
1378 }
1379 pipe->waiting_writers++;
1380 pipe_wait(pipe);
1381 pipe->waiting_writers--;
1382 }
1383
1384 mutex_unlock(&pipe->inode->i_mutex);
1385 return ret;
1386}
1387
1388/*
1310 * Link contents of ipipe to opipe. 1389 * Link contents of ipipe to opipe.
1311 */ 1390 */
1312static int link_pipe(struct pipe_inode_info *ipipe, 1391static int link_pipe(struct pipe_inode_info *ipipe,
@@ -1314,9 +1393,7 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1314 size_t len, unsigned int flags) 1393 size_t len, unsigned int flags)
1315{ 1394{
1316 struct pipe_buffer *ibuf, *obuf; 1395 struct pipe_buffer *ibuf, *obuf;
1317 int ret, do_wakeup, i, ipipe_first; 1396 int ret = 0, i = 0, nbuf;
1318
1319 ret = do_wakeup = ipipe_first = 0;
1320 1397
1321 /* 1398 /*
1322 * Potential ABBA deadlock, work around it by ordering lock 1399 * Potential ABBA deadlock, work around it by ordering lock
@@ -1324,126 +1401,62 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1324 * could deadlock (one doing tee from A -> B, the other from B -> A). 1401 * could deadlock (one doing tee from A -> B, the other from B -> A).
1325 */ 1402 */
1326 if (ipipe->inode < opipe->inode) { 1403 if (ipipe->inode < opipe->inode) {
1327 ipipe_first = 1; 1404 mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_PARENT);
1328 mutex_lock(&ipipe->inode->i_mutex); 1405 mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_CHILD);
1329 mutex_lock(&opipe->inode->i_mutex);
1330 } else { 1406 } else {
1331 mutex_lock(&opipe->inode->i_mutex); 1407 mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_PARENT);
1332 mutex_lock(&ipipe->inode->i_mutex); 1408 mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_CHILD);
1333 } 1409 }
1334 1410
1335 for (i = 0;; i++) { 1411 do {
1336 if (!opipe->readers) { 1412 if (!opipe->readers) {
1337 send_sig(SIGPIPE, current, 0); 1413 send_sig(SIGPIPE, current, 0);
1338 if (!ret) 1414 if (!ret)
1339 ret = -EPIPE; 1415 ret = -EPIPE;
1340 break; 1416 break;
1341 } 1417 }
1342 if (ipipe->nrbufs - i) {
1343 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1344 1418
1345 /* 1419 /*
1346 * If we have room, fill this buffer 1420 * If we have iterated all input buffers or ran out of
1347 */ 1421 * output room, break.
1348 if (opipe->nrbufs < PIPE_BUFFERS) { 1422 */
1349 int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1423 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1350 1424 break;
1351 /*
1352 * Get a reference to this pipe buffer,
1353 * so we can copy the contents over.
1354 */
1355 ibuf->ops->get(ipipe, ibuf);
1356
1357 obuf = opipe->bufs + nbuf;
1358 *obuf = *ibuf;
1359
1360 /*
1361 * Don't inherit the gift flag, we need to
1362 * prevent multiple steals of this page.
1363 */
1364 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1365
1366 if (obuf->len > len)
1367 obuf->len = len;
1368
1369 opipe->nrbufs++;
1370 do_wakeup = 1;
1371 ret += obuf->len;
1372 len -= obuf->len;
1373
1374 if (!len)
1375 break;
1376 if (opipe->nrbufs < PIPE_BUFFERS)
1377 continue;
1378 }
1379
1380 /*
1381 * We have input available, but no output room.
1382 * If we already copied data, return that. If we
1383 * need to drop the opipe lock, it must be ordered
1384 * last to avoid deadlocks.
1385 */
1386 if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) {
1387 if (!ret)
1388 ret = -EAGAIN;
1389 break;
1390 }
1391 if (signal_pending(current)) {
1392 if (!ret)
1393 ret = -ERESTARTSYS;
1394 break;
1395 }
1396 if (do_wakeup) {
1397 smp_mb();
1398 if (waitqueue_active(&opipe->wait))
1399 wake_up_interruptible(&opipe->wait);
1400 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1401 do_wakeup = 0;
1402 }
1403 1425
1404 opipe->waiting_writers++; 1426 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1405 pipe_wait(opipe); 1427 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1406 opipe->waiting_writers--;
1407 continue;
1408 }
1409 1428
1410 /* 1429 /*
1411 * No input buffers, do the usual checks for available 1430 * Get a reference to this pipe buffer,
1412 * writers and blocking and wait if necessary 1431 * so we can copy the contents over.
1413 */ 1432 */
1414 if (!ipipe->writers) 1433 ibuf->ops->get(ipipe, ibuf);
1415 break; 1434
1416 if (!ipipe->waiting_writers) { 1435 obuf = opipe->bufs + nbuf;
1417 if (ret) 1436 *obuf = *ibuf;
1418 break; 1437
1419 }
1420 /* 1438 /*
1421 * pipe_wait() drops the ipipe mutex. To avoid deadlocks 1439 * Don't inherit the gift flag, we need to
1422 * with another process, we can only safely do that if 1440 * prevent multiple steals of this page.
1423 * the ipipe lock is ordered last.
1424 */ 1441 */
1425 if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { 1442 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1426 if (!ret)
1427 ret = -EAGAIN;
1428 break;
1429 }
1430 if (signal_pending(current)) {
1431 if (!ret)
1432 ret = -ERESTARTSYS;
1433 break;
1434 }
1435 1443
1436 if (waitqueue_active(&ipipe->wait)) 1444 if (obuf->len > len)
1437 wake_up_interruptible_sync(&ipipe->wait); 1445 obuf->len = len;
1438 kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT);
1439 1446
1440 pipe_wait(ipipe); 1447 opipe->nrbufs++;
1441 } 1448 ret += obuf->len;
1449 len -= obuf->len;
1450 i++;
1451 } while (len);
1442 1452
1443 mutex_unlock(&ipipe->inode->i_mutex); 1453 mutex_unlock(&ipipe->inode->i_mutex);
1444 mutex_unlock(&opipe->inode->i_mutex); 1454 mutex_unlock(&opipe->inode->i_mutex);
1445 1455
1446 if (do_wakeup) { 1456 /*
1457 * If we put data in the output pipe, wakeup any potential readers.
1458 */
1459 if (ret > 0) {
1447 smp_mb(); 1460 smp_mb();
1448 if (waitqueue_active(&opipe->wait)) 1461 if (waitqueue_active(&opipe->wait))
1449 wake_up_interruptible(&opipe->wait); 1462 wake_up_interruptible(&opipe->wait);
@@ -1464,14 +1477,29 @@ static long do_tee(struct file *in, struct file *out, size_t len,
1464{ 1477{
1465 struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; 1478 struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe;
1466 struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; 1479 struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe;
1480 int ret = -EINVAL;
1467 1481
1468 /* 1482 /*
1469 * Link ipipe to the two output pipes, consuming as we go along. 1483 * Duplicate the contents of ipipe to opipe without actually
1484 * copying the data.
1470 */ 1485 */
1471 if (ipipe && opipe) 1486 if (ipipe && opipe && ipipe != opipe) {
1472 return link_pipe(ipipe, opipe, len, flags); 1487 /*
1488 * Keep going, unless we encounter an error. The ipipe/opipe
1489 * ordering doesn't really matter.
1490 */
1491 ret = link_ipipe_prep(ipipe, flags);
1492 if (!ret) {
1493 ret = link_opipe_prep(opipe, flags);
1494 if (!ret) {
1495 ret = link_pipe(ipipe, opipe, len, flags);
1496 if (!ret && (flags & SPLICE_F_NONBLOCK))
1497 ret = -EAGAIN;
1498 }
1499 }
1500 }
1473 1501
1474 return -EINVAL; 1502 return ret;
1475} 1503}
1476 1504
1477asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) 1505asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
diff --git a/fs/super.c b/fs/super.c
index 9b780c42d845..5c4c94d5495e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -49,11 +49,12 @@ DEFINE_SPINLOCK(sb_lock);
49 49
50/** 50/**
51 * alloc_super - create new superblock 51 * alloc_super - create new superblock
52 * @type: filesystem type superblock should belong to
52 * 53 *
53 * Allocates and initializes a new &struct super_block. alloc_super() 54 * Allocates and initializes a new &struct super_block. alloc_super()
54 * returns a pointer new superblock or %NULL if allocation had failed. 55 * returns a pointer new superblock or %NULL if allocation had failed.
55 */ 56 */
56static struct super_block *alloc_super(void) 57static struct super_block *alloc_super(struct file_system_type *type)
57{ 58{
58 struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); 59 struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
59 static struct super_operations default_op; 60 static struct super_operations default_op;
@@ -72,6 +73,13 @@ static struct super_block *alloc_super(void)
72 INIT_LIST_HEAD(&s->s_inodes); 73 INIT_LIST_HEAD(&s->s_inodes);
73 init_rwsem(&s->s_umount); 74 init_rwsem(&s->s_umount);
74 mutex_init(&s->s_lock); 75 mutex_init(&s->s_lock);
76 lockdep_set_class(&s->s_umount, &type->s_umount_key);
77 /*
78 * The locking rules for s_lock are up to the
79 * filesystem. For example ext3fs has different
80 * lock ordering than usbfs:
81 */
82 lockdep_set_class(&s->s_lock, &type->s_lock_key);
75 down_write(&s->s_umount); 83 down_write(&s->s_umount);
76 s->s_count = S_BIAS; 84 s->s_count = S_BIAS;
77 atomic_set(&s->s_active, 1); 85 atomic_set(&s->s_active, 1);
@@ -295,7 +303,7 @@ retry:
295 } 303 }
296 if (!s) { 304 if (!s) {
297 spin_unlock(&sb_lock); 305 spin_unlock(&sb_lock);
298 s = alloc_super(); 306 s = alloc_super(type);
299 if (!s) 307 if (!s)
300 return ERR_PTR(-ENOMEM); 308 return ERR_PTR(-ENOMEM);
301 goto retry; 309 goto retry;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 5e0e31cc46f5..9889e54e1f13 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -109,6 +109,17 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
109 inode->i_ctime = iattr->ia_ctime; 109 inode->i_ctime = iattr->ia_ctime;
110} 110}
111 111
112
113/*
114 * sysfs has a different i_mutex lock order behavior for i_mutex than other
115 * filesystems; sysfs i_mutex is called in many places with subsystem locks
116 * held. At the same time, many of the VFS locking rules do not apply to
117 * sysfs at all (cross directory rename for example). To untangle this mess
118 * (which gives false positives in lockdep), we're giving sysfs inodes their
119 * own class for i_mutex.
120 */
121static struct lock_class_key sysfs_inode_imutex_key;
122
112struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) 123struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd)
113{ 124{
114 struct inode * inode = new_inode(sysfs_sb); 125 struct inode * inode = new_inode(sysfs_sb);
@@ -118,6 +129,7 @@ struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd)
118 inode->i_mapping->a_ops = &sysfs_aops; 129 inode->i_mapping->a_ops = &sysfs_aops;
119 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 130 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
120 inode->i_op = &sysfs_inode_operations; 131 inode->i_op = &sysfs_inode_operations;
132 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
121 133
122 if (sd->s_iattr) { 134 if (sd->s_iattr) {
123 /* sysfs_dirent has non-default attributes 135 /* sysfs_dirent has non-default attributes
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 3873c672cb4c..33323473e3c4 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -75,6 +75,12 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
75 } 75 }
76 *err = -ENOSPC; 76 *err = -ENOSPC;
77 77
78 UDF_I_UNIQUE(inode) = 0;
79 UDF_I_LENEXTENTS(inode) = 0;
80 UDF_I_NEXT_ALLOC_BLOCK(inode) = 0;
81 UDF_I_NEXT_ALLOC_GOAL(inode) = 0;
82 UDF_I_STRAT4096(inode) = 0;
83
78 block = udf_new_block(dir->i_sb, NULL, UDF_I_LOCATION(dir).partitionReferenceNum, 84 block = udf_new_block(dir->i_sb, NULL, UDF_I_LOCATION(dir).partitionReferenceNum,
79 start, err); 85 start, err);
80 if (*err) 86 if (*err)
@@ -84,11 +90,6 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
84 } 90 }
85 91
86 mutex_lock(&sbi->s_alloc_mutex); 92 mutex_lock(&sbi->s_alloc_mutex);
87 UDF_I_UNIQUE(inode) = 0;
88 UDF_I_LENEXTENTS(inode) = 0;
89 UDF_I_NEXT_ALLOC_BLOCK(inode) = 0;
90 UDF_I_NEXT_ALLOC_GOAL(inode) = 0;
91 UDF_I_STRAT4096(inode) = 0;
92 if (UDF_SB_LVIDBH(sb)) 93 if (UDF_SB_LVIDBH(sb))
93 { 94 {
94 struct logicalVolHeaderDesc *lvhd; 95 struct logicalVolHeaderDesc *lvhd;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4df822c881b6..fcce1a21a51b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -115,6 +115,13 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
115 ei = (struct udf_inode_info *)kmem_cache_alloc(udf_inode_cachep, SLAB_KERNEL); 115 ei = (struct udf_inode_info *)kmem_cache_alloc(udf_inode_cachep, SLAB_KERNEL);
116 if (!ei) 116 if (!ei)
117 return NULL; 117 return NULL;
118
119 ei->i_unique = 0;
120 ei->i_lenExtents = 0;
121 ei->i_next_alloc_block = 0;
122 ei->i_next_alloc_goal = 0;
123 ei->i_strat4096 = 0;
124
118 return &ei->vfs_inode; 125 return &ei->vfs_inode;
119} 126}
120 127
@@ -1652,7 +1659,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1652 iput(inode); 1659 iput(inode);
1653 goto error_out; 1660 goto error_out;
1654 } 1661 }
1655 sb->s_maxbytes = MAX_LFS_FILESIZE; 1662 sb->s_maxbytes = 1<<30;
1656 return 0; 1663 return 0;
1657 1664
1658error_out: 1665error_out:
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index e1b0e8cfecb4..0abd66ce36ea 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -239,37 +239,51 @@ void udf_truncate_extents(struct inode * inode)
239 { 239 {
240 if (offset) 240 if (offset)
241 { 241 {
242 extoffset -= adsize; 242 /*
243 etype = udf_next_aext(inode, &bloc, &extoffset, &eloc, &elen, &bh, 1); 243 * OK, there is not extent covering inode->i_size and
244 if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) 244 * no extent above inode->i_size => truncate is
245 { 245 * extending the file by 'offset'.
246 extoffset -= adsize; 246 */
247 elen = EXT_NOT_RECORDED_NOT_ALLOCATED | (elen + offset); 247 if ((!bh && extoffset == udf_file_entry_alloc_offset(inode)) ||
248 udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 0); 248 (bh && extoffset == sizeof(struct allocExtDesc))) {
249 /* File has no extents at all! */
250 memset(&eloc, 0x00, sizeof(kernel_lb_addr));
251 elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset;
252 udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1);
249 } 253 }
250 else if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) 254 else {
251 {
252 kernel_lb_addr neloc = { 0, 0 };
253 extoffset -= adsize; 255 extoffset -= adsize;
254 nelen = EXT_NOT_RECORDED_NOT_ALLOCATED | 256 etype = udf_next_aext(inode, &bloc, &extoffset, &eloc, &elen, &bh, 1);
255 ((elen + offset + inode->i_sb->s_blocksize - 1) & 257 if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))
256 ~(inode->i_sb->s_blocksize - 1)); 258 {
257 udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 1); 259 extoffset -= adsize;
258 udf_add_aext(inode, &bloc, &extoffset, eloc, (etype << 30) | elen, &bh, 1); 260 elen = EXT_NOT_RECORDED_NOT_ALLOCATED | (elen + offset);
259 } 261 udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 0);
260 else 262 }
261 { 263 else if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30))
262 if (elen & (inode->i_sb->s_blocksize - 1))
263 { 264 {
265 kernel_lb_addr neloc = { 0, 0 };
264 extoffset -= adsize; 266 extoffset -= adsize;
265 elen = EXT_RECORDED_ALLOCATED | 267 nelen = EXT_NOT_RECORDED_NOT_ALLOCATED |
266 ((elen + inode->i_sb->s_blocksize - 1) & 268 ((elen + offset + inode->i_sb->s_blocksize - 1) &
267 ~(inode->i_sb->s_blocksize - 1)); 269 ~(inode->i_sb->s_blocksize - 1));
268 udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 1); 270 udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 1);
271 udf_add_aext(inode, &bloc, &extoffset, eloc, (etype << 30) | elen, &bh, 1);
272 }
273 else
274 {
275 if (elen & (inode->i_sb->s_blocksize - 1))
276 {
277 extoffset -= adsize;
278 elen = EXT_RECORDED_ALLOCATED |
279 ((elen + inode->i_sb->s_blocksize - 1) &
280 ~(inode->i_sb->s_blocksize - 1));
281 udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 1);
282 }
283 memset(&eloc, 0x00, sizeof(kernel_lb_addr));
284 elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset;
285 udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1);
269 } 286 }
270 memset(&eloc, 0x00, sizeof(kernel_lb_addr));
271 elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset;
272 udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1);
273 } 287 }
274 } 288 }
275 } 289 }
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index b01804baa120..b82381475779 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -248,7 +248,7 @@ static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk,
248 248
249 if (likely(cur_index != index)) { 249 if (likely(cur_index != index)) {
250 page = ufs_get_locked_page(mapping, index); 250 page = ufs_get_locked_page(mapping, index);
251 if (IS_ERR(page)) 251 if (!page || IS_ERR(page)) /* it was truncated or EIO */
252 continue; 252 continue;
253 } else 253 } else
254 page = locked_page; 254 page = locked_page;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index e7c8615beb65..30c6e8a9446c 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -169,18 +169,20 @@ static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh)
169 169
170static struct buffer_head * 170static struct buffer_head *
171ufs_clear_frags(struct inode *inode, sector_t beg, 171ufs_clear_frags(struct inode *inode, sector_t beg,
172 unsigned int n) 172 unsigned int n, sector_t want)
173{ 173{
174 struct buffer_head *res, *bh; 174 struct buffer_head *res = NULL, *bh;
175 sector_t end = beg + n; 175 sector_t end = beg + n;
176 176
177 res = sb_getblk(inode->i_sb, beg); 177 for (; beg < end; ++beg) {
178 ufs_clear_frag(inode, res);
179 for (++beg; beg < end; ++beg) {
180 bh = sb_getblk(inode->i_sb, beg); 178 bh = sb_getblk(inode->i_sb, beg);
181 ufs_clear_frag(inode, bh); 179 ufs_clear_frag(inode, bh);
182 brelse(bh); 180 if (want != beg)
181 brelse(bh);
182 else
183 res = bh;
183 } 184 }
185 BUG_ON(!res);
184 return res; 186 return res;
185} 187}
186 188
@@ -265,7 +267,9 @@ repeat:
265 lastfrag = ufsi->i_lastfrag; 267 lastfrag = ufsi->i_lastfrag;
266 268
267 } 269 }
268 goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb; 270 tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]);
271 if (tmp)
272 goal = tmp + uspi->s_fpb;
269 tmp = ufs_new_fragments (inode, p, fragment - blockoff, 273 tmp = ufs_new_fragments (inode, p, fragment - blockoff,
270 goal, required + blockoff, 274 goal, required + blockoff,
271 err, locked_page); 275 err, locked_page);
@@ -277,13 +281,15 @@ repeat:
277 tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff), 281 tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff),
278 fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff), 282 fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff),
279 err, locked_page); 283 err, locked_page);
280 } 284 } else /* (lastblock > block) */ {
281 /* 285 /*
282 * We will allocate new block before last allocated block 286 * We will allocate new block before last allocated block
283 */ 287 */
284 else /* (lastblock > block) */ { 288 if (block) {
285 if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1]))) 289 tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[block-1]);
286 goal = tmp + uspi->s_fpb; 290 if (tmp)
291 goal = tmp + uspi->s_fpb;
292 }
287 tmp = ufs_new_fragments(inode, p, fragment - blockoff, 293 tmp = ufs_new_fragments(inode, p, fragment - blockoff,
288 goal, uspi->s_fpb, err, locked_page); 294 goal, uspi->s_fpb, err, locked_page);
289 } 295 }
@@ -296,7 +302,7 @@ repeat:
296 } 302 }
297 303
298 if (!phys) { 304 if (!phys) {
299 result = ufs_clear_frags(inode, tmp + blockoff, required); 305 result = ufs_clear_frags(inode, tmp, required, tmp + blockoff);
300 } else { 306 } else {
301 *phys = tmp + blockoff; 307 *phys = tmp + blockoff;
302 result = NULL; 308 result = NULL;
@@ -383,7 +389,7 @@ repeat:
383 } 389 }
384 } 390 }
385 391
386 if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1]) + uspi->s_fpb)) 392 if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1])))
387 goal = tmp + uspi->s_fpb; 393 goal = tmp + uspi->s_fpb;
388 else 394 else
389 goal = bh->b_blocknr + uspi->s_fpb; 395 goal = bh->b_blocknr + uspi->s_fpb;
@@ -397,7 +403,8 @@ repeat:
397 403
398 404
399 if (!phys) { 405 if (!phys) {
400 result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb); 406 result = ufs_clear_frags(inode, tmp, uspi->s_fpb,
407 tmp + blockoff);
401 } else { 408 } else {
402 *phys = tmp + blockoff; 409 *phys = tmp + blockoff;
403 *new = 1; 410 *new = 1;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index abd5f23a426d..d344b411e261 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -129,7 +129,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
129 struct inode * inode; 129 struct inode * inode;
130 130
131 if (l > sb->s_blocksize) 131 if (l > sb->s_blocksize)
132 goto out; 132 goto out_notlocked;
133 133
134 lock_kernel(); 134 lock_kernel();
135 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 135 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
@@ -155,6 +155,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
155 err = ufs_add_nondir(dentry, inode); 155 err = ufs_add_nondir(dentry, inode);
156out: 156out:
157 unlock_kernel(); 157 unlock_kernel();
158out_notlocked:
158 return err; 159 return err;
159 160
160out_fail: 161out_fail:
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 19a99726e58d..992ee0b87cc3 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1326,7 +1326,7 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type,
1326 size_t towrite = len; 1326 size_t towrite = len;
1327 struct buffer_head *bh; 1327 struct buffer_head *bh;
1328 1328
1329 mutex_lock(&inode->i_mutex); 1329 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
1330 while (towrite > 0) { 1330 while (towrite > 0) {
1331 tocopy = sb->s_blocksize - offset < towrite ? 1331 tocopy = sb->s_blocksize - offset < towrite ?
1332 sb->s_blocksize - offset : towrite; 1332 sb->s_blocksize - offset : towrite;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index c9b55872079b..ea11d04c41a0 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -375,17 +375,15 @@ static int ufs_alloc_lastblock(struct inode *inode)
375 int err = 0; 375 int err = 0;
376 struct address_space *mapping = inode->i_mapping; 376 struct address_space *mapping = inode->i_mapping;
377 struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; 377 struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
378 struct ufs_inode_info *ufsi = UFS_I(inode);
379 unsigned lastfrag, i, end; 378 unsigned lastfrag, i, end;
380 struct page *lastpage; 379 struct page *lastpage;
381 struct buffer_head *bh; 380 struct buffer_head *bh;
382 381
383 lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; 382 lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift;
384 383
385 if (!lastfrag) { 384 if (!lastfrag)
386 ufsi->i_lastfrag = 0;
387 goto out; 385 goto out;
388 } 386
389 lastfrag--; 387 lastfrag--;
390 388
391 lastpage = ufs_get_locked_page(mapping, lastfrag >> 389 lastpage = ufs_get_locked_page(mapping, lastfrag >>
@@ -400,25 +398,25 @@ static int ufs_alloc_lastblock(struct inode *inode)
400 for (i = 0; i < end; ++i) 398 for (i = 0; i < end; ++i)
401 bh = bh->b_this_page; 399 bh = bh->b_this_page;
402 400
403 if (!buffer_mapped(bh)) { 401
404 err = ufs_getfrag_block(inode, lastfrag, bh, 1); 402 err = ufs_getfrag_block(inode, lastfrag, bh, 1);
405 403
406 if (unlikely(err)) 404 if (unlikely(err))
407 goto out_unlock; 405 goto out_unlock;
408 406
409 if (buffer_new(bh)) { 407 if (buffer_new(bh)) {
410 clear_buffer_new(bh); 408 clear_buffer_new(bh);
411 unmap_underlying_metadata(bh->b_bdev, 409 unmap_underlying_metadata(bh->b_bdev,
412 bh->b_blocknr); 410 bh->b_blocknr);
413 /* 411 /*
414 * we do not zeroize fragment, because of 412 * we do not zeroize fragment, because of
415 * if it maped to hole, it already contains zeroes 413 * if it maped to hole, it already contains zeroes
416 */ 414 */
417 set_buffer_uptodate(bh); 415 set_buffer_uptodate(bh);
418 mark_buffer_dirty(bh); 416 mark_buffer_dirty(bh);
419 set_page_dirty(lastpage); 417 set_page_dirty(lastpage);
420 }
421 } 418 }
419
422out_unlock: 420out_unlock:
423 ufs_put_locked_page(lastpage); 421 ufs_put_locked_page(lastpage);
424out: 422out:
@@ -440,23 +438,11 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
440 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 438 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
441 return -EPERM; 439 return -EPERM;
442 440
443 if (inode->i_size > old_i_size) { 441 err = ufs_alloc_lastblock(inode);
444 /*
445 * if we expand file we should care about
446 * allocation of block for last byte first of all
447 */
448 err = ufs_alloc_lastblock(inode);
449 442
450 if (err) { 443 if (err) {
451 i_size_write(inode, old_i_size); 444 i_size_write(inode, old_i_size);
452 goto out; 445 goto out;
453 }
454 /*
455 * go away, because of we expand file, and we do not
456 * need free blocks, and zeroizes page
457 */
458 lock_kernel();
459 goto almost_end;
460 } 446 }
461 447
462 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); 448 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
@@ -477,21 +463,8 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
477 yield(); 463 yield();
478 } 464 }
479 465
480 if (inode->i_size < old_i_size) {
481 /*
482 * now we should have enough space
483 * to allocate block for last byte
484 */
485 err = ufs_alloc_lastblock(inode);
486 if (err)
487 /*
488 * looks like all the same - we have no space,
489 * but we truncate file already
490 */
491 inode->i_size = (ufsi->i_lastfrag - 1) * uspi->s_fsize;
492 }
493almost_end:
494 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 466 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
467 ufsi->i_lastfrag = DIRECT_FRAGMENT;
495 unlock_kernel(); 468 unlock_kernel();
496 mark_inode_dirty(inode); 469 mark_inode_dirty(inode);
497out: 470out:
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 337cf2c46d10..22f820a9b15c 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -251,12 +251,12 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
251{ 251{
252 struct page *page; 252 struct page *page;
253 253
254try_again:
255 page = find_lock_page(mapping, index); 254 page = find_lock_page(mapping, index);
256 if (!page) { 255 if (!page) {
257 page = read_cache_page(mapping, index, 256 page = read_cache_page(mapping, index,
258 (filler_t*)mapping->a_ops->readpage, 257 (filler_t*)mapping->a_ops->readpage,
259 NULL); 258 NULL);
259
260 if (IS_ERR(page)) { 260 if (IS_ERR(page)) {
261 printk(KERN_ERR "ufs_change_blocknr: " 261 printk(KERN_ERR "ufs_change_blocknr: "
262 "read_cache_page error: ino %lu, index: %lu\n", 262 "read_cache_page error: ino %lu, index: %lu\n",
@@ -266,6 +266,14 @@ try_again:
266 266
267 lock_page(page); 267 lock_page(page);
268 268
269 if (unlikely(page->mapping == NULL)) {
270 /* Truncate got there first */
271 unlock_page(page);
272 page_cache_release(page);
273 page = NULL;
274 goto out;
275 }
276
269 if (!PageUptodate(page) || PageError(page)) { 277 if (!PageUptodate(page) || PageError(page)) {
270 unlock_page(page); 278 unlock_page(page);
271 page_cache_release(page); 279 page_cache_release(page);
@@ -275,15 +283,8 @@ try_again:
275 mapping->host->i_ino, index); 283 mapping->host->i_ino, index);
276 284
277 page = ERR_PTR(-EIO); 285 page = ERR_PTR(-EIO);
278 goto out;
279 } 286 }
280 } 287 }
281
282 if (unlikely(!page->mapping || !page_has_buffers(page))) {
283 unlock_page(page);
284 page_cache_release(page);
285 goto try_again;/*we really need these buffers*/
286 }
287out: 288out:
288 return page; 289 return page;
289} 290}
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c40f81ba9b13..34dcb43a7837 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1390,11 +1390,19 @@ xfs_vm_direct_IO(
1390 1390
1391 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); 1391 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1392 1392
1393 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1393 if (rw == WRITE) {
1394 iomap.iomap_target->bt_bdev, 1394 ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
1395 iov, offset, nr_segs, 1395 iomap.iomap_target->bt_bdev,
1396 xfs_get_blocks_direct, 1396 iov, offset, nr_segs,
1397 xfs_end_io_direct); 1397 xfs_get_blocks_direct,
1398 xfs_end_io_direct);
1399 } else {
1400 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
1401 iomap.iomap_target->bt_bdev,
1402 iov, offset, nr_segs,
1403 xfs_get_blocks_direct,
1404 xfs_end_io_direct);
1405 }
1398 1406
1399 if (unlikely(ret <= 0 && iocb->private)) 1407 if (unlikely(ret <= 0 && iocb->private))
1400 xfs_destroy_ioend(iocb->private); 1408 xfs_destroy_ioend(iocb->private);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index ceda3a2859d2..7858703ed84c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -246,8 +246,8 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
246#define BUF_BUSY XBF_DONT_BLOCK 246#define BUF_BUSY XBF_DONT_BLOCK
247 247
248#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) 248#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
249#define XFS_BUF_ZEROFLAGS(bp) \ 249#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
250 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI)) 250 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
251 251
252#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) 252#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE)
253#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) 253#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE)
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 8c021dc57d1f..a13f75c1a936 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -215,7 +215,6 @@ BUFFER_FNS(PrivateStart, unwritten);
215#define MIN(a,b) (min(a,b)) 215#define MIN(a,b) (min(a,b))
216#define MAX(a,b) (max(a,b)) 216#define MAX(a,b) (max(a,b))
217#define howmany(x, y) (((x)+((y)-1))/(y)) 217#define howmany(x, y) (((x)+((y)-1))/(y))
218#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
219 218
220/* 219/*
221 * Various platform dependent calls that don't fit anywhere else 220 * Various platform dependent calls that don't fit anywhere else
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 5d9cfd91ad08..ee788b1cb364 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -264,7 +264,9 @@ xfs_read(
264 dmflags, &locktype); 264 dmflags, &locktype);
265 if (ret) { 265 if (ret) {
266 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 266 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
267 goto unlock_mutex; 267 if (unlikely(ioflags & IO_ISDIRECT))
268 mutex_unlock(&inode->i_mutex);
269 return ret;
268 } 270 }
269 } 271 }
270 272
@@ -272,6 +274,9 @@ xfs_read(
272 bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)), 274 bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
273 -1, FI_REMAPF_LOCKED); 275 -1, FI_REMAPF_LOCKED);
274 276
277 if (unlikely(ioflags & IO_ISDIRECT))
278 mutex_unlock(&inode->i_mutex);
279
275 xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore, 280 xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
276 (void *)iovp, segs, *offset, ioflags); 281 (void *)iovp, segs, *offset, ioflags);
277 ret = __generic_file_aio_read(iocb, iovp, segs, offset); 282 ret = __generic_file_aio_read(iocb, iovp, segs, offset);
@@ -281,10 +286,6 @@ xfs_read(
281 XFS_STATS_ADD(xs_read_bytes, ret); 286 XFS_STATS_ADD(xs_read_bytes, ret);
282 287
283 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 288 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
284
285unlock_mutex:
286 if (unlikely(ioflags & IO_ISDIRECT))
287 mutex_unlock(&inode->i_mutex);
288 return ret; 289 return ret;
289} 290}
290 291
@@ -390,6 +391,8 @@ xfs_splice_write(
390 xfs_inode_t *ip = XFS_BHVTOI(bdp); 391 xfs_inode_t *ip = XFS_BHVTOI(bdp);
391 xfs_mount_t *mp = ip->i_mount; 392 xfs_mount_t *mp = ip->i_mount;
392 ssize_t ret; 393 ssize_t ret;
394 struct inode *inode = outfilp->f_mapping->host;
395 xfs_fsize_t isize;
393 396
394 XFS_STATS_INC(xs_write_calls); 397 XFS_STATS_INC(xs_write_calls);
395 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 398 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -416,6 +419,20 @@ xfs_splice_write(
416 if (ret > 0) 419 if (ret > 0)
417 XFS_STATS_ADD(xs_write_bytes, ret); 420 XFS_STATS_ADD(xs_write_bytes, ret);
418 421
422 isize = i_size_read(inode);
423 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
424 *ppos = isize;
425
426 if (*ppos > ip->i_d.di_size) {
427 xfs_ilock(ip, XFS_ILOCK_EXCL);
428 if (*ppos > ip->i_d.di_size) {
429 ip->i_d.di_size = *ppos;
430 i_size_write(inode, *ppos);
431 ip->i_update_core = 1;
432 ip->i_update_size = 1;
433 }
434 xfs_iunlock(ip, XFS_ILOCK_EXCL);
435 }
419 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 436 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
420 return ret; 437 return ret;
421} 438}
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9bdef9d51900..4754f342a5d3 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -314,6 +314,13 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
314 return; 314 return;
315 } 315 }
316 316
317 if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
318 xfs_fs_cmn_err(CE_NOTE, mp,
319 "Disabling barriers, underlying device is readonly");
320 mp->m_flags &= ~XFS_MOUNT_BARRIER;
321 return;
322 }
323
317 error = xfs_barrier_test(mp); 324 error = xfs_barrier_test(mp);
318 if (error) { 325 if (error) {
319 xfs_fs_cmn_err(CE_NOTE, mp, 326 xfs_fs_cmn_err(CE_NOTE, mp,
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index e95e99f7168f..db8872be8c87 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -203,7 +203,7 @@ xfs_qm_statvfs(
203 if (error || !vnode) 203 if (error || !vnode)
204 return error; 204 return error;
205 205
206 mp = XFS_BHVTOM(bhv); 206 mp = xfs_vfstom(bhvtovfs(bhv));
207 ip = xfs_vtoi(vnode); 207 ip = xfs_vtoi(vnode);
208 208
209 if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) 209 if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
@@ -217,17 +217,24 @@ xfs_qm_statvfs(
217 return 0; 217 return 0;
218 dp = &dqp->q_core; 218 dp = &dqp->q_core;
219 219
220 limit = dp->d_blk_softlimit ? dp->d_blk_softlimit : dp->d_blk_hardlimit; 220 limit = dp->d_blk_softlimit ?
221 be64_to_cpu(dp->d_blk_softlimit) :
222 be64_to_cpu(dp->d_blk_hardlimit);
221 if (limit && statp->f_blocks > limit) { 223 if (limit && statp->f_blocks > limit) {
222 statp->f_blocks = limit; 224 statp->f_blocks = limit;
223 statp->f_bfree = (statp->f_blocks > dp->d_bcount) ? 225 statp->f_bfree =
224 (statp->f_blocks - dp->d_bcount) : 0; 226 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
227 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
225 } 228 }
226 limit = dp->d_ino_softlimit ? dp->d_ino_softlimit : dp->d_ino_hardlimit; 229
230 limit = dp->d_ino_softlimit ?
231 be64_to_cpu(dp->d_ino_softlimit) :
232 be64_to_cpu(dp->d_ino_hardlimit);
227 if (limit && statp->f_files > limit) { 233 if (limit && statp->f_files > limit) {
228 statp->f_files = limit; 234 statp->f_files = limit;
229 statp->f_ffree = (statp->f_files > dp->d_icount) ? 235 statp->f_ffree =
230 (statp->f_ffree - dp->d_icount) : 0; 236 (statp->f_files > be64_to_cpu(dp->d_icount)) ?
237 (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0;
231 } 238 }
232 239
233 xfs_qm_dqput(dqp); 240 xfs_qm_dqput(dqp);
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index eef6763f3a67..d2bbcd882a69 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1835,40 +1835,47 @@ xfs_alloc_fix_freelist(
1835 &agbp))) 1835 &agbp)))
1836 return error; 1836 return error;
1837 if (!pag->pagf_init) { 1837 if (!pag->pagf_init) {
1838 ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
1839 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
1838 args->agbp = NULL; 1840 args->agbp = NULL;
1839 return 0; 1841 return 0;
1840 } 1842 }
1841 } else 1843 } else
1842 agbp = NULL; 1844 agbp = NULL;
1843 1845
1844 /* If this is a metadata preferred pag and we are user data 1846 /*
1847 * If this is a metadata preferred pag and we are user data
1845 * then try somewhere else if we are not being asked to 1848 * then try somewhere else if we are not being asked to
1846 * try harder at this point 1849 * try harder at this point
1847 */ 1850 */
1848 if (pag->pagf_metadata && args->userdata && flags) { 1851 if (pag->pagf_metadata && args->userdata &&
1852 (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
1853 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
1849 args->agbp = NULL; 1854 args->agbp = NULL;
1850 return 0; 1855 return 0;
1851 } 1856 }
1852 1857
1853 need = XFS_MIN_FREELIST_PAG(pag, mp); 1858 if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
1854 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; 1859 need = XFS_MIN_FREELIST_PAG(pag, mp);
1855 /* 1860 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
1856 * If it looks like there isn't a long enough extent, or enough 1861 /*
1857 * total blocks, reject it. 1862 * If it looks like there isn't a long enough extent, or enough
1858 */ 1863 * total blocks, reject it.
1859 longest = (pag->pagf_longest > delta) ? 1864 */
1860 (pag->pagf_longest - delta) : 1865 longest = (pag->pagf_longest > delta) ?
1861 (pag->pagf_flcount > 0 || pag->pagf_longest > 0); 1866 (pag->pagf_longest - delta) :
1862 if (args->minlen + args->alignment + args->minalignslop - 1 > longest || 1867 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
1863 (!(flags & XFS_ALLOC_FLAG_FREEING) && 1868 if ((args->minlen + args->alignment + args->minalignslop - 1) >
1864 (int)(pag->pagf_freeblks + pag->pagf_flcount - 1869 longest ||
1865 need - args->total) < 1870 ((int)(pag->pagf_freeblks + pag->pagf_flcount -
1866 (int)args->minleft)) { 1871 need - args->total) < (int)args->minleft)) {
1867 if (agbp) 1872 if (agbp)
1868 xfs_trans_brelse(tp, agbp); 1873 xfs_trans_brelse(tp, agbp);
1869 args->agbp = NULL; 1874 args->agbp = NULL;
1870 return 0; 1875 return 0;
1876 }
1871 } 1877 }
1878
1872 /* 1879 /*
1873 * Get the a.g. freespace buffer. 1880 * Get the a.g. freespace buffer.
1874 * Can fail if we're not blocking on locks, and it's held. 1881 * Can fail if we're not blocking on locks, and it's held.
@@ -1878,6 +1885,8 @@ xfs_alloc_fix_freelist(
1878 &agbp))) 1885 &agbp)))
1879 return error; 1886 return error;
1880 if (agbp == NULL) { 1887 if (agbp == NULL) {
1888 ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
1889 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
1881 args->agbp = NULL; 1890 args->agbp = NULL;
1882 return 0; 1891 return 0;
1883 } 1892 }
@@ -1887,22 +1896,24 @@ xfs_alloc_fix_freelist(
1887 */ 1896 */
1888 agf = XFS_BUF_TO_AGF(agbp); 1897 agf = XFS_BUF_TO_AGF(agbp);
1889 need = XFS_MIN_FREELIST(agf, mp); 1898 need = XFS_MIN_FREELIST(agf, mp);
1890 delta = need > be32_to_cpu(agf->agf_flcount) ?
1891 (need - be32_to_cpu(agf->agf_flcount)) : 0;
1892 /* 1899 /*
1893 * If there isn't enough total or single-extent, reject it. 1900 * If there isn't enough total or single-extent, reject it.
1894 */ 1901 */
1895 longest = be32_to_cpu(agf->agf_longest); 1902 if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
1896 longest = (longest > delta) ? (longest - delta) : 1903 delta = need > be32_to_cpu(agf->agf_flcount) ?
1897 (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); 1904 (need - be32_to_cpu(agf->agf_flcount)) : 0;
1898 if (args->minlen + args->alignment + args->minalignslop - 1 > longest || 1905 longest = be32_to_cpu(agf->agf_longest);
1899 (!(flags & XFS_ALLOC_FLAG_FREEING) && 1906 longest = (longest > delta) ? (longest - delta) :
1900 (int)(be32_to_cpu(agf->agf_freeblks) + 1907 (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
1901 be32_to_cpu(agf->agf_flcount) - need - args->total) < 1908 if ((args->minlen + args->alignment + args->minalignslop - 1) >
1902 (int)args->minleft)) { 1909 longest ||
1903 xfs_trans_brelse(tp, agbp); 1910 ((int)(be32_to_cpu(agf->agf_freeblks) +
1904 args->agbp = NULL; 1911 be32_to_cpu(agf->agf_flcount) - need - args->total) <
1905 return 0; 1912 (int)args->minleft)) {
1913 xfs_trans_brelse(tp, agbp);
1914 args->agbp = NULL;
1915 return 0;
1916 }
1906 } 1917 }
1907 /* 1918 /*
1908 * Make the freelist shorter if it's too long. 1919 * Make the freelist shorter if it's too long.
@@ -1950,12 +1961,11 @@ xfs_alloc_fix_freelist(
1950 * on a completely full ag. 1961 * on a completely full ag.
1951 */ 1962 */
1952 if (targs.agbno == NULLAGBLOCK) { 1963 if (targs.agbno == NULLAGBLOCK) {
1953 if (!(flags & XFS_ALLOC_FLAG_FREEING)) { 1964 if (flags & XFS_ALLOC_FLAG_FREEING)
1954 xfs_trans_brelse(tp, agflbp); 1965 break;
1955 args->agbp = NULL; 1966 xfs_trans_brelse(tp, agflbp);
1956 return 0; 1967 args->agbp = NULL;
1957 } 1968 return 0;
1958 break;
1959 } 1969 }
1960 /* 1970 /*
1961 * Put each allocated block on the list. 1971 * Put each allocated block on the list.
@@ -2442,31 +2452,26 @@ xfs_free_extent(
2442 xfs_fsblock_t bno, /* starting block number of extent */ 2452 xfs_fsblock_t bno, /* starting block number of extent */
2443 xfs_extlen_t len) /* length of extent */ 2453 xfs_extlen_t len) /* length of extent */
2444{ 2454{
2445#ifdef DEBUG 2455 xfs_alloc_arg_t args;
2446 xfs_agf_t *agf; /* a.g. freespace header */
2447#endif
2448 xfs_alloc_arg_t args; /* allocation argument structure */
2449 int error; 2456 int error;
2450 2457
2451 ASSERT(len != 0); 2458 ASSERT(len != 0);
2459 memset(&args, 0, sizeof(xfs_alloc_arg_t));
2452 args.tp = tp; 2460 args.tp = tp;
2453 args.mp = tp->t_mountp; 2461 args.mp = tp->t_mountp;
2454 args.agno = XFS_FSB_TO_AGNO(args.mp, bno); 2462 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2455 ASSERT(args.agno < args.mp->m_sb.sb_agcount); 2463 ASSERT(args.agno < args.mp->m_sb.sb_agcount);
2456 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); 2464 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2457 args.alignment = 1;
2458 args.minlen = args.minleft = args.minalignslop = 0;
2459 down_read(&args.mp->m_peraglock); 2465 down_read(&args.mp->m_peraglock);
2460 args.pag = &args.mp->m_perag[args.agno]; 2466 args.pag = &args.mp->m_perag[args.agno];
2461 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) 2467 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
2462 goto error0; 2468 goto error0;
2463#ifdef DEBUG 2469#ifdef DEBUG
2464 ASSERT(args.agbp != NULL); 2470 ASSERT(args.agbp != NULL);
2465 agf = XFS_BUF_TO_AGF(args.agbp); 2471 ASSERT((args.agbno + len) <=
2466 ASSERT(args.agbno + len <= be32_to_cpu(agf->agf_length)); 2472 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length));
2467#endif 2473#endif
2468 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, 2474 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2469 len, 0);
2470error0: 2475error0:
2471 up_read(&args.mp->m_peraglock); 2476 up_read(&args.mp->m_peraglock);
2472 return error; 2477 return error;
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 650591f999ae..5a4256120ccc 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -44,6 +44,26 @@ typedef enum xfs_alloctype
44#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ 44#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
45 45
46/* 46/*
47 * In order to avoid ENOSPC-related deadlock caused by
48 * out-of-order locking of AGF buffer (PV 947395), we place
49 * constraints on the relationship among actual allocations for
50 * data blocks, freelist blocks, and potential file data bmap
51 * btree blocks. However, these restrictions may result in no
52 * actual space allocated for a delayed extent, for example, a data
53 * block in a certain AG is allocated but there is no additional
54 * block for the additional bmap btree block due to a split of the
55 * bmap btree of the file. The result of this may lead to an
56 * infinite loop in xfssyncd when the file gets flushed to disk and
57 * all delayed extents need to be actually allocated. To get around
58 * this, we explicitly set aside a few blocks which will not be
59 * reserved in delayed allocation. Considering the minimum number of
60 * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
61 * btree requires 1 fsb, so we set the number of set-aside blocks
62 * to 4 + 4*agcount.
63 */
64#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
65
66/*
47 * Argument structure for xfs_alloc routines. 67 * Argument structure for xfs_alloc routines.
48 * This is turned into a structure to avoid having 20 arguments passed 68 * This is turned into a structure to avoid having 20 arguments passed
49 * down several levels of the stack. 69 * down several levels of the stack.
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3a6137539064..bf46fae303af 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4993,7 +4993,7 @@ xfs_bmapi(
4993 bma.firstblock = *firstblock; 4993 bma.firstblock = *firstblock;
4994 bma.alen = alen; 4994 bma.alen = alen;
4995 bma.off = aoff; 4995 bma.off = aoff;
4996 bma.conv = (flags & XFS_BMAPI_CONVERT); 4996 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4997 bma.wasdel = wasdelay; 4997 bma.wasdel = wasdelay;
4998 bma.minlen = minlen; 4998 bma.minlen = minlen;
4999 bma.low = flist->xbf_low; 4999 bma.low = flist->xbf_low;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 077629bab532..c064e72ada9e 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -462,7 +462,7 @@ xfs_fs_counts(
462 462
463 xfs_icsb_sync_counters_lazy(mp); 463 xfs_icsb_sync_counters_lazy(mp);
464 s = XFS_SB_LOCK(mp); 464 s = XFS_SB_LOCK(mp);
465 cnt->freedata = mp->m_sb.sb_fdblocks; 465 cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
466 cnt->freertx = mp->m_sb.sb_frextents; 466 cnt->freertx = mp->m_sb.sb_frextents;
467 cnt->freeino = mp->m_sb.sb_ifree; 467 cnt->freeino = mp->m_sb.sb_ifree;
468 cnt->allocino = mp->m_sb.sb_icount; 468 cnt->allocino = mp->m_sb.sb_icount;
@@ -519,15 +519,19 @@ xfs_reserve_blocks(
519 } 519 }
520 mp->m_resblks = request; 520 mp->m_resblks = request;
521 } else { 521 } else {
522 __int64_t free;
523
524 free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
522 delta = request - mp->m_resblks; 525 delta = request - mp->m_resblks;
523 lcounter = mp->m_sb.sb_fdblocks - delta; 526 lcounter = free - delta;
524 if (lcounter < 0) { 527 if (lcounter < 0) {
525 /* We can't satisfy the request, just get what we can */ 528 /* We can't satisfy the request, just get what we can */
526 mp->m_resblks += mp->m_sb.sb_fdblocks; 529 mp->m_resblks += free;
527 mp->m_resblks_avail += mp->m_sb.sb_fdblocks; 530 mp->m_resblks_avail += free;
528 mp->m_sb.sb_fdblocks = 0; 531 mp->m_sb.sb_fdblocks = XFS_ALLOC_SET_ASIDE(mp);
529 } else { 532 } else {
530 mp->m_sb.sb_fdblocks = lcounter; 533 mp->m_sb.sb_fdblocks =
534 lcounter + XFS_ALLOC_SET_ASIDE(mp);
531 mp->m_resblks = request; 535 mp->m_resblks = request;
532 mp->m_resblks_avail += delta; 536 mp->m_resblks_avail += delta;
533 } 537 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 86c1bf0bba9e..1f8ecff8553a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -334,10 +334,9 @@ xfs_itobp(
334#if !defined(__KERNEL__) 334#if !defined(__KERNEL__)
335 ni = 0; 335 ni = 0;
336#elif defined(DEBUG) 336#elif defined(DEBUG)
337 ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 337 ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
338 (BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog);
339#else /* usual case */ 338#else /* usual case */
340 ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 1; 339 ni = 1;
341#endif 340#endif
342 341
343 for (i = 0; i < ni; i++) { 342 for (i = 0; i < ni; i++) {
@@ -348,11 +347,15 @@ xfs_itobp(
348 (i << mp->m_sb.sb_inodelog)); 347 (i << mp->m_sb.sb_inodelog));
349 di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && 348 di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
350 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); 349 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
351 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, 350 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
352 XFS_RANDOM_ITOBP_INOTOBP))) { 351 XFS_ERRTAG_ITOBP_INOTOBP,
352 XFS_RANDOM_ITOBP_INOTOBP))) {
353 if (imap_flags & XFS_IMAP_BULKSTAT) {
354 xfs_trans_brelse(tp, bp);
355 return XFS_ERROR(EINVAL);
356 }
353#ifdef DEBUG 357#ifdef DEBUG
354 if (!(imap_flags & XFS_IMAP_BULKSTAT)) 358 cmn_err(CE_ALERT,
355 cmn_err(CE_ALERT,
356 "Device %s - bad inode magic/vsn " 359 "Device %s - bad inode magic/vsn "
357 "daddr %lld #%d (magic=%x)", 360 "daddr %lld #%d (magic=%x)",
358 XFS_BUFTARG_NAME(mp->m_ddev_targp), 361 XFS_BUFTARG_NAME(mp->m_ddev_targp),
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e730328636c3..21ac1a67e3e0 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1413,7 +1413,7 @@ xlog_sync(xlog_t *log,
1413 ops = iclog->ic_header.h_num_logops; 1413 ops = iclog->ic_header.h_num_logops;
1414 INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops); 1414 INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops);
1415 1415
1416 bp = iclog->ic_bp; 1416 bp = iclog->ic_bp;
1417 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); 1417 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1);
1418 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); 1418 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
1419 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT))); 1419 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)));
@@ -1430,15 +1430,14 @@ xlog_sync(xlog_t *log,
1430 } 1430 }
1431 XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count); 1431 XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count);
1432 XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ 1432 XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */
1433 XFS_BUF_ZEROFLAGS(bp);
1433 XFS_BUF_BUSY(bp); 1434 XFS_BUF_BUSY(bp);
1434 XFS_BUF_ASYNC(bp); 1435 XFS_BUF_ASYNC(bp);
1435 /* 1436 /*
1436 * Do an ordered write for the log block. 1437 * Do an ordered write for the log block.
1437 * 1438 * It's unnecessary to flush the first split block in the log wrap case.
1438 * It may not be needed to flush the first split block in the log wrap
1439 * case, but do it anyways to be safe -AK
1440 */ 1439 */
1441 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1440 if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER))
1442 XFS_BUF_ORDERED(bp); 1441 XFS_BUF_ORDERED(bp);
1443 1442
1444 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1443 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
@@ -1460,7 +1459,7 @@ xlog_sync(xlog_t *log,
1460 return error; 1459 return error;
1461 } 1460 }
1462 if (split) { 1461 if (split) {
1463 bp = iclog->ic_log->l_xbuf; 1462 bp = iclog->ic_log->l_xbuf;
1464 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == 1463 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) ==
1465 (unsigned long)1); 1464 (unsigned long)1);
1466 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); 1465 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
@@ -1468,6 +1467,7 @@ xlog_sync(xlog_t *log,
1468 XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ 1467 XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
1469 (__psint_t)count), split); 1468 (__psint_t)count), split);
1470 XFS_BUF_SET_FSPRIVATE(bp, iclog); 1469 XFS_BUF_SET_FSPRIVATE(bp, iclog);
1470 XFS_BUF_ZEROFLAGS(bp);
1471 XFS_BUF_BUSY(bp); 1471 XFS_BUF_BUSY(bp);
1472 XFS_BUF_ASYNC(bp); 1472 XFS_BUF_ASYNC(bp);
1473 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1473 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 4be5c0b2d296..9dfae18d995f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1243,24 +1243,6 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1243 xfs_trans_log_buf(tp, bp, first, last); 1243 xfs_trans_log_buf(tp, bp, first, last);
1244} 1244}
1245 1245
1246/*
1247 * In order to avoid ENOSPC-related deadlock caused by
1248 * out-of-order locking of AGF buffer (PV 947395), we place
1249 * constraints on the relationship among actual allocations for
1250 * data blocks, freelist blocks, and potential file data bmap
1251 * btree blocks. However, these restrictions may result in no
1252 * actual space allocated for a delayed extent, for example, a data
1253 * block in a certain AG is allocated but there is no additional
1254 * block for the additional bmap btree block due to a split of the
1255 * bmap btree of the file. The result of this may lead to an
1256 * infinite loop in xfssyncd when the file gets flushed to disk and
1257 * all delayed extents need to be actually allocated. To get around
1258 * this, we explicitly set aside a few blocks which will not be
1259 * reserved in delayed allocation. Considering the minimum number of
1260 * needed freelist blocks is 4 fsbs, a potential split of file's bmap
1261 * btree requires 1 fsb, so we set the number of set-aside blocks to 8.
1262*/
1263#define SET_ASIDE_BLOCKS 8
1264 1246
1265/* 1247/*
1266 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply 1248 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
@@ -1306,7 +1288,8 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
1306 return 0; 1288 return 0;
1307 case XFS_SBS_FDBLOCKS: 1289 case XFS_SBS_FDBLOCKS:
1308 1290
1309 lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS; 1291 lcounter = (long long)
1292 mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1310 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); 1293 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1311 1294
1312 if (delta > 0) { /* Putting blocks back */ 1295 if (delta > 0) { /* Putting blocks back */
@@ -1340,7 +1323,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
1340 } 1323 }
1341 } 1324 }
1342 1325
1343 mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS; 1326 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
1344 return 0; 1327 return 0;
1345 case XFS_SBS_FREXTENTS: 1328 case XFS_SBS_FREXTENTS:
1346 lcounter = (long long)mp->m_sb.sb_frextents; 1329 lcounter = (long long)mp->m_sb.sb_frextents;
@@ -2021,7 +2004,8 @@ xfs_icsb_sync_counters_lazy(
2021 * when we get near ENOSPC. 2004 * when we get near ENOSPC.
2022 */ 2005 */
2023#define XFS_ICSB_INO_CNTR_REENABLE 64 2006#define XFS_ICSB_INO_CNTR_REENABLE 64
2024#define XFS_ICSB_FDBLK_CNTR_REENABLE 512 2007#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
2008 (512 + XFS_ALLOC_SET_ASIDE(mp))
2025STATIC void 2009STATIC void
2026xfs_icsb_balance_counter( 2010xfs_icsb_balance_counter(
2027 xfs_mount_t *mp, 2011 xfs_mount_t *mp,
@@ -2055,7 +2039,7 @@ xfs_icsb_balance_counter(
2055 case XFS_SBS_FDBLOCKS: 2039 case XFS_SBS_FDBLOCKS:
2056 count = mp->m_sb.sb_fdblocks; 2040 count = mp->m_sb.sb_fdblocks;
2057 resid = do_div(count, weight); 2041 resid = do_div(count, weight);
2058 if (count < XFS_ICSB_FDBLK_CNTR_REENABLE) 2042 if (count < XFS_ICSB_FDBLK_CNTR_REENABLE(mp))
2059 goto out; 2043 goto out;
2060 break; 2044 break;
2061 default: 2045 default:
@@ -2110,11 +2094,11 @@ again:
2110 case XFS_SBS_FDBLOCKS: 2094 case XFS_SBS_FDBLOCKS:
2111 BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0); 2095 BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
2112 2096
2113 lcounter = icsbp->icsb_fdblocks; 2097 lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
2114 lcounter += delta; 2098 lcounter += delta;
2115 if (unlikely(lcounter < 0)) 2099 if (unlikely(lcounter < 0))
2116 goto slow_path; 2100 goto slow_path;
2117 icsbp->icsb_fdblocks = lcounter; 2101 icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
2118 break; 2102 break;
2119 default: 2103 default:
2120 BUG(); 2104 BUG();
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 6c96391f3f1a..a34796e57afb 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -515,7 +515,7 @@ xfs_mount(
515 if (error) 515 if (error)
516 goto error2; 516 goto error2;
517 517
518 if ((mp->m_flags & XFS_MOUNT_BARRIER) && !(vfsp->vfs_flag & VFS_RDONLY)) 518 if (mp->m_flags & XFS_MOUNT_BARRIER)
519 xfs_mountfs_check_barriers(mp); 519 xfs_mountfs_check_barriers(mp);
520 520
521 error = XFS_IOINIT(vfsp, args, flags); 521 error = XFS_IOINIT(vfsp, args, flags);
@@ -811,7 +811,8 @@ xfs_statvfs(
811 statp->f_bsize = sbp->sb_blocksize; 811 statp->f_bsize = sbp->sb_blocksize;
812 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; 812 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
813 statp->f_blocks = sbp->sb_dblocks - lsize; 813 statp->f_blocks = sbp->sb_dblocks - lsize;
814 statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks; 814 statp->f_bfree = statp->f_bavail =
815 sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
815 fakeinos = statp->f_bfree << sbp->sb_inopblog; 816 fakeinos = statp->f_bfree << sbp->sb_inopblog;
816#if XFS_BIG_INUMS 817#if XFS_BIG_INUMS
817 fakeinos += mp->m_inoadd; 818 fakeinos += mp->m_inoadd;