author    Jiri Kosina <jkosina@suse.cz>  2011-02-15 04:24:31 -0500
committer Jiri Kosina <jkosina@suse.cz>  2011-02-15 04:24:31 -0500
commit    0a9d59a2461477bd9ed143c01af9df3f8f00fa81 (patch)
tree      df997d1cfb0786427a0df1fbd6f0640fa4248cf4 /fs
parent    a23ce6da9677d245aa0aadc99f4197030350ab54 (diff)
parent    795abaf1e4e188c4171e3cd3dbb11a9fcacaf505 (diff)
Merge branch 'master' into for-next
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig  19
-rw-r--r--  fs/afs/cmservice.c  12
-rw-r--r--  fs/afs/dir.c  1
-rw-r--r--  fs/afs/inode.c  3
-rw-r--r--  fs/afs/internal.h  2
-rw-r--r--  fs/afs/main.c  13
-rw-r--r--  fs/afs/mntpt.c  63
-rw-r--r--  fs/afs/rxrpc.c  2
-rw-r--r--  fs/afs/server.c  13
-rw-r--r--  fs/afs/vlocation.c  14
-rw-r--r--  fs/aio.c  2
-rw-r--r--  fs/anon_inodes.c  2
-rw-r--r--  fs/autofs4/autofs_i.h  113
-rw-r--r--  fs/autofs4/dev-ioctl.c  2
-rw-r--r--  fs/autofs4/expire.c  55
-rw-r--r--  fs/autofs4/inode.c  114
-rw-r--r--  fs/autofs4/root.c  743
-rw-r--r--  fs/autofs4/symlink.c  3
-rw-r--r--  fs/autofs4/waitq.c  17
-rw-r--r--  fs/block_dev.c  93
-rw-r--r--  fs/btrfs/Kconfig  2
-rw-r--r--  fs/btrfs/Makefile  2
-rw-r--r--  fs/btrfs/acl.c  10
-rw-r--r--  fs/btrfs/btrfs_inode.h  2
-rw-r--r--  fs/btrfs/compression.c  354
-rw-r--r--  fs/btrfs/compression.h  72
-rw-r--r--  fs/btrfs/ctree.c  8
-rw-r--r--  fs/btrfs/ctree.h  49
-rw-r--r--  fs/btrfs/disk-io.c  419
-rw-r--r--  fs/btrfs/disk-io.h  1
-rw-r--r--  fs/btrfs/export.c  2
-rw-r--r--  fs/btrfs/extent-tree.c  188
-rw-r--r--  fs/btrfs/extent_io.c  13
-rw-r--r--  fs/btrfs/extent_io.h  17
-rw-r--r--  fs/btrfs/extent_map.c  2
-rw-r--r--  fs/btrfs/extent_map.h  3
-rw-r--r--  fs/btrfs/file-item.c  5
-rw-r--r--  fs/btrfs/file.c  146
-rw-r--r--  fs/btrfs/free-space-cache.c  162
-rw-r--r--  fs/btrfs/inode.c  236
-rw-r--r--  fs/btrfs/ioctl.c  239
-rw-r--r--  fs/btrfs/ioctl.h  12
-rw-r--r--  fs/btrfs/lzo.c  420
-rw-r--r--  fs/btrfs/ordered-data.c  20
-rw-r--r--  fs/btrfs/ordered-data.h  8
-rw-r--r--  fs/btrfs/print-tree.c  1
-rw-r--r--  fs/btrfs/relocation.c  29
-rw-r--r--  fs/btrfs/super.c  290
-rw-r--r--  fs/btrfs/transaction.c  16
-rw-r--r--  fs/btrfs/transaction.h  1
-rw-r--r--  fs/btrfs/tree-log.c  35
-rw-r--r--  fs/btrfs/volumes.c  645
-rw-r--r--  fs/btrfs/volumes.h  27
-rw-r--r--  fs/btrfs/xattr.c  18
-rw-r--r--  fs/btrfs/zlib.c  369
-rw-r--r--  fs/ceph/caps.c  43
-rw-r--r--  fs/ceph/inode.c  10
-rw-r--r--  fs/ceph/mds_client.c  10
-rw-r--r--  fs/ceph/super.c  2
-rw-r--r--  fs/ceph/xattr.c  3
-rw-r--r--  fs/cifs/Kconfig  1
-rw-r--r--  fs/cifs/Makefile  2
-rw-r--r--  fs/cifs/README  5
-rw-r--r--  fs/cifs/cifs_debug.c  10
-rw-r--r--  fs/cifs/cifs_dfs_ref.c  122
-rw-r--r--  fs/cifs/cifs_fs_sb.h  1
-rw-r--r--  fs/cifs/cifs_unicode.c  127
-rw-r--r--  fs/cifs/cifsacl.c  17
-rw-r--r--  fs/cifs/cifsencrypt.c  38
-rw-r--r--  fs/cifs/cifsencrypt.h  33
-rw-r--r--  fs/cifs/cifsfs.c  55
-rw-r--r--  fs/cifs/cifsfs.h  25
-rw-r--r--  fs/cifs/cifsglob.h  101
-rw-r--r--  fs/cifs/cifspdu.h  62
-rw-r--r--  fs/cifs/cifsproto.h  20
-rw-r--r--  fs/cifs/cifssmb.c  121
-rw-r--r--  fs/cifs/connect.c  252
-rw-r--r--  fs/cifs/dir.c  2
-rw-r--r--  fs/cifs/file.c  496
-rw-r--r--  fs/cifs/inode.c  16
-rw-r--r--  fs/cifs/link.c  59
-rw-r--r--  fs/cifs/md4.c  205
-rw-r--r--  fs/cifs/md5.c  366
-rw-r--r--  fs/cifs/md5.h  38
-rw-r--r--  fs/cifs/misc.c  189
-rw-r--r--  fs/cifs/netmisc.c  8
-rw-r--r--  fs/cifs/readdir.c  3
-rw-r--r--  fs/cifs/sess.c  15
-rw-r--r--  fs/cifs/smbdes.c  1
-rw-r--r--  fs/cifs/smbencrypt.c  92
-rw-r--r--  fs/cifs/transport.c  481
-rw-r--r--  fs/compat.c  10
-rw-r--r--  fs/configfs/Kconfig  4
-rw-r--r--  fs/dcache.c  13
-rw-r--r--  fs/direct-io.c  10
-rw-r--r--  fs/dlm/Kconfig  3
-rw-r--r--  fs/dlm/lowcomms.c  6
-rw-r--r--  fs/ecryptfs/crypto.c  30
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h  2
-rw-r--r--  fs/ecryptfs/file.c  28
-rw-r--r--  fs/ecryptfs/inode.c  32
-rw-r--r--  fs/ecryptfs/keystore.c  26
-rw-r--r--  fs/ecryptfs/main.c  14
-rw-r--r--  fs/ecryptfs/mmap.c  35
-rw-r--r--  fs/eventpoll.c  16
-rw-r--r--  fs/exec.c  4
-rw-r--r--  fs/exofs/inode.c  2
-rw-r--r--  fs/ext3/super.c  25
-rw-r--r--  fs/ext4/ext4.h  12
-rw-r--r--  fs/ext4/extents.c  21
-rw-r--r--  fs/ext4/file.c  62
-rw-r--r--  fs/ext4/mballoc.c  100
-rw-r--r--  fs/ext4/page-io.c  36
-rw-r--r--  fs/ext4/super.c  91
-rw-r--r--  fs/fcntl.c  2
-rw-r--r--  fs/file_table.c  4
-rw-r--r--  fs/fs-writeback.c  105
-rw-r--r--  fs/fs_struct.c  35
-rw-r--r--  fs/fscache/operation.c  2
-rw-r--r--  fs/gfs2/file.c  258
-rw-r--r--  fs/gfs2/inode.c  72
-rw-r--r--  fs/gfs2/inode.h  1
-rw-r--r--  fs/gfs2/ops_inode.c  258
-rw-r--r--  fs/gfs2/super.c  1
-rw-r--r--  fs/hfsplus/extents.c  4
-rw-r--r--  fs/hfsplus/part_tbl.c  4
-rw-r--r--  fs/hfsplus/super.c  106
-rw-r--r--  fs/hfsplus/wrapper.c  4
-rw-r--r--  fs/hpfs/inode.c  2
-rw-r--r--  fs/internal.h  4
-rw-r--r--  fs/ioctl.c  17
-rw-r--r--  fs/jbd2/journal.c  9
-rw-r--r--  fs/jbd2/transaction.c  21
-rw-r--r--  fs/jffs2/build.c  5
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h  2
-rw-r--r--  fs/jffs2/xattr.c  12
-rw-r--r--  fs/lockd/host.c  9
-rw-r--r--  fs/locks.c  8
-rw-r--r--  fs/mpage.c  49
-rw-r--r--  fs/namei.c  437
-rw-r--r--  fs/namespace.c  223
-rw-r--r--  fs/nfs/callback.c  109
-rw-r--r--  fs/nfs/callback.h  4
-rw-r--r--  fs/nfs/callback_proc.c  12
-rw-r--r--  fs/nfs/callback_xdr.c  5
-rw-r--r--  fs/nfs/client.c  15
-rw-r--r--  fs/nfs/delegation.c  6
-rw-r--r--  fs/nfs/dir.c  16
-rw-r--r--  fs/nfs/direct.c  34
-rw-r--r--  fs/nfs/inode.c  30
-rw-r--r--  fs/nfs/internal.h  4
-rw-r--r--  fs/nfs/namespace.c  77
-rw-r--r--  fs/nfs/nfs3acl.c  4
-rw-r--r--  fs/nfs/nfs3xdr.c  5
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c  9
-rw-r--r--  fs/nfs/nfs4proc.c  30
-rw-r--r--  fs/nfs/nfs4state.c  6
-rw-r--r--  fs/nfs/nfs4xdr.c  9
-rw-r--r--  fs/nfs/pnfs.c  2
-rw-r--r--  fs/nfs/write.c  2
-rw-r--r--  fs/nfs_common/nfsacl.c  54
-rw-r--r--  fs/nfsd/acl.h  59
-rw-r--r--  fs/nfsd/export.c  4
-rw-r--r--  fs/nfsd/idmap.h  62
-rw-r--r--  fs/nfsd/nfs3proc.c  8
-rw-r--r--  fs/nfsd/nfs4acl.c  2
-rw-r--r--  fs/nfsd/nfs4callback.c  151
-rw-r--r--  fs/nfsd/nfs4idmap.c  15
-rw-r--r--  fs/nfsd/nfs4proc.c  59
-rw-r--r--  fs/nfsd/nfs4recover.c  1
-rw-r--r--  fs/nfsd/nfs4state.c  243
-rw-r--r--  fs/nfsd/nfs4xdr.c  115
-rw-r--r--  fs/nfsd/nfsctl.c  4
-rw-r--r--  fs/nfsd/nfsd.h  1
-rw-r--r--  fs/nfsd/nfsproc.c  6
-rw-r--r--  fs/nfsd/nfssvc.c  2
-rw-r--r--  fs/nfsd/state.h  16
-rw-r--r--  fs/nfsd/vfs.c  90
-rw-r--r--  fs/nfsd/xdr4.h  9
-rw-r--r--  fs/nilfs2/super.c  3
-rw-r--r--  fs/ntfs/mft.c  11
-rw-r--r--  fs/ocfs2/Kconfig  3
-rw-r--r--  fs/ocfs2/file.c  10
-rw-r--r--  fs/ocfs2/super.c  5
-rw-r--r--  fs/open.c  6
-rw-r--r--  fs/pipe.c  12
-rw-r--r--  fs/posix_acl.c  17
-rw-r--r--  fs/proc/Kconfig  6
-rw-r--r--  fs/proc/base.c  4
-rw-r--r--  fs/proc/consoles.c  4
-rw-r--r--  fs/proc/meminfo.c  14
-rw-r--r--  fs/proc/page.c  14
-rw-r--r--  fs/proc/task_mmu.c  7
-rw-r--r--  fs/quota/dquot.c  18
-rw-r--r--  fs/quota/quota.c  41
-rw-r--r--  fs/reiserfs/super.c  17
-rw-r--r--  fs/squashfs/Kconfig  18
-rw-r--r--  fs/squashfs/Makefile  1
-rw-r--r--  fs/squashfs/block.c  9
-rw-r--r--  fs/squashfs/cache.c  1
-rw-r--r--  fs/squashfs/decompressor.c  16
-rw-r--r--  fs/squashfs/decompressor.h  9
-rw-r--r--  fs/squashfs/fragment.c  1
-rw-r--r--  fs/squashfs/id.c  1
-rw-r--r--  fs/squashfs/lzo_wrapper.c  1
-rw-r--r--  fs/squashfs/squashfs.h  8
-rw-r--r--  fs/squashfs/squashfs_fs.h  1
-rw-r--r--  fs/squashfs/squashfs_fs_i.h  6
-rw-r--r--  fs/squashfs/xattr_id.c  1
-rw-r--r--  fs/squashfs/xz_wrapper.c  147
-rw-r--r--  fs/squashfs/zlib_wrapper.c  21
-rw-r--r--  fs/stat.c  4
-rw-r--r--  fs/super.c  7
-rw-r--r--  fs/sysfs/Kconfig  2
-rw-r--r--  fs/xfs/Makefile  1
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c  7
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h  7
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c  191
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h  8
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c  587
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c  23
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c  57
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c  2
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c  11
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c  23
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h  33
-rw-r--r--  fs/xfs/quota/xfs_qm.c  46
-rw-r--r--  fs/xfs/support/debug.c  112
-rw-r--r--  fs/xfs/support/debug.h  25
-rw-r--r--  fs/xfs/xfs_alloc.c  10
-rw-r--r--  fs/xfs/xfs_alloc.h  41
-rw-r--r--  fs/xfs/xfs_bmap.c  61
-rw-r--r--  fs/xfs/xfs_buf_item.c  163
-rw-r--r--  fs/xfs/xfs_error.c  31
-rw-r--r--  fs/xfs/xfs_error.h  18
-rw-r--r--  fs/xfs/xfs_extfree_item.c  3
-rw-r--r--  fs/xfs/xfs_fsops.c  10
-rw-r--r--  fs/xfs/xfs_fsops.h  2
-rw-r--r--  fs/xfs/xfs_iomap.c  7
-rw-r--r--  fs/xfs/xfs_log.c  2
-rw-r--r--  fs/xfs/xfs_log.h  2
-rw-r--r--  fs/xfs/xfs_log_cil.c  15
-rw-r--r--  fs/xfs/xfs_log_recover.c  2
-rw-r--r--  fs/xfs/xfs_trans.c  43
244 files changed, 8470 insertions, 5175 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 771f457402d4..3db9caa57edc 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -30,15 +30,6 @@ config FS_MBCACHE
 source "fs/reiserfs/Kconfig"
 source "fs/jfs/Kconfig"
 
-config FS_POSIX_ACL
-# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
-#
-# NOTE: you can implement Posix ACLs without these helpers (XFS does).
-# Never use this symbol for ifdefs.
-#
-	bool
-	default n
-
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
@@ -47,11 +38,19 @@ source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
+# Posix ACL utility routines
+#
+# Note: Posix ACLs can be implemented without these helpers.  Never use
+# this symbol for ifdefs in core code.
+#
+config FS_POSIX_ACL
+	def_bool n
+
 config EXPORTFS
 	tristate
 
 config FILE_LOCKING
-	bool "Enable POSIX file locking API" if EMBEDDED
+	bool "Enable POSIX file locking API" if EXPERT
 	default y
 	help
 	  This option enables standard file locking support, required
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3bcec75c54a..1c8c6cc6de30 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 	call->server = server;
 
 	INIT_WORK(&call->work, SRXAFSCB_CallBack);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
 
@@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
 	call->server = server;
 
 	INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
 
@@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
 	call->server = server;
 
 	INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
 
@@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
 	call->state = AFS_CALL_REPLYING;
 
 	INIT_WORK(&call->work, SRXAFSCB_Probe);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
 
@@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
 	call->state = AFS_CALL_REPLYING;
 
 	INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
 
@@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
 	call->state = AFS_CALL_REPLYING;
 
 	INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
-	schedule_work(&call->work);
+	queue_work(afs_wq, &call->work);
 	return 0;
 }
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index e6a4ab980e31..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -66,6 +66,7 @@ const struct dentry_operations afs_fs_dentry_operations = {
 	.d_revalidate	= afs_d_revalidate,
 	.d_delete	= afs_d_delete,
 	.d_release	= afs_d_release,
+	.d_automount	= afs_d_automount,
 };
 
 #define AFS_DIR_HASHTBL_SIZE 128
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0747339011c3..db66c5201474 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
 	inode->i_generation = 0;
 
 	set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
-	inode->i_flags |= S_NOATIME;
+	set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
+	inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
 	unlock_new_inode(inode);
 	_leave(" = %p", inode);
 	return inode;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index ab6db5abaf53..5a9b6843bac1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -577,6 +577,7 @@ extern int afs_drop_inode(struct inode *);
 /*
  * main.c
  */
+extern struct workqueue_struct *afs_wq;
 extern struct afs_uuid afs_uuid;
 
 /*
@@ -591,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations;
 extern const struct inode_operations afs_autocell_inode_operations;
 extern const struct file_operations afs_mntpt_file_operations;
 
+extern struct vfsmount *afs_d_automount(struct path *);
 extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
 extern void afs_mntpt_kill_timer(void);
 
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cfd1cbe25b22..42dd2e499ed8 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -30,6 +30,7 @@ module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
 struct afs_uuid afs_uuid;
+struct workqueue_struct *afs_wq;
 
 /*
  * get a client UUID
@@ -87,10 +88,16 @@ static int __init afs_init(void)
 	if (ret < 0)
 		return ret;
 
+	/* create workqueue */
+	ret = -ENOMEM;
+	afs_wq = alloc_workqueue("afs", 0, 0);
+	if (!afs_wq)
+		return ret;
+
 	/* register the /proc stuff */
 	ret = afs_proc_init();
 	if (ret < 0)
-		return ret;
+		goto error_proc;
 
 #ifdef CONFIG_AFS_FSCACHE
 	/* we want to be able to cache */
@@ -140,6 +147,8 @@ error_cell_init:
 error_cache:
 #endif
 	afs_proc_cleanup();
+error_proc:
+	destroy_workqueue(afs_wq);
 	rcu_barrier();
 	printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
 	return ret;
@@ -163,7 +172,7 @@ static void __exit afs_exit(void)
 	afs_purge_servers();
 	afs_callback_update_kill();
 	afs_vlocation_purge();
-	flush_scheduled_work();
+	destroy_workqueue(afs_wq);
 	afs_cell_purge();
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_unregister_netfs(&afs_cache_netfs);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6153417caf57..aa59184151d0 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -24,7 +24,6 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
 				       struct dentry *dentry,
 				       struct nameidata *nd);
 static int afs_mntpt_open(struct inode *inode, struct file *file);
-static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
 static void afs_mntpt_expiry_timed_out(struct work_struct *work);
 
 const struct file_operations afs_mntpt_file_operations = {
@@ -34,13 +33,11 @@ const struct file_operations afs_mntpt_file_operations = {
 
 const struct inode_operations afs_mntpt_inode_operations = {
 	.lookup		= afs_mntpt_lookup,
-	.follow_link	= afs_mntpt_follow_link,
 	.readlink	= page_readlink,
 	.getattr	= afs_getattr,
 };
 
 const struct inode_operations afs_autocell_inode_operations = {
-	.follow_link	= afs_mntpt_follow_link,
 	.getattr	= afs_getattr,
 };
 
@@ -88,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 		_debug("symlink is a mountpoint");
 		spin_lock(&vnode->lock);
 		set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
+		vnode->vfs_inode.i_flags |= S_AUTOMOUNT;
 		spin_unlock(&vnode->lock);
 	}
 
@@ -238,52 +236,24 @@ error_no_devname:
 }
 
 /*
- * follow a link from a mountpoint directory, thus causing it to be mounted
+ * handle an automount point
  */
-static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
+struct vfsmount *afs_d_automount(struct path *path)
 {
 	struct vfsmount *newmnt;
-	int err;
 
-	_enter("%p{%s},{%s:%p{%s},}",
-	       dentry,
-	       dentry->d_name.name,
-	       nd->path.mnt->mnt_devname,
-	       dentry,
-	       nd->path.dentry->d_name.name);
-
-	dput(nd->path.dentry);
-	nd->path.dentry = dget(dentry);
+	_enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
 
-	newmnt = afs_mntpt_do_automount(nd->path.dentry);
-	if (IS_ERR(newmnt)) {
-		path_put(&nd->path);
-		return (void *)newmnt;
-	}
-
-	mntget(newmnt);
-	err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
-	switch (err) {
-	case 0:
-		path_put(&nd->path);
-		nd->path.mnt = newmnt;
-		nd->path.dentry = dget(newmnt->mnt_root);
-		schedule_delayed_work(&afs_mntpt_expiry_timer,
-				      afs_mntpt_expiry_timeout * HZ);
-		break;
-	case -EBUSY:
-		/* someone else made a mount here whilst we were busy */
-		while (d_mountpoint(nd->path.dentry) &&
-		       follow_down(&nd->path))
-			;
-		err = 0;
-	default:
-		mntput(newmnt);
-		break;
-	}
+	newmnt = afs_mntpt_do_automount(path->dentry);
+	if (IS_ERR(newmnt))
+		return newmnt;
 
-	_leave(" = %d", err);
-	return ERR_PTR(err);
+	mntget(newmnt); /* prevent immediate expiration */
+	mnt_set_expiry(newmnt, &afs_vfsmounts);
+	queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
+			   afs_mntpt_expiry_timeout * HZ);
+	_leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
+	return newmnt;
 }
 
 /*
@@ -295,8 +265,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work)
 
 	if (!list_empty(&afs_vfsmounts)) {
 		mark_mounts_for_expiry(&afs_vfsmounts);
-		schedule_delayed_work(&afs_mntpt_expiry_timer,
-				      afs_mntpt_expiry_timeout * HZ);
+		queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
+				   afs_mntpt_expiry_timeout * HZ);
 	}
 
 	_leave("");
@@ -310,6 +280,5 @@ void afs_mntpt_kill_timer(void)
 	_enter("");
 
 	ASSERT(list_empty(&afs_vfsmounts));
-	cancel_delayed_work(&afs_mntpt_expiry_timer);
-	flush_scheduled_work();
+	cancel_delayed_work_sync(&afs_mntpt_expiry_timer);
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 654d8fdbf01f..e45a323aebb4 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
 	if (!call) {
 		/* its an incoming call for our callback service */
 		skb_queue_tail(&afs_incoming_calls, skb);
-		schedule_work(&afs_collect_incoming_call_work);
+		queue_work(afs_wq, &afs_collect_incoming_call_work);
 	} else {
 		/* route the messages directly to the appropriate call */
 		skb_queue_tail(&call->rx_queue, skb);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 9fdc7fe3a7bc..d59b7516e943 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server)
 	if (atomic_read(&server->usage) == 0) {
 		list_move_tail(&server->grave, &afs_server_graveyard);
 		server->time_of_death = get_seconds();
-		schedule_delayed_work(&afs_server_reaper,
-				      afs_server_timeout * HZ);
+		queue_delayed_work(afs_wq, &afs_server_reaper,
+				   afs_server_timeout * HZ);
 	}
 	spin_unlock(&afs_server_graveyard_lock);
 	_leave(" [dead]");
@@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work)
 		expiry = server->time_of_death + afs_server_timeout;
 		if (expiry > now) {
 			delay = (expiry - now) * HZ;
-			if (!schedule_delayed_work(&afs_server_reaper, delay)) {
+			if (!queue_delayed_work(afs_wq, &afs_server_reaper,
+						delay)) {
 				cancel_delayed_work(&afs_server_reaper);
-				schedule_delayed_work(&afs_server_reaper,
-						      delay);
+				queue_delayed_work(afs_wq, &afs_server_reaper,
+						   delay);
 			}
 			break;
 		}
@@ -323,5 +324,5 @@ void __exit afs_purge_servers(void)
 {
 	afs_server_timeout = 0;
 	cancel_delayed_work(&afs_server_reaper);
-	schedule_delayed_work(&afs_server_reaper, 0);
+	queue_delayed_work(afs_wq, &afs_server_reaper, 0);
 }
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 9ac260d1361d..431984d2e372 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl)
 		_debug("buried");
 		list_move_tail(&vl->grave, &afs_vlocation_graveyard);
 		vl->time_of_death = get_seconds();
-		schedule_delayed_work(&afs_vlocation_reap,
-				      afs_vlocation_timeout * HZ);
+		queue_delayed_work(afs_wq, &afs_vlocation_reap,
+				   afs_vlocation_timeout * HZ);
 
 		/* suspend updates on this record */
 		if (!list_empty(&vl->update)) {
@@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work)
 		if (expiry > now) {
 			delay = (expiry - now) * HZ;
 			_debug("delay %lu", delay);
-			if (!schedule_delayed_work(&afs_vlocation_reap,
-						   delay)) {
+			if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
+						delay)) {
 				cancel_delayed_work(&afs_vlocation_reap);
-				schedule_delayed_work(&afs_vlocation_reap,
-						      delay);
+				queue_delayed_work(afs_wq, &afs_vlocation_reap,
+						   delay);
 			}
 			break;
 		}
@@ -620,7 +620,7 @@ void afs_vlocation_purge(void)
 	destroy_workqueue(afs_vlocation_update_worker);
 
 	cancel_delayed_work(&afs_vlocation_reap);
-	schedule_delayed_work(&afs_vlocation_reap, 0);
+	queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
 }
 
 /*
diff --git a/fs/aio.c b/fs/aio.c
index 5e00f15c54aa..fc557a3be0a9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -87,7 +87,7 @@ static int __init aio_setup(void)
 
 	aio_wq = create_workqueue("aio");
 	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
-	BUG_ON(!abe_pool);
+	BUG_ON(!aio_wq || !abe_pool);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index cbe57f3c4d89..c5567cb78432 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -233,7 +233,7 @@ static int __init anon_inode_init(void)
 	return 0;
 
 err_mntput:
-	mntput_long(anon_inode_mnt);
+	mntput(anon_inode_mnt);
 err_unregister_filesystem:
 	unregister_filesystem(&anon_inode_fs_type);
 err_exit:
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0fffe1c24cec..54f923792728 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -88,18 +88,9 @@ struct autofs_info {
 
 	uid_t uid;
 	gid_t gid;
-
-	mode_t mode;
-	size_t size;
-
-	void (*free)(struct autofs_info *);
-	union {
-		const char *symlink;
-	} u;
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
-#define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
 
 struct autofs_wait_queue {
@@ -176,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
 	return 0;
 }
 
-static inline void autofs4_copy_atime(struct file *src, struct file *dst)
-{
-	dst->f_path.dentry->d_inode->i_atime =
-		src->f_path.dentry->d_inode->i_atime;
-	return;
-}
-
-struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
+struct inode *autofs4_get_inode(struct super_block *, mode_t);
 void autofs4_free_ino(struct autofs_info *);
 
 /* Expiration */
@@ -212,16 +196,89 @@ void autofs_dev_ioctl_exit(void);
 
 extern const struct inode_operations autofs4_symlink_inode_operations;
 extern const struct inode_operations autofs4_dir_inode_operations;
-extern const struct inode_operations autofs4_root_inode_operations;
-extern const struct inode_operations autofs4_indirect_root_inode_operations;
-extern const struct inode_operations autofs4_direct_root_inode_operations;
 extern const struct file_operations autofs4_dir_operations;
 extern const struct file_operations autofs4_root_operations;
+extern const struct dentry_operations autofs4_dentry_operations;
+
+/* VFS automount flags management functions */
+
+static inline void __managed_dentry_set_automount(struct dentry *dentry)
+{
+	dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+}
+
+static inline void managed_dentry_set_automount(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_automount(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_automount(struct dentry *dentry)
+{
+	dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
+}
+
+static inline void managed_dentry_clear_automount(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_automount(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_set_transit(struct dentry *dentry)
+{
+	dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
+}
+
+static inline void managed_dentry_set_transit(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_transit(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_transit(struct dentry *dentry)
+{
+	dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
+}
+
+static inline void managed_dentry_clear_transit(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_transit(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_set_managed(struct dentry *dentry)
+{
+	dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_set_managed(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_set_managed(dentry);
+	spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_managed(struct dentry *dentry)
+{
+	dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_clear_managed(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	__managed_dentry_clear_managed(dentry);
+	spin_unlock(&dentry->d_lock);
+}
 
 /* Initializing function */
 
 int autofs4_fill_super(struct super_block *, void *, int);
-struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode);
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
+void autofs4_clean_ino(struct autofs_info *);
 
 /* Queue management functions */
 
@@ -229,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
 
-static inline int autofs4_follow_mount(struct path *path)
-{
-	int res = 0;
-
-	while (d_mountpoint(path->dentry)) {
-		int followed = follow_down(path);
-		if (!followed)
-			break;
-		res = 1;
-	}
-	return res;
-}
-
 static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
 {
 	return new_encode_dev(sbi->sb->s_dev);
@@ -294,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
 	return;
 }
 
-void autofs4_dentry_release(struct dentry *);
 extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index eff9a419469a..1442da4860e5 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
 		err = have_submounts(path.dentry);
 
-		if (follow_down(&path))
+		if (follow_down_one(&path))
 			magic = path.mnt->mnt_sb->s_magic;
 	}
 
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cc1d01365905..f43100b9662b 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
 	if (ino == NULL)
 		return 0;
 
-	/* No point expiring a pending mount */
-	if (ino->flags & AUTOFS_INF_PENDING)
-		return 0;
-
 	if (!do_now) {
 		/* Too young to die */
 		if (!timeout || time_after(ino->last_used + timeout, now))
@@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 
 	path_get(&path);
 
-	if (!follow_down(&path))
+	if (!follow_down_one(&path))
 		goto done;
 
 	if (is_autofs4_dentry(path.dentry)) {
@@ -100,7 +96,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
 	struct dentry *p, *ret;
 
 	if (prev == NULL)
-		return dget(prev);
+		return dget(root);
 
 	spin_lock(&autofs4_lock);
 relock:
@@ -137,7 +133,7 @@ again:
 		spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
 		/* Negative dentry - try next */
 		if (!simple_positive(ret)) {
-			spin_unlock(&ret->d_lock);
+			spin_unlock(&p->d_lock);
 			p = ret;
 			goto again;
 		}
@@ -283,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	unsigned long timeout;
 	struct dentry *root = dget(sb->s_root);
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
+	struct autofs_info *ino;
 
 	if (!root)
 		return NULL;
@@ -291,19 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	timeout = sbi->exp_timeout;
 
 	spin_lock(&sbi->fs_lock);
+	ino = autofs4_dentry_ino(root);
+	/* No point expiring a pending mount */
+	if (ino->flags & AUTOFS_INF_PENDING) {
+		spin_unlock(&sbi->fs_lock);
+		return NULL;
+	}
+	managed_dentry_set_transit(root);
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 		struct autofs_info *ino = autofs4_dentry_ino(root);
-		if (d_mountpoint(root)) {
-			ino->flags |= AUTOFS_INF_MOUNTPOINT;
-			spin_lock(&root->d_lock);
-			root->d_flags &= ~DCACHE_MOUNTED;
-			spin_unlock(&root->d_lock);
-		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
 		init_completion(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		return root;
 	}
+	managed_dentry_clear_transit(root);
 	spin_unlock(&sbi->fs_lock);
 	dput(root);
 
@@ -340,6 +339,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	while ((dentry = get_next_positive_dentry(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
+		/* No point expiring a pending mount */
+		if (ino->flags & AUTOFS_INF_PENDING)
+			goto cont;
+		managed_dentry_set_transit(dentry);
 
 		/*
 		 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -399,6 +402,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			}
 		}
 next:
+		managed_dentry_clear_transit(dentry);
+cont:
 		spin_unlock(&sbi->fs_lock);
 	}
 	return NULL;
@@ -479,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
+	if (!d_unhashed(dentry))
+		managed_dentry_clear_transit(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -504,18 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
 
 	spin_lock(&sbi->fs_lock);
-	if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-		spin_lock(&sb->s_root->d_lock);
-		/*
-		 * If we haven't been expired away, then reset
-		 * mounted status.
-		 */
-		if (mnt->mnt_parent != mnt)
-			sb->s_root->d_flags |= DCACHE_MOUNTED;
-		spin_unlock(&sb->s_root->d_lock);
-		ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-	}
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
+	spin_lock(&dentry->d_lock);
+	if (ret)
+		__managed_dentry_clear_transit(dentry);
+	else {
+		if ((IS_ROOT(dentry) ||
+		    (autofs_type_indirect(sbi->type) &&
+		     IS_ROOT(dentry->d_parent))) &&
+		    !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+			__managed_dentry_set_automount(dentry);
+	}
+	spin_unlock(&dentry->d_lock);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a7bdb9dcac84..180fa2425e49 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -22,77 +22,27 @@
 #include "autofs_i.h"
 #include <linux/module.h>
 
-static void ino_lnkfree(struct autofs_info *ino)
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
 {
-	if (ino->u.symlink) {
-		kfree(ino->u.symlink);
-		ino->u.symlink = NULL;
-	}
-}
-
-struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
-				     struct autofs_sb_info *sbi, mode_t mode)
-{
-	int reinit = 1;
-
-	if (ino == NULL) {
-		reinit = 0;
-		ino = kmalloc(sizeof(*ino), GFP_KERNEL);
-	}
-
-	if (ino == NULL)
-		return NULL;
-
-	if (!reinit) {
-		ino->flags = 0;
-		ino->inode = NULL;
-		ino->dentry = NULL;
-		ino->size = 0;
+	struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+	if (ino) {
 		INIT_LIST_HEAD(&ino->active);
-		ino->active_count = 0;
 		INIT_LIST_HEAD(&ino->expiring);
-		atomic_set(&ino->count, 0);
+		ino->last_used = jiffies;
+		ino->sbi = sbi;
 	}
+	return ino;
+}
 
+void autofs4_clean_ino(struct autofs_info *ino)
+{
 	ino->uid = 0;
 	ino->gid = 0;
-	ino->mode = mode;
 	ino->last_used = jiffies;
-
-	ino->sbi = sbi;
-
-	if (reinit && ino->free)
-		(ino->free)(ino);
-
-	memset(&ino->u, 0, sizeof(ino->u));
-
-	ino->free = NULL;
-
-	if (S_ISLNK(mode))
-		ino->free = ino_lnkfree;
-
-	return ino;
 }
 
 void autofs4_free_ino(struct autofs_info *ino)
 {
-	struct autofs_info *p_ino;
-
-	if (ino->dentry) {
-		ino->dentry->d_fsdata = NULL;
-		if (ino->dentry->d_inode) {
-			struct dentry *parent = ino->dentry->d_parent;
-			if (atomic_dec_and_test(&ino->count)) {
-				p_ino = autofs4_dentry_ino(parent);
-				if (p_ino && parent != ino->dentry)
-					atomic_dec(&p_ino->count);
-			}
-			dput(ino->dentry);
-		}
-		ino->dentry = NULL;
-	}
-	if (ino->free)
-		(ino->free)(ino);
 	kfree(ino);
 }
 
@@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
+static void autofs4_evict_inode(struct inode *inode)
+{
+	end_writeback(inode);
+	kfree(inode->i_private);
+}
+
 static const struct super_operations autofs4_sops = {
 	.statfs		= simple_statfs,
 	.show_options	= autofs4_show_options,
+	.evict_inode	= autofs4_evict_inode,
 };
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
@@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 	return (*pipefd < 0);
 }
 
-static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
-{
-	struct autofs_info *ino;
-
-	ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
-	if (!ino)
-		return NULL;
-
-	return ino;
-}
-
-static const struct dentry_operations autofs4_sb_dentry_operations = {
-	.d_release	= autofs4_dentry_release,
-};
-
 int autofs4_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode * root_inode;
@@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	s->s_blocksize_bits = 10;
 	s->s_magic = AUTOFS_SUPER_MAGIC;
 	s->s_op = &autofs4_sops;
+	s->s_d_op = &autofs4_dentry_operations;
 	s->s_time_gran = 1;
 
 	/*
 	 * Get the root inode and dentry, but defer checking for errors.
 	 */
-	ino = autofs4_mkroot(sbi);
+	ino = autofs4_new_ino(sbi);
 	if (!ino)
 		goto fail_free;
-	root_inode = autofs4_get_inode(s, ino);
+	root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
 	if (!root_inode)
 		goto fail_ino;
 
@@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_iput;
 	pipe = NULL;
 
-	d_set_d_op(root, &autofs4_sb_dentry_operations);
 	root->d_fsdata = ino;
 
 	/* Can this call block? */
@@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_dput;
 	}
 
+	if (autofs_type_trigger(sbi->type))
+		__managed_dentry_set_managed(root);
+
 	root_inode->i_fop = &autofs4_root_operations;
-	root_inode->i_op = autofs_type_trigger(sbi->type) ?
-		&autofs4_direct_root_inode_operations :
-		&autofs4_indirect_root_inode_operations;
+	root_inode->i_op = &autofs4_dir_inode_operations;
 
 	/* Couldn't this be tested earlier? */
 	if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
@@ -383,16 +326,14 @@ fail_unlock:
 	return -EINVAL;
 }
 
-struct inode *autofs4_get_inode(struct super_block *sb,
-				struct autofs_info *inf)
+struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
 {
 	struct inode *inode = new_inode(sb);
 
 	if (inode == NULL)
 		return NULL;
 
-	inf->inode = inode;
-	inode->i_mode = inf->mode;
+	inode->i_mode = mode;
 	if (sb->s_root) {
 		inode->i_uid = sb->s_root->d_inode->i_uid;
 		inode->i_gid = sb->s_root->d_inode->i_gid;
@@ -400,12 +341,11 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_ino = get_next_ino();
 
-	if (S_ISDIR(inf->mode)) {
+	if (S_ISDIR(mode)) {
 		inode->i_nlink = 2;
 		inode->i_op = &autofs4_dir_inode_operations;
 		inode->i_fop = &autofs4_dir_operations;
-	} else if (S_ISLNK(inf->mode)) {
-		inode->i_size = inf->size;
+	} else if (S_ISLNK(mode)) {
 		inode->i_op = &autofs4_symlink_inode_operations;
 	}
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 651e4ef563b1..014e7aba3b08 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -35,10 +35,9 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
 #endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
-static void *autofs4_follow_link(struct dentry *, struct nameidata *);
-
-#define TRIGGER_FLAGS   (LOOKUP_CONTINUE | LOOKUP_DIRECTORY)
-#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
+static struct vfsmount *autofs4_d_automount(struct path *);
+static int autofs4_d_manage(struct dentry *, bool, bool);
+static void autofs4_dentry_release(struct dentry *);
 
 const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
@@ -60,7 +59,7 @@ const struct file_operations autofs4_dir_operations = {
 	.llseek		= dcache_dir_lseek,
 };
 
-const struct inode_operations autofs4_indirect_root_inode_operations = {
+const struct inode_operations autofs4_dir_inode_operations = {
 	.lookup		= autofs4_lookup,
 	.unlink		= autofs4_dir_unlink,
 	.symlink	= autofs4_dir_symlink,
@@ -68,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = {
 	.rmdir		= autofs4_dir_rmdir,
 };
 
-const struct inode_operations autofs4_direct_root_inode_operations = {
-	.lookup		= autofs4_lookup,
-	.unlink		= autofs4_dir_unlink,
-	.mkdir		= autofs4_dir_mkdir,
-	.rmdir		= autofs4_dir_rmdir,
-	.follow_link	= autofs4_follow_link,
-};
-
-const struct inode_operations autofs4_dir_inode_operations = {
-	.lookup		= autofs4_lookup,
-	.unlink		= autofs4_dir_unlink,
-	.symlink	= autofs4_dir_symlink,
-	.mkdir		= autofs4_dir_mkdir,
-	.rmdir		= autofs4_dir_rmdir,
+const struct dentry_operations autofs4_dentry_operations = {
+	.d_automount	= autofs4_d_automount,
+	.d_manage	= autofs4_d_manage,
+	.d_release	= autofs4_dentry_release,
 };
 
 static void autofs4_add_active(struct dentry *dentry)
@@ -116,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry)
 	return;
 }
 
-static unsigned int autofs4_need_mount(unsigned int flags)
-{
-	unsigned int res = 0;
-	if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
-		res = 1;
-	return res;
-}
-
 static int autofs4_dir_open(struct inode *inode, struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
@@ -158,278 +139,27 @@ out:
 	return dcache_dir_open(inode, file);
 }
 
-static int try_to_fill_dentry(struct dentry *dentry, int flags)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	int status;
-
-	DPRINTK("dentry=%p %.*s ino=%p",
-		dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-
-	/*
-	 * Wait for a pending mount, triggering one if there
-	 * isn't one already
-	 */
-	if (dentry->d_inode == NULL) {
-		DPRINTK("waiting for mount name=%.*s",
-			dentry->d_name.len, dentry->d_name.name);
-
-		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-
-		DPRINTK("mount done status=%d", status);
-
-		/* Turn this into a real negative dentry? */
-		if (status == -ENOENT) {
-			spin_lock(&sbi->fs_lock);
-			ino->flags &= ~AUTOFS_INF_PENDING;
-			spin_unlock(&sbi->fs_lock);
-			return status;
-		} else if (status) {
-			/* Return a negative dentry, but leave it "pending" */
-			return status;
-		}
-		/* Trigger mount for path component or follow link */
-	} else if (ino->flags & AUTOFS_INF_PENDING ||
-			autofs4_need_mount(flags)) {
-		DPRINTK("waiting for mount name=%.*s",
-			dentry->d_name.len, dentry->d_name.name);
-
-		spin_lock(&sbi->fs_lock);
-		ino->flags |= AUTOFS_INF_PENDING;
-		spin_unlock(&sbi->fs_lock);
-		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-
-		DPRINTK("mount done status=%d", status);
-
-		if (status) {
-			spin_lock(&sbi->fs_lock);
-			ino->flags &= ~AUTOFS_INF_PENDING;
-			spin_unlock(&sbi->fs_lock);
-			return status;
-		}
-	}
-
-	/* Initialize expiry counter after successful mount */
-	ino->last_used = jiffies;
-
-	spin_lock(&sbi->fs_lock);
-	ino->flags &= ~AUTOFS_INF_PENDING;
-	spin_unlock(&sbi->fs_lock);
-
-	return 0;
-}
-
-/* For autofs direct mounts the follow link triggers the mount */
-static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	int oz_mode = autofs4_oz_mode(sbi);
-	unsigned int lookup_type;
-	int status;
-
-	DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
-		dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
-		nd->flags);
-	/*
-	 * For an expire of a covered direct or offset mount we need
-	 * to break out of follow_down() at the autofs mount trigger
-	 * (d_mounted--), so we can see the expiring flag, and manage
-	 * the blocking and following here until the expire is completed.
-	 */
-	if (oz_mode) {
-		spin_lock(&sbi->fs_lock);
-		if (ino->flags & AUTOFS_INF_EXPIRING) {
-			spin_unlock(&sbi->fs_lock);
-			/* Follow down to our covering mount. */
-			if (!follow_down(&nd->path))
-				goto done;
-			goto follow;
-		}
-		spin_unlock(&sbi->fs_lock);
-		goto done;
-	}
-
-	/* If an expire request is pending everyone must wait. */
-	autofs4_expire_wait(dentry);
-
-	/* We trigger a mount for almost all flags */
-	lookup_type = autofs4_need_mount(nd->flags);
-	spin_lock(&sbi->fs_lock);
-	spin_lock(&autofs4_lock);
-	spin_lock(&dentry->d_lock);
-	if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
-		spin_unlock(&dentry->d_lock);
-		spin_unlock(&autofs4_lock);
-		spin_unlock(&sbi->fs_lock);
-		goto follow;
-	}
-
-	/*
-	 * If the dentry contains directories then it is an autofs
-	 * multi-mount with no root mount offset. So don't try to
-	 * mount it again.
-	 */
-	if (ino->flags & AUTOFS_INF_PENDING ||
-	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
-		spin_unlock(&dentry->d_lock);
-		spin_unlock(&autofs4_lock);
-		spin_unlock(&sbi->fs_lock);
-
-		status = try_to_fill_dentry(dentry, nd->flags);
-		if (status)
-			goto out_error;
-
-		goto follow;
-	}
-	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
-	spin_unlock(&sbi->fs_lock);
-follow:
-	/*
-	 * If there is no root mount it must be an autofs
-	 * multi-mount with no root offset so we don't need
-	 * to follow it.
-	 */
-	if (d_mountpoint(dentry)) {
-		if (!autofs4_follow_mount(&nd->path)) {
-			status = -ENOENT;
-			goto out_error;
-		}
-	}
-
-done:
-	return NULL;
-
-out_error:
-	path_put(&nd->path);
-	return ERR_PTR(status);
-}
-
-/*
- * Revalidate is called on every cache lookup.  Some of those
- * cache lookups may actually happen while the dentry is not
- * yet completely filled in, and revalidate has to delay such
- * lookups..
- */
-static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-	struct inode *dir;
-	struct autofs_sb_info *sbi;
-	int oz_mode;
-	int flags = nd ? nd->flags : 0;
-	int status = 1;
-
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	dir = dentry->d_parent->d_inode;
-	sbi = autofs4_sbi(dir->i_sb);
-	oz_mode = autofs4_oz_mode(sbi);
-
-	/* Pending dentry */
-	spin_lock(&sbi->fs_lock);
-	if (autofs4_ispending(dentry)) {
-		/* The daemon never causes a mount to trigger */
-		spin_unlock(&sbi->fs_lock);
-
-		if (oz_mode)
-			return 1;
-
-		/*
-		 * If the directory has gone away due to an expire
-		 * we have been called as ->d_revalidate() and so
-		 * we need to return false and proceed to ->lookup().
-		 */
-		if (autofs4_expire_wait(dentry) == -EAGAIN)
-			return 0;
-
-		/*
-		 * A zero status is success otherwise we have a
-		 * negative error code.
-		 */
-		status = try_to_fill_dentry(dentry, flags);
-		if (status == 0)
-			return 1;
-
-		return status;
-	}
-	spin_unlock(&sbi->fs_lock);
-
-	/* Negative dentry.. invalidate if "old" */
-	if (dentry->d_inode == NULL)
-		return 0;
-
-	/* Check for a non-mountpoint directory with no contents */
-	spin_lock(&autofs4_lock);
-	spin_lock(&dentry->d_lock);
-	if (S_ISDIR(dentry->d_inode->i_mode) &&
-	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-		DPRINTK("dentry=%p %.*s, emptydir",
-			dentry, dentry->d_name.len, dentry->d_name.name);
-		spin_unlock(&dentry->d_lock);
-		spin_unlock(&autofs4_lock);
-
-		/* The daemon never causes a mount to trigger */
-		if (oz_mode)
-			return 1;
-
-		/*
-		 * A zero status is success otherwise we have a
-		 * negative error code.
-		 */
-		status = try_to_fill_dentry(dentry, flags);
-		if (status == 0)
-			return 1;
-
-		return status;
-	}
-	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
-
-	return 1;
-}
-
-void autofs4_dentry_release(struct dentry *de)
-{
-	struct autofs_info *inf;
-
-	DPRINTK("releasing %p", de);
-
-	inf = autofs4_dentry_ino(de);
-	de->d_fsdata = NULL;
-
-	if (inf) {
-		struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
-
-		if (sbi) {
-			spin_lock(&sbi->lookup_lock);
-			if (!list_empty(&inf->active))
-				list_del(&inf->active);
-			if (!list_empty(&inf->expiring))
-				list_del(&inf->expiring);
-			spin_unlock(&sbi->lookup_lock);
-		}
-
-		inf->dentry = NULL;
-		inf->inode = NULL;
-
-		autofs4_free_ino(inf);
-	}
-}
-
-/* For dentries of directories in the root dir */
-static const struct dentry_operations autofs4_root_dentry_operations = {
-	.d_revalidate	= autofs4_revalidate,
-	.d_release	= autofs4_dentry_release,
-};
-
-/* For other dentries */
-static const struct dentry_operations autofs4_dentry_operations = {
-	.d_revalidate	= autofs4_revalidate,
-	.d_release	= autofs4_dentry_release,
-};
+static void autofs4_dentry_release(struct dentry *de)
+{
+	struct autofs_info *ino = autofs4_dentry_ino(de);
+	struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
+
+	DPRINTK("releasing %p", de);
+
+	if (!ino)
+		return;
+
+	if (sbi) {
+		spin_lock(&sbi->lookup_lock);
+		if (!list_empty(&ino->active))
+			list_del(&ino->active);
+		if (!list_empty(&ino->expiring))
+			list_del(&ino->expiring);
+		spin_unlock(&sbi->lookup_lock);
+	}
+
+	autofs4_free_ino(ino);
+}
 
 static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 {
@@ -541,51 +271,246 @@ next:
541 return NULL; 271 return NULL;
542} 272}
543 273
274static int autofs4_mount_wait(struct dentry *dentry)
275{
276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
277 struct autofs_info *ino = autofs4_dentry_ino(dentry);
278 int status;
279
280 if (ino->flags & AUTOFS_INF_PENDING) {
281 DPRINTK("waiting for mount name=%.*s",
282 dentry->d_name.len, dentry->d_name.name);
283 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
284 DPRINTK("mount wait done status=%d", status);
285 ino->last_used = jiffies;
286 return status;
287 }
288 return 0;
289}
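autofs4_mount_wait() above serializes lookups behind a single in-flight request: while AUTOFS_INF_PENDING is set, every caller sleeps until the daemon answers. A minimal userspace sketch of that pattern, with a pthread condition variable standing in for the autofs wait queue (all names here are illustrative, not kernel API):

    #include <pthread.h>
    #include <stdbool.h>

    /* Stand-in for the per-dentry autofs_info state (illustrative only). */
    struct mount_request {
        pthread_mutex_t lock;
        pthread_cond_t done;
        bool pending;       /* models AUTOFS_INF_PENDING */
        int status;         /* daemon's result, 0 on success */
    };

    /* Models autofs4_mount_wait(): sleep only while a request is in flight. */
    static int mount_wait(struct mount_request *mr)
    {
        int status;

        pthread_mutex_lock(&mr->lock);
        while (mr->pending)
            pthread_cond_wait(&mr->done, &mr->lock);
        status = mr->status;
        pthread_mutex_unlock(&mr->lock);
        return status;
    }

    /* Called by the "daemon" side once the mount attempt finishes. */
    static void mount_complete(struct mount_request *mr, int status)
    {
        pthread_mutex_lock(&mr->lock);
        mr->pending = false;
        mr->status = status;
        pthread_cond_broadcast(&mr->done);
        pthread_mutex_unlock(&mr->lock);
    }

The daemon side sets pending before issuing the request and calls mount_complete() with the result, waking every waiter at once.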
290
291static int do_expire_wait(struct dentry *dentry)
292{
293 struct dentry *expiring;
294
295 expiring = autofs4_lookup_expiring(dentry);
296 if (!expiring)
297 return autofs4_expire_wait(dentry);
298 else {
299 /*
 300 * If we are racing with expire, the request might not
 301 * be quite complete, but the directory has been removed,
 302 * so it must have been successful; just wait for it.
303 */
304 autofs4_expire_wait(expiring);
305 autofs4_del_expiring(expiring);
306 dput(expiring);
307 }
308 return 0;
309}
310
311static struct dentry *autofs4_mountpoint_changed(struct path *path)
312{
313 struct dentry *dentry = path->dentry;
314 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
315
316 /*
317 * If this is an indirect mount the dentry could have gone away
318 * as a result of an expire and a new one created.
319 */
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent;
322 struct dentry *new = d_lookup(parent, &dentry->d_name);
323 if (!new)
324 return NULL;
325 dput(path->dentry);
326 path->dentry = new;
327 }
328 return path->dentry;
329}
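autofs4_mountpoint_changed() handles the case where an indirect-mount dentry was dropped and re-created while we waited: if the cached dentry is unhashed, redo the name lookup under the parent and switch the path over to the live instance. A rough userspace model of that re-lookup (structures are illustrative, not the kernel dcache):

    #include <stdio.h>
    #include <string.h>

    struct dentry {
        const char *name;
        int hashed;               /* 0 once the instance was dropped */
    };

    struct dir {
        struct dentry *children[8];
        int n;
    };

    static struct dentry *dir_lookup(struct dir *d, const char *name)
    {
        for (int i = 0; i < d->n; i++)
            if (d->children[i]->hashed &&
                strcmp(d->children[i]->name, name) == 0)
                return d->children[i];
        return NULL;
    }

    /* Models autofs4_mountpoint_changed(): switch to the live instance
     * if our cached one went away while we waited. */
    static struct dentry *mountpoint_changed(struct dir *parent,
                                             struct dentry *dentry)
    {
        if (!dentry->hashed) {
            struct dentry *new = dir_lookup(parent, dentry->name);
            if (!new)
                return NULL;      /* caller maps this to -ENOENT */
            return new;
        }
        return dentry;
    }

    int main(void)
    {
        struct dentry stale = { "mnt", 0 }, fresh = { "mnt", 1 };
        struct dir parent = { { &fresh }, 1 };

        printf("%s\n", mountpoint_changed(&parent, &stale) == &fresh ?
               "switched to new dentry" : "kept old dentry");
        return 0;
    }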
330
331static struct vfsmount *autofs4_d_automount(struct path *path)
332{
333 struct dentry *dentry = path->dentry;
334 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
335 struct autofs_info *ino = autofs4_dentry_ino(dentry);
336 int status;
337
338 DPRINTK("dentry=%p %.*s",
339 dentry, dentry->d_name.len, dentry->d_name.name);
340
341 /*
342 * Someone may have manually umounted this or it was a submount
343 * that has gone away.
344 */
345 spin_lock(&dentry->d_lock);
346 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
347 if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
348 (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
349 __managed_dentry_set_transit(path->dentry);
350 }
351 spin_unlock(&dentry->d_lock);
352
353 /* The daemon never triggers a mount. */
354 if (autofs4_oz_mode(sbi))
355 return NULL;
356
357 /*
358 * If an expire request is pending everyone must wait.
359 * If the expire fails we're still mounted so continue
360 * the follow and return. A return of -EAGAIN (which only
361 * happens with indirect mounts) means the expire completed
362 * and the directory was removed, so just go ahead and try
363 * the mount.
364 */
365 status = do_expire_wait(dentry);
366 if (status && status != -EAGAIN)
367 return NULL;
368
369 /* Callback to the daemon to perform the mount or wait */
370 spin_lock(&sbi->fs_lock);
371 if (ino->flags & AUTOFS_INF_PENDING) {
372 spin_unlock(&sbi->fs_lock);
373 status = autofs4_mount_wait(dentry);
374 if (status)
375 return ERR_PTR(status);
376 spin_lock(&sbi->fs_lock);
377 goto done;
378 }
379
380 /*
381 * If the dentry is a symlink it's equivalent to a directory
382 * having d_mountpoint() true, so there's no need to call back
383 * to the daemon.
384 */
385 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
386 goto done;
387 if (!d_mountpoint(dentry)) {
388 /*
389 * It's possible that user space hasn't removed directories
390 * after umounting a rootless multi-mount, although it
391 * should. For v5 have_submounts() is sufficient to handle
392 * this because the leaves of the directory tree under the
393 * mount never trigger mounts themselves (they have an autofs
394 * trigger mount mounted on them). But v4 pseudo direct mounts
 395 * do need the leaves to trigger mounts. In this case we
 396 * have no choice but to use the list_empty() check and
 397 * require that user space behaves.
398 */
399 if (sbi->version > 4) {
400 if (have_submounts(dentry))
401 goto done;
402 } else {
403 spin_lock(&dentry->d_lock);
404 if (!list_empty(&dentry->d_subdirs)) {
405 spin_unlock(&dentry->d_lock);
406 goto done;
407 }
408 spin_unlock(&dentry->d_lock);
409 }
410 ino->flags |= AUTOFS_INF_PENDING;
411 spin_unlock(&sbi->fs_lock);
412 status = autofs4_mount_wait(dentry);
413 if (status)
414 return ERR_PTR(status);
415 spin_lock(&sbi->fs_lock);
416 ino->flags &= ~AUTOFS_INF_PENDING;
417 }
418done:
419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
420 /*
421 * Any needed mounting has been completed and the path updated
422 * so turn this into a normal dentry so we don't continually
423 * call ->d_automount() and ->d_manage().
424 */
425 spin_lock(&dentry->d_lock);
426 __managed_dentry_clear_transit(dentry);
427 /*
428 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
429 * symlinks as in all other cases the dentry will be covered by
430 * an actual mount so ->d_automount() won't be called during
431 * the follow.
432 */
433 if ((!d_mountpoint(dentry) &&
434 !list_empty(&dentry->d_subdirs)) ||
435 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
436 __managed_dentry_clear_automount(dentry);
437 spin_unlock(&dentry->d_lock);
438 }
439 spin_unlock(&sbi->fs_lock);
440
441 /* Mount succeeded, check if we ended up with a new dentry */
442 dentry = autofs4_mountpoint_changed(path);
443 if (!dentry)
444 return ERR_PTR(-ENOENT);
445
446 return NULL;
447}
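The tail of autofs4_d_automount() is a flag dance: once the mount (or symlink) is in place, the transit bit is cleared so the VFS stops calling ->d_automount()/->d_manage() on every walk, and the automount bit is cleared only for rootless multi-mounts and symlinks, since covered dentries never reach ->d_automount() anyway. A toy model of that "do the work once, then clear the trigger bits" shape (flag names here are made up, not the real DCACHE_* flags):

    #include <stdio.h>

    #define NEED_AUTOMOUNT 0x1
    #define MANAGE_TRANSIT 0x2

    struct node {
        unsigned flags;
        int mounted;
    };

    static void automount(struct node *n)
    {
        if (!n->mounted) {
            /* ... call out to the daemon and wait for the mount ... */
            n->mounted = 1;
        }
        /* Work is done: clear the trigger bits so future path walks
         * stop calling back into this function. */
        n->flags &= ~(NEED_AUTOMOUNT | MANAGE_TRANSIT);
    }

    int main(void)
    {
        struct node n = { NEED_AUTOMOUNT | MANAGE_TRANSIT, 0 };

        automount(&n);
        printf("flags=%#x mounted=%d\n", n.flags, n.mounted);
        return 0;
    }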
448
449int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
450{
451 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
452
453 DPRINTK("dentry=%p %.*s",
454 dentry, dentry->d_name.len, dentry->d_name.name);
455
456 /* The daemon never waits. */
457 if (autofs4_oz_mode(sbi) || mounting_here) {
458 if (!d_mountpoint(dentry))
459 return -EISDIR;
460 return 0;
461 }
462
463 /* We need to sleep, so we need pathwalk to be in ref-mode */
464 if (rcu_walk)
465 return -ECHILD;
466
467 /* Wait for pending expires */
468 do_expire_wait(dentry);
469
470 /*
471 * This dentry may be under construction so wait on mount
472 * completion.
473 */
474 return autofs4_mount_wait(dentry);
475}
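Returning -ECHILD from ->d_manage() under RCU-walk is the standard "can't sleep here, retry in ref-walk" convention. The same try-then-fall-back shape in plain C, with a trylock standing in for the walk mode (illustrative, not the VFS API):

    #include <errno.h>
    #include <pthread.h>
    #include <stdbool.h>

    /* A lookup step that must not sleep in "rcu" (lockless) mode;
     * -ECHILD tells the caller to retry in the slower ref-walk mode. */
    static int manage_step(pthread_mutex_t *lock, bool rcu_walk)
    {
        if (rcu_walk) {
            if (pthread_mutex_trylock(lock) != 0)
                return -ECHILD;         /* would block: make caller retry */
        } else {
            pthread_mutex_lock(lock);   /* sleeping is fine here */
        }
        /* ... wait for pending expires / mounts under the lock ... */
        pthread_mutex_unlock(lock);
        return 0;
    }

    int main(void)
    {
        pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

        return manage_step(&lock, true);   /* 0: lock was uncontended */
    }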
476
544/* Lookups in the root directory */ 477/* Lookups in the root directory */
545static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 478static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
546{ 479{
547 struct autofs_sb_info *sbi; 480 struct autofs_sb_info *sbi;
548 struct autofs_info *ino; 481 struct autofs_info *ino;
549 struct dentry *expiring, *active; 482 struct dentry *active;
550 int oz_mode;
551 483
552 DPRINTK("name = %.*s", 484 DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name);
553 dentry->d_name.len, dentry->d_name.name);
554 485
555 /* File name too long to exist */ 486 /* File name too long to exist */
556 if (dentry->d_name.len > NAME_MAX) 487 if (dentry->d_name.len > NAME_MAX)
557 return ERR_PTR(-ENAMETOOLONG); 488 return ERR_PTR(-ENAMETOOLONG);
558 489
559 sbi = autofs4_sbi(dir->i_sb); 490 sbi = autofs4_sbi(dir->i_sb);
560 oz_mode = autofs4_oz_mode(sbi);
561 491
562 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 492 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
563 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); 493 current->pid, task_pgrp_nr(current), sbi->catatonic,
494 autofs4_oz_mode(sbi));
564 495
565 active = autofs4_lookup_active(dentry); 496 active = autofs4_lookup_active(dentry);
566 if (active) { 497 if (active) {
567 dentry = active; 498 return active;
568 ino = autofs4_dentry_ino(dentry);
569 } else { 499 } else {
570 /* 500 /*
571 * Mark the dentry incomplete but don't hash it. We do this 501 * A dentry that is not within the root can never trigger a
572 * to serialize our inode creation operations (symlink and 502 * mount operation, unless the directory already exists, so we
573 * mkdir) which prevents deadlock during the callback to 503 * can return fail immediately. The daemon however does need
574 * the daemon. Subsequent user space lookups for the same 504 * to create directories within the file system.
575 * dentry are placed on the wait queue while the daemon
 576 * itself is allowed passage unrestricted so the create
577 * operation itself can then hash the dentry. Finally,
578 * we check for the hashed dentry and return the newly
579 * hashed dentry.
580 */ 505 */
581 d_set_d_op(dentry, &autofs4_root_dentry_operations); 506 if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
507 return ERR_PTR(-ENOENT);
582 508
583 /* 509 /* Mark entries in the root as mount triggers */
584 * And we need to ensure that the same dentry is used for 510 if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
585 * all following lookup calls until it is hashed so that 511 __managed_dentry_set_managed(dentry);
586 * the dentry flags are persistent throughout the request. 512
587 */ 513 ino = autofs4_new_ino(sbi);
588 ino = autofs4_init_ino(NULL, sbi, 0555);
589 if (!ino) 514 if (!ino)
590 return ERR_PTR(-ENOMEM); 515 return ERR_PTR(-ENOMEM);
591 516
@@ -596,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
596 521
597 d_instantiate(dentry, NULL); 522 d_instantiate(dentry, NULL);
598 } 523 }
599
600 if (!oz_mode) {
601 mutex_unlock(&dir->i_mutex);
602 expiring = autofs4_lookup_expiring(dentry);
603 if (expiring) {
604 /*
 605 * If we are racing with expire, the request might not
 606 * be quite complete, but the directory has been removed,
 607 * so it must have been successful; just wait for it.
608 */
609 autofs4_expire_wait(expiring);
610 autofs4_del_expiring(expiring);
611 dput(expiring);
612 }
613
614 spin_lock(&sbi->fs_lock);
615 ino->flags |= AUTOFS_INF_PENDING;
616 spin_unlock(&sbi->fs_lock);
617 if (dentry->d_op && dentry->d_op->d_revalidate)
618 (dentry->d_op->d_revalidate)(dentry, nd);
619 mutex_lock(&dir->i_mutex);
620 }
621
622 /*
623 * If we are still pending, check if we had to handle
624 * a signal. If so we can force a restart..
625 */
626 if (ino->flags & AUTOFS_INF_PENDING) {
627 /* See if we were interrupted */
628 if (signal_pending(current)) {
629 sigset_t *sigset = &current->pending.signal;
630 if (sigismember (sigset, SIGKILL) ||
631 sigismember (sigset, SIGQUIT) ||
632 sigismember (sigset, SIGINT)) {
633 if (active)
634 dput(active);
635 return ERR_PTR(-ERESTARTNOINTR);
636 }
637 }
638 if (!oz_mode) {
639 spin_lock(&sbi->fs_lock);
640 ino->flags &= ~AUTOFS_INF_PENDING;
641 spin_unlock(&sbi->fs_lock);
642 }
643 }
644
645 /*
646 * If this dentry is unhashed, then we shouldn't honour this
647 * lookup. Returning ENOENT here doesn't do the right thing
648 * for all system calls, but it should be OK for the operations
649 * we permit from an autofs.
650 */
651 if (!oz_mode && d_unhashed(dentry)) {
652 /*
653 * A user space application can (and has done in the past)
654 * remove and re-create this directory during the callback.
655 * This can leave us with an unhashed dentry, but a
656 * successful mount! So we need to perform another
657 * cached lookup in case the dentry now exists.
658 */
659 struct dentry *parent = dentry->d_parent;
660 struct dentry *new = d_lookup(parent, &dentry->d_name);
661 if (new != NULL)
662 dentry = new;
663 else
664 dentry = ERR_PTR(-ENOENT);
665
666 if (active)
667 dput(active);
668
669 return dentry;
670 }
671
672 if (active)
673 return active;
674
675 return NULL; 524 return NULL;
676} 525}
677 526
@@ -683,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir,
683 struct autofs_info *ino = autofs4_dentry_ino(dentry); 532 struct autofs_info *ino = autofs4_dentry_ino(dentry);
684 struct autofs_info *p_ino; 533 struct autofs_info *p_ino;
685 struct inode *inode; 534 struct inode *inode;
535 size_t size = strlen(symname);
686 char *cp; 536 char *cp;
687 537
688 DPRINTK("%s <- %.*s", symname, 538 DPRINTK("%s <- %.*s", symname,
@@ -691,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir,
691 if (!autofs4_oz_mode(sbi)) 541 if (!autofs4_oz_mode(sbi))
692 return -EACCES; 542 return -EACCES;
693 543
694 ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); 544 BUG_ON(!ino);
695 if (!ino) 545
696 return -ENOMEM; 546 autofs4_clean_ino(ino);
697 547
698 autofs4_del_active(dentry); 548 autofs4_del_active(dentry);
699 549
700 ino->size = strlen(symname); 550 cp = kmalloc(size + 1, GFP_KERNEL);
701 cp = kmalloc(ino->size + 1, GFP_KERNEL); 551 if (!cp)
702 if (!cp) {
703 if (!dentry->d_fsdata)
704 kfree(ino);
705 return -ENOMEM; 552 return -ENOMEM;
706 }
707 553
708 strcpy(cp, symname); 554 strcpy(cp, symname);
709 555
710 inode = autofs4_get_inode(dir->i_sb, ino); 556 inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
711 if (!inode) { 557 if (!inode) {
712 kfree(cp); 558 kfree(cp);
713 if (!dentry->d_fsdata) 559 if (!dentry->d_fsdata)
714 kfree(ino); 560 kfree(ino);
715 return -ENOMEM; 561 return -ENOMEM;
716 } 562 }
563 inode->i_private = cp;
564 inode->i_size = size;
717 d_add(dentry, inode); 565 d_add(dentry, inode);
718 566
719 if (dir == dir->i_sb->s_root->d_inode) 567 dget(dentry);
720 d_set_d_op(dentry, &autofs4_root_dentry_operations);
721 else
722 d_set_d_op(dentry, &autofs4_dentry_operations);
723
724 dentry->d_fsdata = ino;
725 ino->dentry = dget(dentry);
726 atomic_inc(&ino->count); 568 atomic_inc(&ino->count);
727 p_ino = autofs4_dentry_ino(dentry->d_parent); 569 p_ino = autofs4_dentry_ino(dentry->d_parent);
728 if (p_ino && dentry->d_parent != dentry) 570 if (p_ino && dentry->d_parent != dentry)
729 atomic_inc(&p_ino->count); 571 atomic_inc(&p_ino->count);
730 ino->inode = inode;
731 572
732 ino->u.symlink = cp;
733 dir->i_mtime = CURRENT_TIME; 573 dir->i_mtime = CURRENT_TIME;
734 574
735 return 0; 575 return 0;
@@ -782,6 +622,58 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
782 return 0; 622 return 0;
783} 623}
784 624
625/*
626 * Version 4 of autofs provides a pseudo direct mount implementation
627 * that relies on directories at the leaves of a directory tree under
628 * an indirect mount to trigger mounts. To allow for this we need to
629 * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
630 * of the directory tree. There is no need to clear the automount flag
631 * following a mount or restore it after an expire because these mounts
 632 * are always covered. However, it is necessary to ensure that these
633 * flags are clear on non-empty directories to avoid unnecessary calls
634 * during path walks.
635 */
636static void autofs_set_leaf_automount_flags(struct dentry *dentry)
637{
638 struct dentry *parent;
639
 640 /* root and dentries in the root are already handled */
641 if (IS_ROOT(dentry->d_parent))
642 return;
643
644 managed_dentry_set_managed(dentry);
645
646 parent = dentry->d_parent;
 647 /* only consider parents below dentries in the root */
648 if (IS_ROOT(parent->d_parent))
649 return;
650 managed_dentry_clear_managed(parent);
651 return;
652}
653
654static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
655{
656 struct list_head *d_child;
657 struct dentry *parent;
658
 659 /* flags for dentries in the root are handled elsewhere */
660 if (IS_ROOT(dentry->d_parent))
661 return;
662
663 managed_dentry_clear_managed(dentry);
664
665 parent = dentry->d_parent;
 666 /* only consider parents below dentries in the root */
667 if (IS_ROOT(parent->d_parent))
668 return;
669 d_child = &dentry->d_u.d_child;
670 /* Set parent managed if it's becoming empty */
671 if (d_child->next == &parent->d_subdirs &&
672 d_child->prev == &parent->d_subdirs)
673 managed_dentry_set_managed(parent);
674 return;
675}
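The "parent is becoming empty" test above checks that the dentry is the only entry on the parent's d_subdirs list, i.e. both neighbours of its d_child node are the list head. A self-contained sketch of that circular-list test:

    #include <stdbool.h>
    #include <stdio.h>

    /* Minimal circular doubly-linked list, in the style of list_head. */
    struct list_head { struct list_head *next, *prev; };

    static void list_init(struct list_head *h) { h->next = h->prev = h; }

    static void list_add_tail(struct list_head *n, struct list_head *h)
    {
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
    }

    /* True when child is the only entry on head: the "parent is
     * becoming empty" test used above. */
    static bool list_is_only_entry(struct list_head *child,
                                   struct list_head *head)
    {
        return child->next == head && child->prev == head;
    }

    int main(void)
    {
        struct list_head subdirs, a, b;

        list_init(&subdirs);
        list_add_tail(&a, &subdirs);
        printf("only child: %d\n", list_is_only_entry(&a, &subdirs)); /* 1 */
        list_add_tail(&b, &subdirs);
        printf("only child: %d\n", list_is_only_entry(&a, &subdirs)); /* 0 */
        return 0;
    }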
676
785static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) 677static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
786{ 678{
787 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 679 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
@@ -809,6 +701,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
809 spin_unlock(&dentry->d_lock); 701 spin_unlock(&dentry->d_lock);
810 spin_unlock(&autofs4_lock); 702 spin_unlock(&autofs4_lock);
811 703
704 if (sbi->version < 5)
705 autofs_clear_leaf_automount_flags(dentry);
706
812 if (atomic_dec_and_test(&ino->count)) { 707 if (atomic_dec_and_test(&ino->count)) {
813 p_ino = autofs4_dentry_ino(dentry->d_parent); 708 p_ino = autofs4_dentry_ino(dentry->d_parent);
814 if (p_ino && dentry->d_parent != dentry) 709 if (p_ino && dentry->d_parent != dentry)
@@ -837,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
837 DPRINTK("dentry %p, creating %.*s", 732 DPRINTK("dentry %p, creating %.*s",
838 dentry, dentry->d_name.len, dentry->d_name.name); 733 dentry, dentry->d_name.len, dentry->d_name.name);
839 734
840 ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); 735 BUG_ON(!ino);
841 if (!ino) 736
842 return -ENOMEM; 737 autofs4_clean_ino(ino);
843 738
844 autofs4_del_active(dentry); 739 autofs4_del_active(dentry);
845 740
846 inode = autofs4_get_inode(dir->i_sb, ino); 741 inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
847 if (!inode) { 742 if (!inode)
848 if (!dentry->d_fsdata)
849 kfree(ino);
850 return -ENOMEM; 743 return -ENOMEM;
851 }
852 d_add(dentry, inode); 744 d_add(dentry, inode);
853 745
854 if (dir == dir->i_sb->s_root->d_inode) 746 if (sbi->version < 5)
855 d_set_d_op(dentry, &autofs4_root_dentry_operations); 747 autofs_set_leaf_automount_flags(dentry);
856 else
857 d_set_d_op(dentry, &autofs4_dentry_operations);
858 748
859 dentry->d_fsdata = ino; 749 dget(dentry);
860 ino->dentry = dget(dentry);
861 atomic_inc(&ino->count); 750 atomic_inc(&ino->count);
862 p_ino = autofs4_dentry_ino(dentry->d_parent); 751 p_ino = autofs4_dentry_ino(dentry->d_parent);
863 if (p_ino && dentry->d_parent != dentry) 752 if (p_ino && dentry->d_parent != dentry)
864 atomic_inc(&p_ino->count); 753 atomic_inc(&p_ino->count);
865 ino->inode = inode;
866 inc_nlink(dir); 754 inc_nlink(dir);
867 dir->i_mtime = CURRENT_TIME; 755 dir->i_mtime = CURRENT_TIME;
868 756
@@ -944,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
944int is_autofs4_dentry(struct dentry *dentry) 832int is_autofs4_dentry(struct dentry *dentry)
945{ 833{
946 return dentry && dentry->d_inode && 834 return dentry && dentry->d_inode &&
947 (dentry->d_op == &autofs4_root_dentry_operations || 835 dentry->d_op == &autofs4_dentry_operations &&
948 dentry->d_op == &autofs4_dentry_operations) &&
949 dentry->d_fsdata != NULL; 836 dentry->d_fsdata != NULL;
950} 837}
951 838
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index b4ea82934d2e..f27c094a1919 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,8 +14,7 @@
14 14
15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) 15static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
16{ 16{
17 struct autofs_info *ino = autofs4_dentry_ino(dentry); 17 nd_set_link(nd, dentry->d_inode->i_private);
18 nd_set_link(nd, (char *)ino->u.symlink);
19 return NULL; 18 return NULL;
20} 19}
21 20
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index c5f8459c905e..56010056b2e6 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -309,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait,
309 * completed while we waited on the mutex ... 309 * completed while we waited on the mutex ...
310 */ 310 */
311 if (notify == NFY_MOUNT) { 311 if (notify == NFY_MOUNT) {
312 struct dentry *new = NULL;
313 int valid = 1;
314
312 /* 315 /*
313 * If the dentry was successfully mounted while we slept 316 * If the dentry was successfully mounted while we slept
314 * on the wait queue mutex we can return success. If it 317 * on the wait queue mutex we can return success. If it
@@ -316,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait,
 316 * a multi-mount with no mount at its base) we can 319
317 * continue on and create a new request. 320 * continue on and create a new request.
318 */ 321 */
322 if (!IS_ROOT(dentry)) {
323 if (dentry->d_inode && d_unhashed(dentry)) {
324 struct dentry *parent = dentry->d_parent;
325 new = d_lookup(parent, &dentry->d_name);
326 if (new)
327 dentry = new;
328 }
329 }
319 if (have_submounts(dentry)) 330 if (have_submounts(dentry))
320 return 0; 331 valid = 0;
332
333 if (new)
334 dput(new);
335 return valid;
321 } 336 }
322 337
323 return 1; 338 return 1;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe3f59c14a02..333a7bb4cb9c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -432,6 +432,9 @@ static void init_once(void *foo)
432 mutex_init(&bdev->bd_mutex); 432 mutex_init(&bdev->bd_mutex);
433 INIT_LIST_HEAD(&bdev->bd_inodes); 433 INIT_LIST_HEAD(&bdev->bd_inodes);
434 INIT_LIST_HEAD(&bdev->bd_list); 434 INIT_LIST_HEAD(&bdev->bd_list);
435#ifdef CONFIG_SYSFS
436 INIT_LIST_HEAD(&bdev->bd_holder_disks);
437#endif
435 inode_init_once(&ei->vfs_inode); 438 inode_init_once(&ei->vfs_inode);
436 /* Initialize mutex for freeze. */ 439 /* Initialize mutex for freeze. */
437 mutex_init(&bdev->bd_fsfreeze_mutex); 440 mutex_init(&bdev->bd_fsfreeze_mutex);
@@ -779,6 +782,23 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
779} 782}
780 783
781#ifdef CONFIG_SYSFS 784#ifdef CONFIG_SYSFS
785struct bd_holder_disk {
786 struct list_head list;
787 struct gendisk *disk;
788 int refcnt;
789};
790
791static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
792 struct gendisk *disk)
793{
794 struct bd_holder_disk *holder;
795
796 list_for_each_entry(holder, &bdev->bd_holder_disks, list)
797 if (holder->disk == disk)
798 return holder;
799 return NULL;
800}
801
782static int add_symlink(struct kobject *from, struct kobject *to) 802static int add_symlink(struct kobject *from, struct kobject *to)
783{ 803{
784 return sysfs_create_link(from, to, kobject_name(to)); 804 return sysfs_create_link(from, to, kobject_name(to));
@@ -794,6 +814,8 @@ static void del_symlink(struct kobject *from, struct kobject *to)
794 * @bdev: the claimed slave bdev 814 * @bdev: the claimed slave bdev
795 * @disk: the holding disk 815 * @disk: the holding disk
796 * 816 *
817 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
818 *
 797 * This function creates the following sysfs symlinks. 819
798 * 820 *
799 * - from "slaves" directory of the holder @disk to the claimed @bdev 821 * - from "slaves" directory of the holder @disk to the claimed @bdev
@@ -817,47 +839,83 @@ static void del_symlink(struct kobject *from, struct kobject *to)
817 */ 839 */
818int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) 840int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
819{ 841{
842 struct bd_holder_disk *holder;
820 int ret = 0; 843 int ret = 0;
821 844
822 mutex_lock(&bdev->bd_mutex); 845 mutex_lock(&bdev->bd_mutex);
823 846
824 WARN_ON_ONCE(!bdev->bd_holder || bdev->bd_holder_disk); 847 WARN_ON_ONCE(!bdev->bd_holder);
825 848
826 /* FIXME: remove the following once add_disk() handles errors */ 849 /* FIXME: remove the following once add_disk() handles errors */
827 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) 850 if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
828 goto out_unlock; 851 goto out_unlock;
829 852
830 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 853 holder = bd_find_holder_disk(bdev, disk);
831 if (ret) 854 if (holder) {
855 holder->refcnt++;
832 goto out_unlock; 856 goto out_unlock;
857 }
833 858
834 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 859 holder = kzalloc(sizeof(*holder), GFP_KERNEL);
835 if (ret) { 860 if (!holder) {
836 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 861 ret = -ENOMEM;
837 goto out_unlock; 862 goto out_unlock;
838 } 863 }
839 864
840 bdev->bd_holder_disk = disk; 865 INIT_LIST_HEAD(&holder->list);
866 holder->disk = disk;
867 holder->refcnt = 1;
868
869 ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
870 if (ret)
871 goto out_free;
872
873 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
874 if (ret)
875 goto out_del;
876
877 list_add(&holder->list, &bdev->bd_holder_disks);
878 goto out_unlock;
879
880out_del:
881 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
882out_free:
883 kfree(holder);
841out_unlock: 884out_unlock:
842 mutex_unlock(&bdev->bd_mutex); 885 mutex_unlock(&bdev->bd_mutex);
843 return ret; 886 return ret;
844} 887}
845EXPORT_SYMBOL_GPL(bd_link_disk_holder); 888EXPORT_SYMBOL_GPL(bd_link_disk_holder);
846 889
847static void bd_unlink_disk_holder(struct block_device *bdev) 890/**
891 * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
 892 * @bdev: the claimed slave bdev
893 * @disk: the holding disk
894 *
895 * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
896 *
897 * CONTEXT:
898 * Might sleep.
899 */
900void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
848{ 901{
849 struct gendisk *disk = bdev->bd_holder_disk; 902 struct bd_holder_disk *holder;
850 903
851 bdev->bd_holder_disk = NULL; 904 mutex_lock(&bdev->bd_mutex);
852 if (!disk)
853 return;
854 905
855 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 906 holder = bd_find_holder_disk(bdev, disk);
856 del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 907
908 if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
909 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
910 del_symlink(bdev->bd_part->holder_dir,
911 &disk_to_dev(disk)->kobj);
912 list_del_init(&holder->list);
913 kfree(holder);
914 }
915
916 mutex_unlock(&bdev->bd_mutex);
857} 917}
858#else 918EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
859static inline void bd_unlink_disk_holder(struct block_device *bdev)
860{ }
861#endif 919#endif
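The change above turns the single bd_holder_disk pointer into a small refcounted registry, so repeated claims of the same bdev/disk pair share one set of sysfs symlinks and only the last bd_unlink_disk_holder() tears them down. The find-or-create-with-refcount shape, in a self-contained userspace form (names are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Models bd_holder_disk: one entry per holding "disk", refcounted. */
    struct holder {
        struct holder *next;
        const char *disk;   /* stands in for struct gendisk * */
        int refcnt;
    };

    static struct holder *holders;

    static struct holder *holder_find(const char *disk)
    {
        for (struct holder *h = holders; h; h = h->next)
            if (strcmp(h->disk, disk) == 0)
                return h;
        return NULL;
    }

    /* Returns 0 on success, -1 on allocation failure. */
    static int holder_link(const char *disk)
    {
        struct holder *h = holder_find(disk);

        if (h) {            /* already linked: just take another reference */
            h->refcnt++;
            return 0;
        }
        h = calloc(1, sizeof(*h));
        if (!h)
            return -1;
        h->disk = disk;
        h->refcnt = 1;
        h->next = holders;
        holders = h;
        return 0;
    }

    static void holder_unlink(const char *disk)
    {
        for (struct holder **pp = &holders; *pp; pp = &(*pp)->next) {
            struct holder *h = *pp;

            if (strcmp(h->disk, disk) == 0) {
                if (--h->refcnt == 0) {   /* last user: tear down */
                    *pp = h->next;
                    free(h);
                }
                return;
            }
        }
    }

    int main(void)
    {
        holder_link("md0");
        holder_link("md0");     /* refcnt == 2 */
        holder_unlink("md0");   /* still linked */
        holder_unlink("md0");   /* freed now */
        printf("holders empty: %d\n", holders == NULL);
        return 0;
    }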
862 920
863/** 921/**
@@ -1380,7 +1438,6 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
 1380 * unblock event polling if it was a write holder. 1438
1381 */ 1439 */
1382 if (bdev_free) { 1440 if (bdev_free) {
1383 bd_unlink_disk_holder(bdev);
1384 if (bdev->bd_write_holder) { 1441 if (bdev->bd_write_holder) {
1385 disk_unblock_events(bdev->bd_disk); 1442 disk_unblock_events(bdev->bd_disk);
1386 bdev->bd_write_holder = false; 1443 bdev->bd_write_holder = false;
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
4 select LIBCRC32C 4 select LIBCRC32C
5 select ZLIB_INFLATE 5 select ZLIB_INFLATE
6 select ZLIB_DEFLATE 6 select ZLIB_DEFLATE
7 select LZO_COMPRESS
8 select LZO_DECOMPRESS
7 help 9 help
8 Btrfs is a new filesystem with extents, writable snapshotting, 10 Btrfs is a new filesystem with extents, writable snapshotting,
9 support for multiple devices and many more features. 11 support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..31610ea73aec 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 transaction.o inode.o file.o tree-defrag.o \ 6 transaction.o inode.o file.o tree-defrag.o \
7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o acl.o free-space-cache.o zlib.o \ 9 export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o 10 compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ae2c8cac9d5..9c949348510b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
37 char *value = NULL; 37 char *value = NULL;
38 struct posix_acl *acl; 38 struct posix_acl *acl;
39 39
40 if (!IS_POSIXACL(inode))
41 return NULL;
42
40 acl = get_cached_acl(inode, type); 43 acl = get_cached_acl(inode, type);
41 if (acl != ACL_NOT_CACHED) 44 if (acl != ACL_NOT_CACHED)
42 return acl; 45 return acl;
@@ -60,8 +63,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
60 size = __btrfs_getxattr(inode, name, value, size); 63 size = __btrfs_getxattr(inode, name, value, size);
61 if (size > 0) { 64 if (size > 0) {
62 acl = posix_acl_from_xattr(value, size); 65 acl = posix_acl_from_xattr(value, size);
63 if (IS_ERR(acl)) 66 if (IS_ERR(acl)) {
67 kfree(value);
64 return acl; 68 return acl;
69 }
65 set_cached_acl(inode, type, acl); 70 set_cached_acl(inode, type, acl);
66 } 71 }
67 kfree(value); 72 kfree(value);
@@ -82,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
82 struct posix_acl *acl; 87 struct posix_acl *acl;
83 int ret = 0; 88 int ret = 0;
84 89
90 if (!IS_POSIXACL(dentry->d_inode))
91 return -EOPNOTSUPP;
92
85 acl = btrfs_get_acl(dentry->d_inode, type); 93 acl = btrfs_get_acl(dentry->d_inode, type);
86 94
87 if (IS_ERR(acl)) 95 if (IS_ERR(acl))
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca0..ccc991c542df 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,7 +157,7 @@ struct btrfs_inode {
157 /* 157 /*
158 * always compress this one file 158 * always compress this one file
159 */ 159 */
160 unsigned force_compress:1; 160 unsigned force_compress:4;
161 161
162 struct inode vfs_inode; 162 struct inode vfs_inode;
163}; 163};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b50bc4bd5c56..4d2110eafe29 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
62 /* number of bytes on disk */ 62 /* number of bytes on disk */
63 unsigned long compressed_len; 63 unsigned long compressed_len;
64 64
65 /* the compression algorithm for this bio */
66 int compress_type;
67
65 /* number of compressed pages in the array */ 68 /* number of compressed pages in the array */
66 unsigned long nr_pages; 69 unsigned long nr_pages;
67 70
@@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
173 /* ok, we're the last bio for this extent, lets start 176 /* ok, we're the last bio for this extent, lets start
174 * the decompression. 177 * the decompression.
175 */ 178 */
176 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, 179 ret = btrfs_decompress_biovec(cb->compress_type,
177 cb->start, 180 cb->compressed_pages,
178 cb->orig_bio->bi_io_vec, 181 cb->start,
179 cb->orig_bio->bi_vcnt, 182 cb->orig_bio->bi_io_vec,
180 cb->compressed_len); 183 cb->orig_bio->bi_vcnt,
184 cb->compressed_len);
181csum_failed: 185csum_failed:
182 if (ret) 186 if (ret)
183 cb->errors = 1; 187 cb->errors = 1;
@@ -558,7 +562,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
558 u64 em_len; 562 u64 em_len;
559 u64 em_start; 563 u64 em_start;
560 struct extent_map *em; 564 struct extent_map *em;
561 int ret; 565 int ret = -ENOMEM;
562 u32 *sums; 566 u32 *sums;
563 567
564 tree = &BTRFS_I(inode)->io_tree; 568 tree = &BTRFS_I(inode)->io_tree;
@@ -573,6 +577,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
573 577
574 compressed_len = em->block_len; 578 compressed_len = em->block_len;
575 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); 579 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
580 if (!cb)
581 goto out;
582
576 atomic_set(&cb->pending_bios, 0); 583 atomic_set(&cb->pending_bios, 0);
577 cb->errors = 0; 584 cb->errors = 0;
578 cb->inode = inode; 585 cb->inode = inode;
@@ -588,17 +595,23 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
588 595
589 cb->len = uncompressed_len; 596 cb->len = uncompressed_len;
590 cb->compressed_len = compressed_len; 597 cb->compressed_len = compressed_len;
598 cb->compress_type = extent_compress_type(bio_flags);
591 cb->orig_bio = bio; 599 cb->orig_bio = bio;
592 600
593 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / 601 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
594 PAGE_CACHE_SIZE; 602 PAGE_CACHE_SIZE;
595 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, 603 cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
596 GFP_NOFS); 604 GFP_NOFS);
605 if (!cb->compressed_pages)
606 goto fail1;
607
597 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 608 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
598 609
599 for (page_index = 0; page_index < nr_pages; page_index++) { 610 for (page_index = 0; page_index < nr_pages; page_index++) {
600 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | 611 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
601 __GFP_HIGHMEM); 612 __GFP_HIGHMEM);
613 if (!cb->compressed_pages[page_index])
614 goto fail2;
602 } 615 }
603 cb->nr_pages = nr_pages; 616 cb->nr_pages = nr_pages;
604 617
@@ -609,6 +622,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
609 cb->len = uncompressed_len; 622 cb->len = uncompressed_len;
610 623
611 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); 624 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
625 if (!comp_bio)
626 goto fail2;
612 comp_bio->bi_private = cb; 627 comp_bio->bi_private = cb;
613 comp_bio->bi_end_io = end_compressed_bio_read; 628 comp_bio->bi_end_io = end_compressed_bio_read;
614 atomic_inc(&cb->pending_bios); 629 atomic_inc(&cb->pending_bios);
@@ -676,4 +691,329 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
676 691
677 bio_put(comp_bio); 692 bio_put(comp_bio);
678 return 0; 693 return 0;
694
695fail2:
696 for (page_index = 0; page_index < nr_pages; page_index++)
697 free_page((unsigned long)cb->compressed_pages[page_index]);
698
699 kfree(cb->compressed_pages);
700fail1:
701 kfree(cb);
702out:
703 free_extent_map(em);
704 return ret;
705}
706
707static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
708static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
709static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
710static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
711static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
712
713struct btrfs_compress_op *btrfs_compress_op[] = {
714 &btrfs_zlib_compress,
715 &btrfs_lzo_compress,
716};
717
718int __init btrfs_init_compress(void)
719{
720 int i;
721
722 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
723 INIT_LIST_HEAD(&comp_idle_workspace[i]);
724 spin_lock_init(&comp_workspace_lock[i]);
725 atomic_set(&comp_alloc_workspace[i], 0);
726 init_waitqueue_head(&comp_workspace_wait[i]);
727 }
728 return 0;
729}
730
731/*
732 * this finds an available workspace or allocates a new one
733 * ERR_PTR is returned if things go bad.
734 */
735static struct list_head *find_workspace(int type)
736{
737 struct list_head *workspace;
738 int cpus = num_online_cpus();
739 int idx = type - 1;
740
741 struct list_head *idle_workspace = &comp_idle_workspace[idx];
742 spinlock_t *workspace_lock = &comp_workspace_lock[idx];
743 atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
744 wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
745 int *num_workspace = &comp_num_workspace[idx];
746again:
747 spin_lock(workspace_lock);
748 if (!list_empty(idle_workspace)) {
749 workspace = idle_workspace->next;
750 list_del(workspace);
751 (*num_workspace)--;
752 spin_unlock(workspace_lock);
753 return workspace;
754
755 }
756 if (atomic_read(alloc_workspace) > cpus) {
757 DEFINE_WAIT(wait);
758
759 spin_unlock(workspace_lock);
760 prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
761 if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
762 schedule();
763 finish_wait(workspace_wait, &wait);
764 goto again;
765 }
766 atomic_inc(alloc_workspace);
767 spin_unlock(workspace_lock);
768
769 workspace = btrfs_compress_op[idx]->alloc_workspace();
770 if (IS_ERR(workspace)) {
771 atomic_dec(alloc_workspace);
772 wake_up(workspace_wait);
773 }
774 return workspace;
775}
776
777/*
778 * put a workspace struct back on the list or free it if we have enough
779 * idle ones sitting around
780 */
781static void free_workspace(int type, struct list_head *workspace)
782{
783 int idx = type - 1;
784 struct list_head *idle_workspace = &comp_idle_workspace[idx];
785 spinlock_t *workspace_lock = &comp_workspace_lock[idx];
786 atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
787 wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
788 int *num_workspace = &comp_num_workspace[idx];
789
790 spin_lock(workspace_lock);
791 if (*num_workspace < num_online_cpus()) {
792 list_add_tail(workspace, idle_workspace);
793 (*num_workspace)++;
794 spin_unlock(workspace_lock);
795 goto wake;
796 }
797 spin_unlock(workspace_lock);
798
799 btrfs_compress_op[idx]->free_workspace(workspace);
800 atomic_dec(alloc_workspace);
801wake:
802 if (waitqueue_active(workspace_wait))
803 wake_up(workspace_wait);
804}
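find_workspace()/free_workspace() implement a bounded per-type pool: reuse an idle workspace if one exists, allocate a new one while under the CPU-count limit, otherwise sleep until someone returns one. A compact pthread model of that policy (limits and names are illustrative):

    #include <pthread.h>
    #include <stdlib.h>

    struct ws { struct ws *next; /* compression scratch would live here */ };

    static struct ws *idle_list;    /* models comp_idle_workspace */
    static int num_idle;            /* models comp_num_workspace */
    static int num_alloc;           /* models comp_alloc_workspace */
    #define MAX_ALLOC 4             /* stands in for num_online_cpus() */
    static pthread_mutex_t ws_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t ws_freed = PTHREAD_COND_INITIALIZER;

    static struct ws *find_ws(void)
    {
        struct ws *w;

        pthread_mutex_lock(&ws_lock);
        for (;;) {
            if (idle_list) {                /* reuse an idle one */
                w = idle_list;
                idle_list = w->next;
                num_idle--;
                pthread_mutex_unlock(&ws_lock);
                return w;
            }
            if (num_alloc < MAX_ALLOC) {    /* allowed to grow the pool */
                num_alloc++;
                pthread_mutex_unlock(&ws_lock);
                w = calloc(1, sizeof(*w));
                if (!w) {
                    pthread_mutex_lock(&ws_lock);
                    num_alloc--;
                    pthread_cond_signal(&ws_freed);
                    pthread_mutex_unlock(&ws_lock);
                }
                return w;       /* NULL on allocation failure */
            }
            /* over the limit: wait for a workspace to come back */
            pthread_cond_wait(&ws_freed, &ws_lock);
        }
    }

    static void free_ws(struct ws *w)
    {
        pthread_mutex_lock(&ws_lock);
        if (num_idle < MAX_ALLOC) {         /* keep it cached */
            w->next = idle_list;
            idle_list = w;
            num_idle++;
        } else {                            /* enough cached: really free */
            free(w);
            num_alloc--;
        }
        pthread_cond_signal(&ws_freed);
        pthread_mutex_unlock(&ws_lock);
    }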
805
806/*
807 * cleanup function for module exit
808 */
809static void free_workspaces(void)
810{
811 struct list_head *workspace;
812 int i;
813
814 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
815 while (!list_empty(&comp_idle_workspace[i])) {
816 workspace = comp_idle_workspace[i].next;
817 list_del(workspace);
818 btrfs_compress_op[i]->free_workspace(workspace);
819 atomic_dec(&comp_alloc_workspace[i]);
820 }
821 }
822}
823
824/*
825 * given an address space and start/len, compress the bytes.
826 *
827 * pages are allocated to hold the compressed result and stored
828 * in 'pages'
829 *
830 * out_pages is used to return the number of pages allocated. There
831 * may be pages allocated even if we return an error
832 *
833 * total_in is used to return the number of bytes actually read. It
834 * may be smaller then len if we had to exit early because we
835 * ran out of room in the pages array or because we cross the
836 * max_out threshold.
837 *
838 * total_out is used to return the total number of compressed bytes
839 *
840 * max_out tells us the max number of bytes that we're allowed to
841 * stuff into pages
842 */
843int btrfs_compress_pages(int type, struct address_space *mapping,
844 u64 start, unsigned long len,
845 struct page **pages,
846 unsigned long nr_dest_pages,
847 unsigned long *out_pages,
848 unsigned long *total_in,
849 unsigned long *total_out,
850 unsigned long max_out)
851{
852 struct list_head *workspace;
853 int ret;
854
855 workspace = find_workspace(type);
856 if (IS_ERR(workspace))
857 return -1;
858
859 ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
860 start, len, pages,
861 nr_dest_pages, out_pages,
862 total_in, total_out,
863 max_out);
864 free_workspace(type, workspace);
865 return ret;
866}
867
868/*
869 * pages_in is an array of pages with compressed data.
870 *
871 * disk_start is the starting logical offset of this array in the file
872 *
873 * bvec is a bio_vec of pages from the file that we want to decompress into
874 *
875 * vcnt is the count of pages in the biovec
876 *
877 * srclen is the number of bytes in pages_in
878 *
879 * The basic idea is that we have a bio that was created by readpages.
880 * The pages in the bio are for the uncompressed data, and they may not
881 * be contiguous. They all correspond to the range of bytes covered by
882 * the compressed extent.
883 */
884int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
885 struct bio_vec *bvec, int vcnt, size_t srclen)
886{
887 struct list_head *workspace;
888 int ret;
889
890 workspace = find_workspace(type);
891 if (IS_ERR(workspace))
892 return -ENOMEM;
893
894 ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
895 disk_start,
896 bvec, vcnt, srclen);
897 free_workspace(type, workspace);
898 return ret;
899}
900
901/*
902 * a less complex decompression routine. Our compressed data fits in a
903 * single page, and we want to read a single page out of it.
904 * start_byte tells us the offset into the compressed data we're interested in
905 */
906int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
907 unsigned long start_byte, size_t srclen, size_t destlen)
908{
909 struct list_head *workspace;
910 int ret;
911
912 workspace = find_workspace(type);
913 if (IS_ERR(workspace))
914 return -ENOMEM;
915
916 ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
917 dest_page, start_byte,
918 srclen, destlen);
919
920 free_workspace(type, workspace);
921 return ret;
922}
923
924void btrfs_exit_compress(void)
925{
926 free_workspaces();
927}
928
929/*
930 * Copy uncompressed data from working buffer to pages.
931 *
 932 * buf_start is the byte offset into the uncompressed stream of the start of our workspace buffer.
933 *
 934 * total_out is the offset of the last byte in the buffer
935 */
936int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
937 unsigned long total_out, u64 disk_start,
938 struct bio_vec *bvec, int vcnt,
939 unsigned long *page_index,
940 unsigned long *pg_offset)
941{
942 unsigned long buf_offset;
943 unsigned long current_buf_start;
944 unsigned long start_byte;
945 unsigned long working_bytes = total_out - buf_start;
946 unsigned long bytes;
947 char *kaddr;
948 struct page *page_out = bvec[*page_index].bv_page;
949
950 /*
951 * start byte is the first byte of the page we're currently
952 * copying into relative to the start of the compressed data.
953 */
954 start_byte = page_offset(page_out) - disk_start;
955
956 /* we haven't yet hit data corresponding to this page */
957 if (total_out <= start_byte)
958 return 1;
959
960 /*
961 * the start of the data we care about is offset into
962 * the middle of our working buffer
963 */
964 if (total_out > start_byte && buf_start < start_byte) {
965 buf_offset = start_byte - buf_start;
966 working_bytes -= buf_offset;
967 } else {
968 buf_offset = 0;
969 }
970 current_buf_start = buf_start;
971
972 /* copy bytes from the working buffer into the pages */
973 while (working_bytes > 0) {
974 bytes = min(PAGE_CACHE_SIZE - *pg_offset,
975 PAGE_CACHE_SIZE - buf_offset);
976 bytes = min(bytes, working_bytes);
977 kaddr = kmap_atomic(page_out, KM_USER0);
978 memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
979 kunmap_atomic(kaddr, KM_USER0);
980 flush_dcache_page(page_out);
981
982 *pg_offset += bytes;
983 buf_offset += bytes;
984 working_bytes -= bytes;
985 current_buf_start += bytes;
986
987 /* check if we need to pick another page */
988 if (*pg_offset == PAGE_CACHE_SIZE) {
989 (*page_index)++;
990 if (*page_index >= vcnt)
991 return 0;
992
993 page_out = bvec[*page_index].bv_page;
994 *pg_offset = 0;
995 start_byte = page_offset(page_out) - disk_start;
996
997 /*
998 * make sure our new page is covered by this
999 * working buffer
1000 */
1001 if (total_out <= start_byte)
1002 return 1;
1003
1004 /*
1005 * the next page in the biovec might not be adjacent
1006 * to the last page, but it might still be found
1007 * inside this working buffer. bump our offset pointer
1008 */
1009 if (total_out > start_byte &&
1010 current_buf_start < start_byte) {
1011 buf_offset = start_byte - buf_start;
1012 working_bytes = total_out - start_byte;
1013 current_buf_start = buf_start + buf_offset;
1014 }
1015 }
1016 }
1017
1018 return 1;
679} 1019}
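btrfs_decompress_buf2page() is, at its core, a chunked copy of a linear buffer into an array of fixed-size pages, advancing a page index and an intra-page offset together. Stripped of the bio bookkeeping, the loop looks roughly like this (a sketch, not the kernel function):

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096

    /* Chunked copy of a linear buffer into fixed-size "pages". */
    static void copy_to_pages(const char *buf, size_t len,
                              char pages[][PAGE_SIZE], size_t npages)
    {
        size_t page = 0, pg_off = 0, off = 0;

        while (off < len && page < npages) {
            size_t bytes = PAGE_SIZE - pg_off;

            if (bytes > len - off)
                bytes = len - off;
            memcpy(&pages[page][pg_off], buf + off, bytes);
            off += bytes;
            pg_off += bytes;
            if (pg_off == PAGE_SIZE) {      /* move to the next page */
                page++;
                pg_off = 0;
            }
        }
    }

    int main(void)
    {
        static char src[PAGE_SIZE + 100], pages[2][PAGE_SIZE];

        memset(src, 'x', sizeof(src));
        copy_to_pages(src, sizeof(src), pages, 2);
        printf("spilled byte: %c\n", pages[1][0]);      /* 'x' */
        return 0;
    }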
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..51000174b9d7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
19#ifndef __BTRFS_COMPRESSION_ 19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_ 20#define __BTRFS_COMPRESSION_
21 21
22int btrfs_zlib_decompress(unsigned char *data_in, 22int btrfs_init_compress(void);
23 struct page *dest_page, 23void btrfs_exit_compress(void);
24 unsigned long start_byte, 24
25 size_t srclen, size_t destlen); 25int btrfs_compress_pages(int type, struct address_space *mapping,
26int btrfs_zlib_compress_pages(struct address_space *mapping, 26 u64 start, unsigned long len,
27 u64 start, unsigned long len, 27 struct page **pages,
28 struct page **pages, 28 unsigned long nr_dest_pages,
29 unsigned long nr_dest_pages, 29 unsigned long *out_pages,
30 unsigned long *out_pages, 30 unsigned long *total_in,
31 unsigned long *total_in, 31 unsigned long *total_out,
32 unsigned long *total_out, 32 unsigned long max_out);
33 unsigned long max_out); 33int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
34int btrfs_zlib_decompress_biovec(struct page **pages_in, 34 struct bio_vec *bvec, int vcnt, size_t srclen);
35 u64 disk_start, 35int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
36 struct bio_vec *bvec, 36 unsigned long start_byte, size_t srclen, size_t destlen);
37 int vcnt, 37int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
38 size_t srclen); 38 unsigned long total_out, u64 disk_start,
39void btrfs_zlib_exit(void); 39 struct bio_vec *bvec, int vcnt,
40 unsigned long *page_index,
41 unsigned long *pg_offset);
42
40int btrfs_submit_compressed_write(struct inode *inode, u64 start, 43int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start, 44 unsigned long len, u64 disk_start,
42 unsigned long compressed_len, 45 unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
44 unsigned long nr_pages); 47 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 48int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags); 49 int mirror_num, unsigned long bio_flags);
50
51struct btrfs_compress_op {
52 struct list_head *(*alloc_workspace)(void);
53
54 void (*free_workspace)(struct list_head *workspace);
55
56 int (*compress_pages)(struct list_head *workspace,
57 struct address_space *mapping,
58 u64 start, unsigned long len,
59 struct page **pages,
60 unsigned long nr_dest_pages,
61 unsigned long *out_pages,
62 unsigned long *total_in,
63 unsigned long *total_out,
64 unsigned long max_out);
65
66 int (*decompress_biovec)(struct list_head *workspace,
67 struct page **pages_in,
68 u64 disk_start,
69 struct bio_vec *bvec,
70 int vcnt,
71 size_t srclen);
72
73 int (*decompress)(struct list_head *workspace,
74 unsigned char *data_in,
75 struct page *dest_page,
76 unsigned long start_byte,
77 size_t srclen, size_t destlen);
78};
79
80extern struct btrfs_compress_op btrfs_zlib_compress;
81extern struct btrfs_compress_op btrfs_lzo_compress;
82
47#endif 83#endif
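The new btrfs_compress_op struct is the usual C "vtable" pattern: one ops structure per algorithm, collected in an array indexed by compression type minus one, since type 0 is "none". A minimal standalone illustration of that dispatch (the function bodies here are stubs, not real compressors):

    #include <stdio.h>

    struct compress_op {
        const char *name;
        int (*compress)(const char *in, char *out);
    };

    static int zlib_stub(const char *in, char *out) { (void)in; (void)out; return 0; }
    static int lzo_stub(const char *in, char *out)  { (void)in; (void)out; return 0; }

    static struct compress_op zlib_op = { "zlib", zlib_stub };
    static struct compress_op lzo_op  = { "lzo",  lzo_stub };

    /* Type 0 is "none", so real types start at 1 and we index with
     * type - 1, just as btrfs_compress_op[] is indexed in compression.c. */
    static struct compress_op *ops[] = { &zlib_op, &lzo_op };

    static int do_compress(int type, const char *in, char *out)
    {
        struct compress_op *op = ops[type - 1];

        printf("dispatching to %s\n", op->name);
        return op->compress(in, out);
    }

    int main(void)
    {
        char out[16];

        do_compress(1, "data", out);    /* zlib */
        do_compress(2, "data", out);    /* lzo */
        return 0;
    }

Adding another algorithm then means writing one more ops structure and growing the array; the call sites never change.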
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ac171599258..b5baff0dccfe 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
105/* this also releases the path */ 105/* this also releases the path */
106void btrfs_free_path(struct btrfs_path *p) 106void btrfs_free_path(struct btrfs_path *p)
107{ 107{
108 if (!p)
109 return;
108 btrfs_release_path(NULL, p); 110 btrfs_release_path(NULL, p);
109 kmem_cache_free(btrfs_path_cachep, p); 111 kmem_cache_free(btrfs_path_cachep, p);
110} 112}
@@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2514 btrfs_assert_tree_locked(path->nodes[1]); 2516 btrfs_assert_tree_locked(path->nodes[1]);
2515 2517
2516 right = read_node_slot(root, upper, slot + 1); 2518 right = read_node_slot(root, upper, slot + 1);
2519 if (right == NULL)
2520 return 1;
2521
2517 btrfs_tree_lock(right); 2522 btrfs_tree_lock(right);
2518 btrfs_set_lock_blocking(right); 2523 btrfs_set_lock_blocking(right);
2519 2524
@@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2764 btrfs_assert_tree_locked(path->nodes[1]); 2769 btrfs_assert_tree_locked(path->nodes[1]);
2765 2770
2766 left = read_node_slot(root, path->nodes[1], slot - 1); 2771 left = read_node_slot(root, path->nodes[1], slot - 1);
2772 if (left == NULL)
2773 return 1;
2774
2767 btrfs_tree_lock(left); 2775 btrfs_tree_lock(left);
2768 btrfs_set_lock_blocking(left); 2776 btrfs_set_lock_blocking(left);
2769 2777
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a142d204b526..2c98b3af6052 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/kobject.h>
30#include <asm/kmap_types.h> 31#include <asm/kmap_types.h>
31#include "extent_io.h" 32#include "extent_io.h"
32#include "extent_map.h" 33#include "extent_map.h"
@@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
294#define BTRFS_FSID_SIZE 16 295#define BTRFS_FSID_SIZE 16
295#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) 296#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
296#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) 297#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
298
299/*
300 * File system states
301 */
302
303/* Errors detected */
304#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
305
297#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) 306#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
298#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) 307#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
299 308
@@ -398,13 +407,15 @@ struct btrfs_super_block {
398#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) 407#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
399#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) 408#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
400#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) 409#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
410#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
401 411
402#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 412#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
403#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 413#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
404#define BTRFS_FEATURE_INCOMPAT_SUPP \ 414#define BTRFS_FEATURE_INCOMPAT_SUPP \
405 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ 415 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
406 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 416 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
407 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 417 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
418 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
408 419
409/* 420/*
410 * A leaf is full of items. offset and size tell us where to find 421 * A leaf is full of items. offset and size tell us where to find
@@ -551,9 +562,11 @@ struct btrfs_timespec {
551} __attribute__ ((__packed__)); 562} __attribute__ ((__packed__));
552 563
553enum btrfs_compression_type { 564enum btrfs_compression_type {
554 BTRFS_COMPRESS_NONE = 0, 565 BTRFS_COMPRESS_NONE = 0,
555 BTRFS_COMPRESS_ZLIB = 1, 566 BTRFS_COMPRESS_ZLIB = 1,
556 BTRFS_COMPRESS_LAST = 2, 567 BTRFS_COMPRESS_LZO = 2,
568 BTRFS_COMPRESS_TYPES = 2,
569 BTRFS_COMPRESS_LAST = 3,
557}; 570};
558 571
559struct btrfs_inode_item { 572struct btrfs_inode_item {
@@ -597,6 +610,8 @@ struct btrfs_dir_item {
597 u8 type; 610 u8 type;
598} __attribute__ ((__packed__)); 611} __attribute__ ((__packed__));
599 612
613#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
614
600struct btrfs_root_item { 615struct btrfs_root_item {
601 struct btrfs_inode_item inode; 616 struct btrfs_inode_item inode;
602 __le64 generation; 617 __le64 generation;
@@ -895,7 +910,8 @@ struct btrfs_fs_info {
895 */ 910 */
896 u64 last_trans_log_full_commit; 911 u64 last_trans_log_full_commit;
897 u64 open_ioctl_trans; 912 u64 open_ioctl_trans;
898 unsigned long mount_opt; 913 unsigned long mount_opt:20;
914 unsigned long compress_type:4;
899 u64 max_inline; 915 u64 max_inline;
900 u64 alloc_start; 916 u64 alloc_start;
901 struct btrfs_transaction *running_transaction; 917 struct btrfs_transaction *running_transaction;
@@ -1050,6 +1066,9 @@ struct btrfs_fs_info {
1050 unsigned metadata_ratio; 1066 unsigned metadata_ratio;
1051 1067
1052 void *bdev_holder; 1068 void *bdev_holder;
1069
1070 /* filesystem state */
1071 u64 fs_state;
1053}; 1072};
1054 1073
1055/* 1074/*
@@ -1893,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1893BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, 1912BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1894 last_snapshot, 64); 1913 last_snapshot, 64);
1895 1914
1915static inline bool btrfs_root_readonly(struct btrfs_root *root)
1916{
1917 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1918}
1919
1896/* struct btrfs_super_block */ 1920/* struct btrfs_super_block */
1897 1921
1898BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 1922BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2145,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2145int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2169int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2146 struct btrfs_root *root, u64 group_start); 2170 struct btrfs_root *root, u64 group_start);
2147u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2171u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2172u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2148void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); 2173
2149void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2174void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2150int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 2175int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2188,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
2188int btrfs_set_block_group_rw(struct btrfs_root *root, 2213int btrfs_set_block_group_rw(struct btrfs_root *root,
2189 struct btrfs_block_group_cache *cache); 2214 struct btrfs_block_group_cache *cache);
2190void btrfs_put_block_group_cache(struct btrfs_fs_info *info); 2215void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
2216u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
2217int btrfs_error_unpin_extent_range(struct btrfs_root *root,
2218 u64 start, u64 end);
2219int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
2220 u64 num_bytes);
2221
2191/* ctree.c */ 2222/* ctree.c */
2192int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2223int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2193 int level, int *slot); 2224 int level, int *slot);
@@ -2541,6 +2572,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2541/* super.c */ 2572/* super.c */
2542int btrfs_parse_options(struct btrfs_root *root, char *options); 2573int btrfs_parse_options(struct btrfs_root *root, char *options);
2543int btrfs_sync_fs(struct super_block *sb, int wait); 2574int btrfs_sync_fs(struct super_block *sb, int wait);
2575void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
2576 unsigned int line, int errno);
2577
2578#define btrfs_std_error(fs_info, errno) \
2579do { \
2580 if ((errno)) \
2581 __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
2582} while (0)
2544 2583
2545/* acl.c */ 2584/* acl.c */
2546#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2585#ifdef CONFIG_BTRFS_FS_POSIX_ACL
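btrfs_std_error above is the classic statement-like macro: it captures the call site via __func__ and __LINE__ and stays safe inside an unbraced if/else thanks to the do { } while (0) wrapper. A self-contained version of the pattern (the reporting function and message are made up):

    #include <stdio.h>

    static void report_error_at(const char *func, unsigned int line, int err)
    {
        fprintf(stderr, "error %d in %s at line %u\n", err, func, line);
    }

    /* Expands to a single statement, so it nests safely in if/else. */
    #define report_error(err)                                       \
    do {                                                            \
        if ((err))                                                  \
            report_error_at(__func__, __LINE__, (err));             \
    } while (0)

    int main(void)
    {
        int ret = -5;

        if (ret)
            report_error(ret);
        else
            puts("ok");
        return 0;
    }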
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 51d2e4de34eb..fdce8799b98d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -44,6 +44,20 @@
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+					struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
 
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
@@ -353,6 +367,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	WARN_ON(len == 0);
 
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	if (eb == NULL) {
+		WARN_ON(1);
+		goto out;
+	}
 	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
					     btrfs_header_generation(eb));
 	BUG_ON(ret);
@@ -427,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	WARN_ON(len == 0);
 
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	if (eb == NULL) {
+		ret = -EIO;
+		goto out;
+	}
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
@@ -1145,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	}
 	btrfs_free_path(path);
 	if (ret) {
+		kfree(root);
 		if (ret > 0)
			ret = -ENOENT;
		return ERR_PTR(ret);
@@ -1527,6 +1550,7 @@ static int transaction_kthread(void *arg)
		spin_unlock(&root->fs_info->new_trans_lock);
 
		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(IS_ERR(trans));
		if (transid == trans->transid) {
			ret = btrfs_commit_transaction(trans, root);
			BUG_ON(ret);
@@ -1713,8 +1737,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
					  fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-	if (!bh)
+	if (!bh) {
+		err = -EINVAL;
		goto fail_iput;
+	}
 
 	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
 	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1727,6 +1753,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!btrfs_super_root(disk_super))
		goto fail_iput;
 
+	/* check FS state, whether FS is broken. */
+	fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
 	ret = btrfs_parse_options(tree_root, options);
 	if (ret) {
		err = ret;
@@ -1744,10 +1775,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	features = btrfs_super_incompat_flags(disk_super);
-	if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-		features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-		btrfs_set_super_incompat_flags(disk_super, features);
-	}
+	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+	if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+	btrfs_set_super_incompat_flags(disk_super, features);
 
 	features = btrfs_super_compat_ro_flags(disk_super) &
		~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1957,7 +1988,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
		btrfs_set_opt(fs_info->mount_opt, SSD);
 	}
 
-	if (btrfs_super_log_root(disk_super) != 0) {
+	/* do not make disk changes in broken FS */
+	if (btrfs_super_log_root(disk_super) != 0 &&
+	    !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
		u64 bytenr = btrfs_super_log_root(disk_super);
 
		if (fs_devices->rw_devices == 0) {
@@ -2421,10 +2454,14 @@ int btrfs_commit_super(struct btrfs_root *root)
 	up_write(&root->fs_info->cleanup_work_sem);
 
 	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	/* run commit again to drop the original snapshot */
 	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
@@ -2442,8 +2479,28 @@ int close_ctree(struct btrfs_root *root)
 	smp_mb();
 
 	btrfs_put_block_group_cache(fs_info);
+
+	/*
+	 * Here come 2 situations when btrfs is broken to flip readonly:
+	 *
+	 * 1. when btrfs flips readonly somewhere else before
+	 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
+	 * and btrfs will skip to write sb directly to keep
+	 * ERROR state on disk.
+	 *
+	 * 2. when btrfs flips readonly just in btrfs_commit_super,
+	 * and in such case, btrfs cannot write sb via btrfs_commit_super,
+	 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
+	 * btrfs will cleanup all FS resources first and write sb then.
+	 */
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
		ret = btrfs_commit_super(root);
+		if (ret)
+			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+	}
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		ret = btrfs_error_commit_super(root);
		if (ret)
			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
@@ -2502,6 +2559,8 @@ int close_ctree(struct btrfs_root *root)
 	kfree(fs_info->chunk_root);
 	kfree(fs_info->dev_root);
 	kfree(fs_info->csum_root);
+	kfree(fs_info);
+
 	return 0;
 }
 
@@ -2619,6 +2678,352 @@ out:
 	return 0;
 }
 
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only)
+{
+	if (read_only)
+		return;
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+		printk(KERN_WARNING "warning: mount fs with errors, "
+		       "running btrfsck is recommended\n");
+}
+
+int btrfs_error_commit_super(struct btrfs_root *root)
+{
+	int ret;
+
+	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(root);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
+
+	down_write(&root->fs_info->cleanup_work_sem);
+	up_write(&root->fs_info->cleanup_work_sem);
+
+	/* cleanup FS via transaction */
+	btrfs_cleanup_transaction(root);
+
+	ret = write_ctree_super(NULL, root, 0);
+
+	return ret;
+}
+
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_operations, &splice);
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 ordered_operations);
+
+		list_del_init(&btrfs_inode->ordered_operations);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return 0;
+}
+
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+	struct list_head splice;
+	struct btrfs_ordered_extent *ordered;
+	struct inode *inode;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	while (!list_empty(&splice)) {
+		ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+				     root_extent_list);
+
+		list_del_init(&ordered->root_extent_list);
+		atomic_inc(&ordered->refs);
+
+		/* the inode may be getting freed (in sys_unlink path). */
+		inode = igrab(ordered->inode);
+
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+		if (inode)
+			iput(inode);
+
+		atomic_set(&ordered->refs, 1);
+		btrfs_put_ordered_extent(ordered);
+
+		spin_lock(&root->fs_info->ordered_extent_lock);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+
+	return 0;
+}
+
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	int ret = 0;
+
+	delayed_refs = &trans->delayed_refs;
+
+	spin_lock(&delayed_refs->lock);
+	if (delayed_refs->num_entries == 0) {
+		printk(KERN_INFO "delayed_refs has NO entry\n");
+		return ret;
+	}
+
+	node = rb_first(&delayed_refs->root);
+	while (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_next(node);
+
+		ref->in_tree = 0;
+		rb_erase(&ref->rb_node, &delayed_refs->root);
+		delayed_refs->num_entries--;
+
+		atomic_set(&ref->refs, 1);
+		if (btrfs_delayed_ref_is_head(ref)) {
+			struct btrfs_delayed_ref_head *head;
+
+			head = btrfs_delayed_node_to_head(ref);
+			mutex_lock(&head->mutex);
+			kfree(head->extent_op);
+			delayed_refs->num_heads--;
+			if (list_empty(&head->cluster))
+				delayed_refs->num_heads_ready--;
+			list_del_init(&head->cluster);
+			mutex_unlock(&head->mutex);
+		}
+
+		spin_unlock(&delayed_refs->lock);
+		btrfs_put_delayed_ref(ref);
+
+		cond_resched();
+		spin_lock(&delayed_refs->lock);
+	}
+
+	spin_unlock(&delayed_refs->lock);
+
+	return ret;
+}
+
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+{
+	struct btrfs_pending_snapshot *snapshot;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	list_splice_init(&t->pending_snapshots, &splice);
+
+	while (!list_empty(&splice)) {
+		snapshot = list_entry(splice.next,
+				      struct btrfs_pending_snapshot,
+				      list);
+
+		list_del_init(&snapshot->list);
+
+		kfree(snapshot);
+	}
+
+	return 0;
+}
+
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+	spin_lock(&root->fs_info->delalloc_lock);
+
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 delalloc_inodes);
+
+		list_del_init(&btrfs_inode->delalloc_inodes);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->delalloc_lock);
+
+	return 0;
+}
+
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark)
+{
+	int ret;
+	struct page *page;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_buffer *eb;
+	u64 start = 0;
+	u64 end;
+	u64 offset;
+	unsigned long index;
+
+	while (1) {
+		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+					    mark);
+		if (ret)
+			break;
+
+		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+		while (start <= end) {
+			index = start >> PAGE_CACHE_SHIFT;
+			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+			page = find_get_page(btree_inode->i_mapping, index);
+			if (!page)
+				continue;
+			offset = page_offset(page);
+
+			spin_lock(&dirty_pages->buffer_lock);
+			eb = radix_tree_lookup(
+			     &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+			     offset >> PAGE_CACHE_SHIFT);
+			spin_unlock(&dirty_pages->buffer_lock);
+			if (eb) {
+				ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+							 &eb->bflags);
+				atomic_set(&eb->refs, 1);
+			}
+			if (PageWriteback(page))
+				end_page_writeback(page);
+
+			lock_page(page);
+			if (PageDirty(page)) {
+				clear_page_dirty_for_io(page);
+				spin_lock_irq(&page->mapping->tree_lock);
+				radix_tree_tag_clear(&page->mapping->page_tree,
+						     page_index(page),
+						     PAGECACHE_TAG_DIRTY);
+				spin_unlock_irq(&page->mapping->tree_lock);
+			}
+
+			page->mapping->a_ops->invalidatepage(page, 0);
+			unlock_page(page);
+		}
+	}
+
+	return ret;
+}
+
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+				       struct extent_io_tree *pinned_extents)
+{
+	struct extent_io_tree *unpin;
+	u64 start;
+	u64 end;
+	int ret;
+
+	unpin = pinned_extents;
+	while (1) {
+		ret = find_first_extent_bit(unpin, 0, &start, &end,
+					    EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		/* opt_discard */
+		ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+
+		clear_extent_dirty(unpin, start, end, GFP_NOFS);
+		btrfs_error_unpin_extent_range(root, start, end);
+		cond_resched();
+	}
+
+	return 0;
+}
+
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+	struct btrfs_transaction *t;
+	LIST_HEAD(list);
+
+	WARN_ON(1);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+	list_splice_init(&root->fs_info->trans_list, &list);
+	while (!list_empty(&list)) {
+		t = list_entry(list.next, struct btrfs_transaction, list);
+		if (!t)
+			break;
+
+		btrfs_destroy_ordered_operations(root);
+
+		btrfs_destroy_ordered_extents(root);
+
+		btrfs_destroy_delayed_refs(t, root);
+
+		btrfs_block_rsv_release(root,
+					&root->fs_info->trans_block_rsv,
+					t->dirty_pages.dirty_bytes);
+
+		/* FIXME: cleanup wait for commit */
+		t->in_commit = 1;
+		t->blocked = 1;
+		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+			wake_up(&root->fs_info->transaction_blocked_wait);
+
+		t->blocked = 0;
+		if (waitqueue_active(&root->fs_info->transaction_wait))
+			wake_up(&root->fs_info->transaction_wait);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+		t->commit_done = 1;
+		if (waitqueue_active(&t->commit_wait))
+			wake_up(&t->commit_wait);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+
+		btrfs_destroy_pending_snapshots(t);
+
+		btrfs_destroy_delalloc_inodes(root);
+
+		spin_lock(&root->fs_info->new_trans_lock);
+		root->fs_info->running_transaction = NULL;
+		spin_unlock(&root->fs_info->new_trans_lock);
+
+		btrfs_destroy_marked_extents(root, &t->dirty_pages,
+					     EXTENT_DIRTY);
+
+		btrfs_destroy_pinned_extent(root,
+					    root->fs_info->pinned_extents);
+
+		t->use_count = 0;
+		list_del_init(&t->list);
+		memset(t, 0, sizeof(*t));
+		kmem_cache_free(btrfs_transaction_cachep, t);
+	}
+
+	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	return 0;
+}
+
 static struct extent_io_ops btree_extent_io_ops = {
	.write_cache_pages_lock_hook = btree_lock_page_hook,
	.readpage_end_io_hook = btree_readpage_end_io_hook,
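
Several of the btrfs_destroy_*() helpers above share one teardown idiom: splice the shared list onto a private head while holding the lock, then drain the private copy, dropping the lock around any step that may sleep or re-take it. A stripped-down sketch of that idiom, using a hypothetical item type rather than the btrfs structures:

/* illustrative splice-and-drain teardown; struct item is hypothetical */
struct item {
	struct list_head node;
};

static void drain_all(struct list_head *shared, spinlock_t *lock)
{
	LIST_HEAD(splice);	/* private list head on the stack */
	struct item *it;

	spin_lock(lock);
	list_splice_init(shared, &splice);	/* shared list is now empty */
	while (!list_empty(&splice)) {
		it = list_entry(splice.next, struct item, node);
		list_del_init(&it->node);

		spin_unlock(lock);	/* drop the lock for blocking work */
		kfree(it);
		spin_lock(lock);
	}
	spin_unlock(lock);
}
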
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf21..07b20dc2fd95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
+int btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
					    u64 bytenr, u32 blocksize);
 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9786963b07e5..ff27d7a477b2 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -171,6 +171,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	int ret;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
 
 	if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
		key.objectid = root->root_key.objectid;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 227e5815d838..4e7e012ad667 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,11 +320,6 @@ static int caching_kthread(void *data)
 	if (!path)
		return -ENOMEM;
 
-	exclude_super_stripes(extent_root, block_group);
-	spin_lock(&block_group->space_info->lock);
-	block_group->space_info->bytes_readonly += block_group->bytes_super;
-	spin_unlock(&block_group->space_info->lock);
-
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
 	/*
@@ -467,8 +462,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
			cache->cached = BTRFS_CACHE_NO;
		}
		spin_unlock(&cache->lock);
-		if (ret == 1)
+		if (ret == 1) {
+			free_excluded_extents(fs_info->extent_root, cache);
			return 0;
+		}
 	}
 
 	if (load_cache_only)
@@ -3089,7 +3086,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 	return btrfs_reduce_alloc_profile(root, flags);
 }
 
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
 	u64 flags;
 
@@ -3161,8 +3158,12 @@ alloc:
					     bytes + 2 * 1024 * 1024,
					     alloc_target, 0);
			btrfs_end_transaction(trans, root);
-			if (ret < 0)
-				return ret;
+			if (ret < 0) {
+				if (ret != -ENOSPC)
+					return ret;
+				else
+					goto commit_trans;
+			}
 
			if (!data_sinfo) {
				btrfs_set_inode_space_info(root, inode);
@@ -3173,6 +3174,7 @@ alloc:
		spin_unlock(&data_sinfo->lock);
 
		/* commit the current transaction and try again */
+commit_trans:
		if (!committed && !root->fs_info->open_ioctl_trans) {
			committed = 1;
			trans = btrfs_join_transaction(root, 1);
@@ -3339,8 +3341,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
+	long time_left;
 	int pause = 1;
 	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
+	int loops = 0;
 
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
@@ -3353,7 +3357,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 
 	max_reclaim = min(reserved, to_reclaim);
 
-	while (1) {
+	while (loops < 1024) {
		/* have the flusher threads jump in and do some IO */
		smp_mb();
		nr_pages = min_t(unsigned long, nr_pages,
@@ -3361,8 +3365,12 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
		spin_lock(&space_info->lock);
-		if (reserved > space_info->bytes_reserved)
+		if (reserved > space_info->bytes_reserved) {
+			loops = 0;
			reclaimed += reserved - space_info->bytes_reserved;
+		} else {
+			loops++;
+		}
		reserved = space_info->bytes_reserved;
		spin_unlock(&space_info->lock);
 
@@ -3373,7 +3381,12 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
			return -EAGAIN;
 
		__set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(pause);
+		time_left = schedule_timeout(pause);
+
+		/* We were interrupted, exit */
+		if (time_left)
+			break;
+
		pause <<= 1;
		if (pause > HZ / 10)
			pause = HZ / 10;
@@ -3583,8 +3596,20 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
 
 	if (num_bytes > 0) {
		if (dest) {
-			block_rsv_add_bytes(dest, num_bytes, 0);
-		} else {
+			spin_lock(&dest->lock);
+			if (!dest->full) {
+				u64 bytes_to_add;
+
+				bytes_to_add = dest->size - dest->reserved;
+				bytes_to_add = min(num_bytes, bytes_to_add);
+				dest->reserved += bytes_to_add;
+				if (dest->reserved >= dest->size)
+					dest->full = 1;
+				num_bytes -= bytes_to_add;
+			}
+			spin_unlock(&dest->lock);
+		}
+		if (num_bytes) {
			spin_lock(&space_info->lock);
			space_info->bytes_reserved -= num_bytes;
			spin_unlock(&space_info->lock);
@@ -3721,11 +3746,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
		return 0;
 	}
 
-	WARN_ON(1);
-	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-		block_rsv->size, block_rsv->reserved,
-		block_rsv->freed[0], block_rsv->freed[1]);
-
 	return -ENOSPC;
 }
 
@@ -4012,6 +4032,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
 	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
 
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
@@ -5633,6 +5654,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
	      struct btrfs_root *root, u32 blocksize)
 {
 	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	int ret;
 
 	block_rsv = get_block_rsv(trans, root);
@@ -5640,14 +5662,39 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	if (block_rsv->size == 0) {
		ret = reserve_metadata_bytes(trans, root, block_rsv,
					     blocksize, 0);
-		if (ret)
+		/*
+		 * If we couldn't reserve metadata bytes try and use some from
+		 * the global reserve.
+		 */
+		if (ret && block_rsv != global_rsv) {
+			ret = block_rsv_use_bytes(global_rsv, blocksize);
+			if (!ret)
+				return global_rsv;
+			return ERR_PTR(ret);
+		} else if (ret) {
			return ERR_PTR(ret);
+		}
		return block_rsv;
 	}
 
 	ret = block_rsv_use_bytes(block_rsv, blocksize);
 	if (!ret)
		return block_rsv;
+	if (ret) {
+		WARN_ON(1);
+		ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
+					     0);
+		if (!ret) {
+			spin_lock(&block_rsv->lock);
+			block_rsv->size += blocksize;
+			spin_unlock(&block_rsv->lock);
+			return block_rsv;
+		} else if (ret && block_rsv != global_rsv) {
+			ret = block_rsv_use_bytes(global_rsv, blocksize);
+			if (!ret)
+				return global_rsv;
+		}
+	}
 
 	return ERR_PTR(-ENOSPC);
 }
@@ -6221,6 +6268,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	BUG_ON(!wc);
 
 	trans = btrfs_start_transaction(tree_root, 0);
+	BUG_ON(IS_ERR(trans));
+
 	if (block_rsv)
		trans->block_rsv = block_rsv;
 
@@ -6318,6 +6367,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 
			btrfs_end_transaction_throttle(trans, tree_root);
			trans = btrfs_start_transaction(tree_root, 0);
+			BUG_ON(IS_ERR(trans));
			if (block_rsv)
				trans->block_rsv = block_rsv;
		}
@@ -6446,6 +6496,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start,
 	int ret = 0;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -ENOMEM;
 
 	mutex_lock(&inode->i_mutex);
 	first_index = start >> PAGE_CACHE_SHIFT;
@@ -7477,7 +7529,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
 	BUG_ON(reloc_root->commit_root != NULL);
 	while (1) {
		trans = btrfs_join_transaction(root, 1);
-		BUG_ON(!trans);
+		BUG_ON(IS_ERR(trans));
 
		mutex_lock(&root->fs_info->drop_mutex);
		ret = btrfs_drop_snapshot(trans, reloc_root);
@@ -7535,7 +7587,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
 
 	if (found) {
		trans = btrfs_start_transaction(root, 1);
-		BUG_ON(!trans);
+		BUG_ON(IS_ERR(trans));
		ret = btrfs_commit_transaction(trans, root);
		BUG_ON(ret);
 	}
@@ -7779,7 +7831,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 
 
 	trans = btrfs_start_transaction(extent_root, 1);
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 
 	if (extent_key->objectid == 0) {
		ret = del_extent_zero(trans, extent_root, path, extent_key);
@@ -7970,13 +8022,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
 
 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
	    sinfo->bytes_may_use + sinfo->bytes_readonly +
-	    cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+	    cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
		sinfo->bytes_readonly += num_bytes;
		sinfo->bytes_reserved += cache->reserved_pinned;
		cache->reserved_pinned = 0;
		cache->ro = 1;
		ret = 0;
	}
+
 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);
 	return ret;
@@ -8012,6 +8065,62 @@ out:
 	return ret;
 }
 
+/*
+ * helper to account the unused space of all the readonly block group in the
+ * list. takes mirrors into account.
+ */
+static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+{
+	struct btrfs_block_group_cache *block_group;
+	u64 free_bytes = 0;
+	int factor;
+
+	list_for_each_entry(block_group, groups_list, list) {
+		spin_lock(&block_group->lock);
+
+		if (!block_group->ro) {
+			spin_unlock(&block_group->lock);
+			continue;
+		}
+
+		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+					  BTRFS_BLOCK_GROUP_RAID10 |
+					  BTRFS_BLOCK_GROUP_DUP))
+			factor = 2;
+		else
+			factor = 1;
+
+		free_bytes += (block_group->key.offset -
+			       btrfs_block_group_used(&block_group->item)) *
+			       factor;
+
+		spin_unlock(&block_group->lock);
+	}
+
+	return free_bytes;
+}
+
+/*
+ * helper to account the unused space of all the readonly block group in the
+ * space_info. takes mirrors into account.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+	int i;
+	u64 free_bytes = 0;
+
+	spin_lock(&sinfo->lock);
+
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		if (!list_empty(&sinfo->block_groups[i]))
+			free_bytes += __btrfs_get_ro_block_group_free_space(
+						&sinfo->block_groups[i]);
+
+	spin_unlock(&sinfo->lock);
+
+	return free_bytes;
+}
+
 int btrfs_set_block_group_rw(struct btrfs_root *root,
			     struct btrfs_block_group_cache *cache)
 {
@@ -8092,7 +8201,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 	mutex_lock(&root->fs_info->chunk_mutex);
 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		u64 min_free = btrfs_block_group_used(&block_group->item);
-		u64 dev_offset, max_avail;
+		u64 dev_offset;
 
		/*
		 * check to make sure we can actually find a chunk with enough
@@ -8100,7 +8209,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
		 */
		if (device->total_bytes > device->bytes_used + min_free) {
			ret = find_free_dev_extent(NULL, device, min_free,
-						   &dev_offset, &max_avail);
+						   &dev_offset, NULL);
			if (!ret)
				break;
			ret = -1;
@@ -8213,6 +8322,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
		if (block_group->cached == BTRFS_CACHE_STARTED)
			wait_block_group_cache_done(block_group);
 
+		/*
+		 * We haven't cached this block group, which means we could
+		 * possibly have excluded extents on this block group.
+		 */
+		if (block_group->cached == BTRFS_CACHE_NO)
+			free_excluded_extents(info->extent_root, block_group);
+
		btrfs_remove_free_space_cache(block_group);
		btrfs_put_block_group(block_group);
 
@@ -8328,6 +8444,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
		cache->sectorsize = root->sectorsize;
 
		/*
+		 * We need to exclude the super stripes now so that the space
+		 * info has super bytes accounted for, otherwise we'll think
+		 * we have more space than we actually do.
+		 */
+		exclude_super_stripes(root, cache);
+
+		/*
		 * check for two cases, either we are full, and therefore
		 * don't need to bother with the caching work since we won't
		 * find any space, or we are empty, and we can just add all
@@ -8335,12 +8458,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
		 * time, particularly in the full case.
		 */
		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
-			exclude_super_stripes(root, cache);
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			free_excluded_extents(root, cache);
		} else if (btrfs_block_group_used(&cache->item) == 0) {
-			exclude_super_stripes(root, cache);
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			add_new_free_space(cache, root->fs_info,
@@ -8584,3 +8705,14 @@ out:
 	btrfs_free_path(path);
 	return ret;
 }
+
+int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+{
+	return unpin_extent_range(root, start, end);
+}
+
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes)
+{
+	return btrfs_discard_extent(root, bytenr, num_bytes);
+}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e86b9f36507..5e76a474cb7e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1865,7 +1865,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 	bio_get(bio);
 
 	if (tree->ops && tree->ops->submit_bio_hook)
-		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
					   mirror_num, bio_flags, start);
 	else
		submit_bio(rw, bio);
@@ -1920,6 +1920,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	nr = bio_get_nr_vecs(bdev);
 
 	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+	if (!bio)
+		return -ENOMEM;
 
 	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
@@ -2028,8 +2030,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);
 
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			this_bio_flag = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&this_bio_flag,
+						 em->compress_type);
+		}
 
		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		cur_end = min(extent_map_end(em) - 1, end);
@@ -2123,7 +2128,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
				      &bio_flags);
 	if (bio)
-		submit_one_bio(READ, bio, 0, bio_flags);
+		ret = submit_one_bio(READ, bio, 0, bio_flags);
 	return ret;
 }
 
@@ -3072,6 +3077,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 #endif
 
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+	if (eb == NULL)
+		return NULL;
 	eb->start = start;
 	eb->len = len;
 	spin_lock_init(&eb->lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4183c8178f01..7083cfafd061 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,8 +20,12 @@
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
-/* flags for bio submission */
+/*
+ * flags for bio submission. The high bits indicate the compression
+ * type for this bio
+ */
 #define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
@@ -135,6 +139,17 @@ struct extent_buffer {
 	wait_queue_head_t lock_wq;
 };
 
+static inline void extent_set_compress_type(unsigned long *bio_flags,
+					    int compress_type)
+{
+	*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
+}
+
+static inline int extent_compress_type(unsigned long bio_flags)
+{
+	return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
+}
+
 struct extent_map_tree;
 
 static inline struct extent_state *extent_state_next(struct extent_state *state)
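
extent_set_compress_type() and extent_compress_type() above pack the compression algorithm into the upper bits of bio_flags, keeping the low 16 bits for flags such as EXTENT_BIO_COMPRESSED. A self-contained sketch of the encoding (the numeric compression value is illustrative; the real constants come from compression.h):

#include <stdio.h>

#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_FLAG_SHIFT 16

int main(void)
{
	unsigned long bio_flags = EXTENT_BIO_COMPRESSED;
	int compress_type = 2;	/* illustrative, e.g. an LZO-style id */

	/* pack: the compression type lives above bit 15 */
	bio_flags |= (unsigned long)compress_type << EXTENT_BIO_FLAG_SHIFT;

	/* unpack: shift the type back down; low bits still hold the flag */
	printf("type=%d compressed=%lu\n",
	       (int)(bio_flags >> EXTENT_BIO_FLAG_SHIFT),
	       bio_flags & EXTENT_BIO_COMPRESSED);
	return 0;
}
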
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23cb8da3ff66..b0e1fce12530 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
+#include "ctree.h"
 #include "extent_map.h"
 
 
@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
		return em;
 	em->in_tree = 0;
 	em->flags = 0;
+	em->compress_type = BTRFS_COMPRESS_NONE;
 	atomic_set(&em->refs, 1);
 	return em;
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..28b44dbd1e35 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
-	int in_tree;
+	unsigned int in_tree:1;
+	unsigned int compress_type:4;
 };
 
 struct extent_map_tree {
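
Narrowing in_tree to one bit makes room for the new 4-bit compress_type without growing struct extent_map: both fields now share the storage the old int used, and 4 bits are enough to name up to 16 compression algorithms. A quick user-space illustration of the packing (field names shortened):

#include <stdio.h>

struct em_old { int in_tree; };
struct em_new {
	unsigned int in_tree:1;		/* one bit instead of a whole int */
	unsigned int compress_type:4;	/* room for 16 algorithm ids */
};

int main(void)
{
	/* both bitfields fit in the space the single int occupied */
	printf("old=%zu new=%zu\n",
	       sizeof(struct em_old), sizeof(struct em_new));
	return 0;
}
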
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a562a250ae77..4f19a3e1bf32 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -536,6 +536,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 	root = root->fs_info->csum_root;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
 	while (1) {
		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -548,7 +550,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
			if (path->slots[0] == 0)
				goto out;
			path->slots[0]--;
+		} else if (ret < 0) {
+			goto out;
		}
+
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66836d85763b..c1d3a818731a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
+#include <linux/falloc.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/statfs.h>
@@ -224,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
		split->bdev = em->bdev;
		split->flags = flags;
+		split->compress_type = em->compress_type;
		ret = add_extent_mapping(em_tree, split);
		BUG_ON(ret);
		free_extent_map(split);
@@ -238,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
			split->flags = flags;
+			split->compress_type = em->compress_type;
 
			if (compressed) {
				split->block_len = em->block_len;
@@ -790,8 +793,12 @@ again:
 	for (i = 0; i < num_pages; i++) {
		pages[i] = grab_cache_page(inode->i_mapping, index + i);
		if (!pages[i]) {
-			err = -ENOMEM;
-			BUG_ON(1);
+			int c;
+			for (c = i - 1; c >= 0; c--) {
+				unlock_page(pages[c]);
+				page_cache_release(pages[c]);
+			}
+			return -ENOMEM;
		}
		wait_on_page_writeback(pages[i]);
 	}
@@ -890,6 +897,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	if (err)
		goto out;
 
+	/*
+	 * If BTRFS flips readonly due to some impossible error
+	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+	 * although we have opened a file as writable, we have
+	 * to stop this write operation to ensure FS consistency.
+	 */
+	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		err = -EROFS;
+		goto out;
+	}
+
 	file_update_time(file);
 	BTRFS_I(inode)->sequence++;
 
@@ -932,6 +950,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
		      PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
		      (sizeof(struct page *)));
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	/* generic_write_checks can change our pos */
 	start_pos = pos;
@@ -970,8 +992,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
		size_t write_bytes = min(iov_iter_count(&i),
					 nrptrs * (size_t)PAGE_CACHE_SIZE -
					 offset);
-		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
-					PAGE_CACHE_SHIFT;
+		size_t num_pages = (write_bytes + offset +
+				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
		WARN_ON(num_pages > nrptrs);
		memset(pages, 0, sizeof(struct page *) * nrptrs);
@@ -1001,8 +1023,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
		copied = btrfs_copy_from_user(pos, num_pages,
					      write_bytes, pages, &i);
-		dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
-				PAGE_CACHE_SHIFT;
+		dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
+				PAGE_CACHE_SHIFT;
 
		if (num_pages > dirty_pages) {
			if (copied > 0)
@@ -1237,6 +1259,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
 	return 0;
 }
 
+static long btrfs_fallocate(struct file *file, int mode,
+			    loff_t offset, loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct extent_state *cached_state = NULL;
+	u64 cur_offset;
+	u64 last_byte;
+	u64 alloc_start;
+	u64 alloc_end;
+	u64 alloc_hint = 0;
+	u64 locked_end;
+	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+	struct extent_map *em;
+	int ret;
+
+	alloc_start = offset & ~mask;
+	alloc_end = (offset + len + mask) & ~mask;
+
+	/* We only support the FALLOC_FL_KEEP_SIZE mode */
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	/*
+	 * wait for ordered IO before we have any locks.  We'll loop again
+	 * below with the locks held.
+	 */
+	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
+	mutex_lock(&inode->i_mutex);
+	ret = inode_newsize_ok(inode, alloc_end);
+	if (ret)
+		goto out;
+
+	if (alloc_start > inode->i_size) {
+		ret = btrfs_cont_expand(inode, alloc_start);
+		if (ret)
+			goto out;
+	}
+
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+	if (ret)
+		goto out;
+
+	locked_end = alloc_end - 1;
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+
+		/* the extent lock is ordered inside the running
+		 * transaction
+		 */
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+				 locked_end, 0, &cached_state, GFP_NOFS);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    alloc_end - 1);
+		if (ordered &&
+		    ordered->file_offset + ordered->len > alloc_start &&
+		    ordered->file_offset < alloc_end) {
+			btrfs_put_ordered_extent(ordered);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     alloc_start, locked_end,
+					     &cached_state, GFP_NOFS);
+			/*
+			 * we can't wait on the range with the transaction
+			 * running or with the extent lock held
+			 */
+			btrfs_wait_ordered_range(inode, alloc_start,
+						 alloc_end - alloc_start);
+		} else {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+	}
+
+	cur_offset = alloc_start;
+	while (1) {
+		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+				      alloc_end - cur_offset, 0);
+		BUG_ON(IS_ERR(em) || !em);
+		last_byte = min(extent_map_end(em), alloc_end);
+		last_byte = (last_byte + mask) & ~mask;
+		if (em->block_start == EXTENT_MAP_HOLE ||
+		    (cur_offset >= inode->i_size &&
+		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+							last_byte - cur_offset,
+							1 << inode->i_blkbits,
+							offset + len,
+							&alloc_hint);
+			if (ret < 0) {
+				free_extent_map(em);
+				break;
+			}
+		}
+		free_extent_map(em);
+
+		cur_offset = last_byte;
+		if (cur_offset >= alloc_end) {
+			ret = 0;
+			break;
+		}
+	}
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+			     &cached_state, GFP_NOFS);
+
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
 const struct file_operations btrfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
@@ -1248,6 +1381,7 @@ const struct file_operations btrfs_file_operations = {
	.open		= generic_file_open,
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
+	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
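
Two of the hunks above are pure rounding fixes. btrfs_fallocate() widens the requested range outward to sector boundaries with a power-of-two mask, and the write path now adds the in-page offset before rounding up, since a write that straddles a page boundary touches one more page than its length alone suggests. A user-space sketch of both calculations (the 4 KiB constants are chosen for illustration):

#include <stdio.h>
#include <stdint.h>

#define SECTORSIZE 4096ULL	/* illustrative sector and page size */
#define PAGE_SHIFT_ 12

int main(void)
{
	uint64_t offset = 5000, len = 3000;
	uint64_t mask = SECTORSIZE - 1;

	/* round the start down and the end up, as btrfs_fallocate() does */
	uint64_t alloc_start = offset & ~mask;			/* 4096 */
	uint64_t alloc_end = (offset + len + mask) & ~mask;	/* 8192 */
	printf("alloc range [%llu, %llu)\n",
	       (unsigned long long)alloc_start, (unsigned long long)alloc_end);

	/* a 100-byte write starting 50 bytes before a page boundary
	 * touches two pages, so the offset must join the round-up */
	uint64_t in_page = 4046, write_bytes = 100;
	uint64_t without = (write_bytes + SECTORSIZE - 1) >> PAGE_SHIFT_;
	uint64_t with = (write_bytes + in_page + SECTORSIZE - 1) >> PAGE_SHIFT_;
	printf("pages: without offset=%llu, with offset=%llu\n",
	       (unsigned long long)without, (unsigned long long)with);
	return 0;
}
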
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 60d684266959..a0390657451b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -987,11 +987,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
987 return entry; 987 return entry;
988} 988}
989 989
990static void unlink_free_space(struct btrfs_block_group_cache *block_group, 990static inline void
991 struct btrfs_free_space *info) 991__unlink_free_space(struct btrfs_block_group_cache *block_group,
992 struct btrfs_free_space *info)
992{ 993{
993 rb_erase(&info->offset_index, &block_group->free_space_offset); 994 rb_erase(&info->offset_index, &block_group->free_space_offset);
994 block_group->free_extents--; 995 block_group->free_extents--;
996}
997
998static void unlink_free_space(struct btrfs_block_group_cache *block_group,
999 struct btrfs_free_space *info)
1000{
1001 __unlink_free_space(block_group, info);
995 block_group->free_space -= info->bytes; 1002 block_group->free_space -= info->bytes;
996} 1003}
997 1004
@@ -1016,14 +1023,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
1016 u64 max_bytes; 1023 u64 max_bytes;
1017 u64 bitmap_bytes; 1024 u64 bitmap_bytes;
1018 u64 extent_bytes; 1025 u64 extent_bytes;
1026 u64 size = block_group->key.offset;
1019 1027
1020 /* 1028 /*
1021 * The goal is to keep the total amount of memory used per 1gb of space 1029 * The goal is to keep the total amount of memory used per 1gb of space
1022 * at or below 32k, so we need to adjust how much memory we allow to be 1030 * at or below 32k, so we need to adjust how much memory we allow to be
1023 * used by extent based free space tracking 1031 * used by extent based free space tracking
1024 */ 1032 */
1025 max_bytes = MAX_CACHE_BYTES_PER_GIG * 1033 if (size < 1024 * 1024 * 1024)
1026 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); 1034 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1035 else
1036 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1037 div64_u64(size, 1024 * 1024 * 1024);
1027 1038
1028 /* 1039 /*
1029 * we want to account for 1 more bitmap than what we have so we can make 1040 * we want to account for 1 more bitmap than what we have so we can make
@@ -1171,6 +1182,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
1171 recalculate_thresholds(block_group); 1182 recalculate_thresholds(block_group);
1172} 1183}
1173 1184
1185static void free_bitmap(struct btrfs_block_group_cache *block_group,
1186 struct btrfs_free_space *bitmap_info)
1187{
1188 unlink_free_space(block_group, bitmap_info);
1189 kfree(bitmap_info->bitmap);
1190 kfree(bitmap_info);
1191 block_group->total_bitmaps--;
1192 recalculate_thresholds(block_group);
1193}
1194
1174static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, 1195static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
1175 struct btrfs_free_space *bitmap_info, 1196 struct btrfs_free_space *bitmap_info,
1176 u64 *offset, u64 *bytes) 1197 u64 *offset, u64 *bytes)
@@ -1195,6 +1216,7 @@ again:
1195 */ 1216 */
1196 search_start = *offset; 1217 search_start = *offset;
1197 search_bytes = *bytes; 1218 search_bytes = *bytes;
1219 search_bytes = min(search_bytes, end - search_start + 1);
1198 ret = search_bitmap(block_group, bitmap_info, &search_start, 1220 ret = search_bitmap(block_group, bitmap_info, &search_start,
1199 &search_bytes); 1221 &search_bytes);
1200 BUG_ON(ret < 0 || search_start != *offset); 1222 BUG_ON(ret < 0 || search_start != *offset);
@@ -1211,13 +1233,8 @@ again:
1211 1233
1212 if (*bytes) { 1234 if (*bytes) {
1213 struct rb_node *next = rb_next(&bitmap_info->offset_index); 1235 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1214 if (!bitmap_info->bytes) { 1236 if (!bitmap_info->bytes)
1215 unlink_free_space(block_group, bitmap_info); 1237 free_bitmap(block_group, bitmap_info);
1216 kfree(bitmap_info->bitmap);
1217 kfree(bitmap_info);
1218 block_group->total_bitmaps--;
1219 recalculate_thresholds(block_group);
1220 }
1221 1238
1222 /* 1239 /*
1223 * no entry after this bitmap, but we still have bytes to 1240 * no entry after this bitmap, but we still have bytes to
@@ -1250,13 +1267,8 @@ again:
1250 return -EAGAIN; 1267 return -EAGAIN;
1251 1268
1252 goto again; 1269 goto again;
1253 } else if (!bitmap_info->bytes) { 1270 } else if (!bitmap_info->bytes)
1254 unlink_free_space(block_group, bitmap_info); 1271 free_bitmap(block_group, bitmap_info);
1255 kfree(bitmap_info->bitmap);
1256 kfree(bitmap_info);
1257 block_group->total_bitmaps--;
1258 recalculate_thresholds(block_group);
1259 }
1260 1272
1261 return 0; 1273 return 0;
1262} 1274}
@@ -1359,22 +1371,14 @@ out:
1359 return ret; 1371 return ret;
1360} 1372}
1361 1373
1362int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 1374bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
1363 u64 offset, u64 bytes) 1375 struct btrfs_free_space *info, bool update_stat)
1364{ 1376{
1365 struct btrfs_free_space *right_info = NULL; 1377 struct btrfs_free_space *left_info;
1366 struct btrfs_free_space *left_info = NULL; 1378 struct btrfs_free_space *right_info;
1367 struct btrfs_free_space *info = NULL; 1379 bool merged = false;
1368 int ret = 0; 1380 u64 offset = info->offset;
1369 1381 u64 bytes = info->bytes;
1370 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
1371 if (!info)
1372 return -ENOMEM;
1373
1374 info->offset = offset;
1375 info->bytes = bytes;
1376
1377 spin_lock(&block_group->tree_lock);
1378 1382
1379 /* 1383 /*
1380 * first we want to see if there is free space adjacent to the range we 1384 * first we want to see if there is free space adjacent to the range we
@@ -1388,37 +1392,62 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1388 else 1392 else
1389 left_info = tree_search_offset(block_group, offset - 1, 0, 0); 1393 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
1390 1394
1391 /*
1392 * If there was no extent directly to the left or right of this new
1393 * extent then we know we're going to have to allocate a new extent, so
1394 * before we do that see if we need to drop this into a bitmap
1395 */
1396 if ((!left_info || left_info->bitmap) &&
1397 (!right_info || right_info->bitmap)) {
1398 ret = insert_into_bitmap(block_group, info);
1399
1400 if (ret < 0) {
1401 goto out;
1402 } else if (ret) {
1403 ret = 0;
1404 goto out;
1405 }
1406 }
1407
1408 if (right_info && !right_info->bitmap) { 1395 if (right_info && !right_info->bitmap) {
1409 unlink_free_space(block_group, right_info); 1396 if (update_stat)
1397 unlink_free_space(block_group, right_info);
1398 else
1399 __unlink_free_space(block_group, right_info);
1410 info->bytes += right_info->bytes; 1400 info->bytes += right_info->bytes;
1411 kfree(right_info); 1401 kfree(right_info);
1402 merged = true;
1412 } 1403 }
1413 1404
1414 if (left_info && !left_info->bitmap && 1405 if (left_info && !left_info->bitmap &&
1415 left_info->offset + left_info->bytes == offset) { 1406 left_info->offset + left_info->bytes == offset) {
1416 unlink_free_space(block_group, left_info); 1407 if (update_stat)
1408 unlink_free_space(block_group, left_info);
1409 else
1410 __unlink_free_space(block_group, left_info);
1417 info->offset = left_info->offset; 1411 info->offset = left_info->offset;
1418 info->bytes += left_info->bytes; 1412 info->bytes += left_info->bytes;
1419 kfree(left_info); 1413 kfree(left_info);
1414 merged = true;
1420 } 1415 }
1421 1416
1417 return merged;
1418}
1419
1420int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1421 u64 offset, u64 bytes)
1422{
1423 struct btrfs_free_space *info;
1424 int ret = 0;
1425
1426 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
1427 if (!info)
1428 return -ENOMEM;
1429
1430 info->offset = offset;
1431 info->bytes = bytes;
1432
1433 spin_lock(&block_group->tree_lock);
1434
1435 if (try_merge_free_space(block_group, info, true))
1436 goto link;
1437
1438 /*
 1439 * If there was no extent directly to the left or right of this new
1440 * extent then we know we're going to have to allocate a new extent, so
1441 * before we do that see if we need to drop this into a bitmap
1442 */
1443 ret = insert_into_bitmap(block_group, info);
1444 if (ret < 0) {
1445 goto out;
1446 } else if (ret) {
1447 ret = 0;
1448 goto out;
1449 }
1450link:
1422 ret = link_free_space(block_group, info); 1451 ret = link_free_space(block_group, info);
1423 if (ret) 1452 if (ret)
1424 kfree(info); 1453 kfree(info);
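
The coalescing that btrfs_add_free_space() now delegates to try_merge_free_space() is plain interval merging; a minimal userspace sketch of the same left/right rule (the kernel version walks an rbtree under tree_lock and skips bitmap entries, which this toy array omits):

#include <stdbool.h>
#include <stdint.h>

struct extent { uint64_t offset, bytes; bool used; };

/* Absorb a byte-adjacent right neighbour (starting at offset + bytes)
 * and left neighbour (ending exactly at offset) into info, loosely
 * mirroring try_merge_free_space(); returns true if anything merged.
 */
static bool try_merge(struct extent *tab, int n, struct extent *info)
{
	bool merged = false;

	for (int i = 0; i < n; i++) {
		if (!tab[i].used)
			continue;
		if (tab[i].offset == info->offset + info->bytes) {
			info->bytes += tab[i].bytes;	/* eat right */
			tab[i].used = false;
			merged = true;
		} else if (tab[i].offset + tab[i].bytes == info->offset) {
			info->offset = tab[i].offset;	/* eat left */
			info->bytes += tab[i].bytes;
			tab[i].used = false;
			merged = true;
		}
	}
	return merged;
}
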
@@ -1621,6 +1650,7 @@ __btrfs_return_cluster_to_free_space(
1621 node = rb_next(&entry->offset_index); 1650 node = rb_next(&entry->offset_index);
1622 rb_erase(&entry->offset_index, &cluster->root); 1651 rb_erase(&entry->offset_index, &cluster->root);
1623 BUG_ON(entry->bitmap); 1652 BUG_ON(entry->bitmap);
1653 try_merge_free_space(block_group, entry, false);
1624 tree_insert_offset(&block_group->free_space_offset, 1654 tree_insert_offset(&block_group->free_space_offset,
1625 entry->offset, &entry->offset_index, 0); 1655 entry->offset, &entry->offset_index, 0);
1626 } 1656 }
@@ -1685,13 +1715,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
1685 ret = offset; 1715 ret = offset;
1686 if (entry->bitmap) { 1716 if (entry->bitmap) {
1687 bitmap_clear_bits(block_group, entry, offset, bytes); 1717 bitmap_clear_bits(block_group, entry, offset, bytes);
1688 if (!entry->bytes) { 1718 if (!entry->bytes)
1689 unlink_free_space(block_group, entry); 1719 free_bitmap(block_group, entry);
1690 kfree(entry->bitmap);
1691 kfree(entry);
1692 block_group->total_bitmaps--;
1693 recalculate_thresholds(block_group);
1694 }
1695 } else { 1720 } else {
1696 unlink_free_space(block_group, entry); 1721 unlink_free_space(block_group, entry);
1697 entry->offset += bytes; 1722 entry->offset += bytes;
@@ -1789,6 +1814,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
1789 1814
1790 ret = search_start; 1815 ret = search_start;
1791 bitmap_clear_bits(block_group, entry, ret, bytes); 1816 bitmap_clear_bits(block_group, entry, ret, bytes);
1817 if (entry->bytes == 0)
1818 free_bitmap(block_group, entry);
1792out: 1819out:
1793 spin_unlock(&cluster->lock); 1820 spin_unlock(&cluster->lock);
1794 spin_unlock(&block_group->tree_lock); 1821 spin_unlock(&block_group->tree_lock);
@@ -1842,15 +1869,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1842 entry->offset += bytes; 1869 entry->offset += bytes;
1843 entry->bytes -= bytes; 1870 entry->bytes -= bytes;
1844 1871
1845 if (entry->bytes == 0) { 1872 if (entry->bytes == 0)
1846 rb_erase(&entry->offset_index, &cluster->root); 1873 rb_erase(&entry->offset_index, &cluster->root);
1847 kfree(entry);
1848 }
1849 break; 1874 break;
1850 } 1875 }
1851out: 1876out:
1852 spin_unlock(&cluster->lock); 1877 spin_unlock(&cluster->lock);
1853 1878
1879 if (!ret)
1880 return 0;
1881
1882 spin_lock(&block_group->tree_lock);
1883
1884 block_group->free_space -= bytes;
1885 if (entry->bytes == 0) {
1886 block_group->free_extents--;
1887 kfree(entry);
1888 }
1889
1890 spin_unlock(&block_group->tree_lock);
1891
1854 return ret; 1892 return ret;
1855} 1893}
1856 1894
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a3798a3aa0d2..bcc461a9695f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
122 size_t cur_size = size; 122 size_t cur_size = size;
123 size_t datasize; 123 size_t datasize;
124 unsigned long offset; 124 unsigned long offset;
125 int use_compress = 0; 125 int compress_type = BTRFS_COMPRESS_NONE;
126 126
127 if (compressed_size && compressed_pages) { 127 if (compressed_size && compressed_pages) {
128 use_compress = 1; 128 compress_type = root->fs_info->compress_type;
129 cur_size = compressed_size; 129 cur_size = compressed_size;
130 } 130 }
131 131
@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
159 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 159 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
160 ptr = btrfs_file_extent_inline_start(ei); 160 ptr = btrfs_file_extent_inline_start(ei);
161 161
162 if (use_compress) { 162 if (compress_type != BTRFS_COMPRESS_NONE) {
163 struct page *cpage; 163 struct page *cpage;
164 int i = 0; 164 int i = 0;
165 while (compressed_size > 0) { 165 while (compressed_size > 0) {
@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
176 compressed_size -= cur_size; 176 compressed_size -= cur_size;
177 } 177 }
178 btrfs_set_file_extent_compression(leaf, ei, 178 btrfs_set_file_extent_compression(leaf, ei,
179 BTRFS_COMPRESS_ZLIB); 179 compress_type);
180 } else { 180 } else {
181 page = find_get_page(inode->i_mapping, 181 page = find_get_page(inode->i_mapping,
182 start >> PAGE_CACHE_SHIFT); 182 start >> PAGE_CACHE_SHIFT);
@@ -263,6 +263,7 @@ struct async_extent {
263 u64 compressed_size; 263 u64 compressed_size;
264 struct page **pages; 264 struct page **pages;
265 unsigned long nr_pages; 265 unsigned long nr_pages;
266 int compress_type;
266 struct list_head list; 267 struct list_head list;
267}; 268};
268 269
@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
280 u64 start, u64 ram_size, 281 u64 start, u64 ram_size,
281 u64 compressed_size, 282 u64 compressed_size,
282 struct page **pages, 283 struct page **pages,
283 unsigned long nr_pages) 284 unsigned long nr_pages,
285 int compress_type)
284{ 286{
285 struct async_extent *async_extent; 287 struct async_extent *async_extent;
286 288
@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
290 async_extent->compressed_size = compressed_size; 292 async_extent->compressed_size = compressed_size;
291 async_extent->pages = pages; 293 async_extent->pages = pages;
292 async_extent->nr_pages = nr_pages; 294 async_extent->nr_pages = nr_pages;
295 async_extent->compress_type = compress_type;
293 list_add_tail(&async_extent->list, &cow->extents); 296 list_add_tail(&async_extent->list, &cow->extents);
294 return 0; 297 return 0;
295} 298}
@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
332 unsigned long max_uncompressed = 128 * 1024; 335 unsigned long max_uncompressed = 128 * 1024;
333 int i; 336 int i;
334 int will_compress; 337 int will_compress;
338 int compress_type = root->fs_info->compress_type;
335 339
336 actual_end = min_t(u64, isize, end + 1); 340 actual_end = min_t(u64, isize, end + 1);
337again: 341again:
@@ -381,12 +385,16 @@ again:
381 WARN_ON(pages); 385 WARN_ON(pages);
382 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 386 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
383 387
384 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 388 if (BTRFS_I(inode)->force_compress)
385 total_compressed, pages, 389 compress_type = BTRFS_I(inode)->force_compress;
386 nr_pages, &nr_pages_ret, 390
387 &total_in, 391 ret = btrfs_compress_pages(compress_type,
388 &total_compressed, 392 inode->i_mapping, start,
389 max_compressed); 393 total_compressed, pages,
394 nr_pages, &nr_pages_ret,
395 &total_in,
396 &total_compressed,
397 max_compressed);
390 398
391 if (!ret) { 399 if (!ret) {
392 unsigned long offset = total_compressed & 400 unsigned long offset = total_compressed &
@@ -408,7 +416,7 @@ again:
408 } 416 }
409 if (start == 0) { 417 if (start == 0) {
410 trans = btrfs_join_transaction(root, 1); 418 trans = btrfs_join_transaction(root, 1);
411 BUG_ON(!trans); 419 BUG_ON(IS_ERR(trans));
412 btrfs_set_trans_block_group(trans, inode); 420 btrfs_set_trans_block_group(trans, inode);
413 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 421 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
414 422
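
The BUG_ON(!trans) -> BUG_ON(IS_ERR(trans)) conversions repeated through this file follow from btrfs_join_transaction() reporting failure as ERR_PTR(-errno) rather than NULL, so a NULL test can never fire. For reference, a self-contained userspace model of the linux/err.h encoding these checks rely on:

#include <stdint.h>
#include <stdio.h>

/* Model of linux/err.h: errnos -4095..-1 are encoded as pointer values
 * in the top page of the address space, which no valid pointer can take.
 */
#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(intptr_t)(err))
#define PTR_ERR(ptr)	((long)(intptr_t)(ptr))
#define IS_ERR(ptr)	((uintptr_t)(ptr) >= (uintptr_t)-MAX_ERRNO)

static void *join_transaction(int fail)
{
	return fail ? ERR_PTR(-12) : (void *)0x1000;	/* -12 == -ENOMEM */
}

int main(void)
{
	void *trans = join_transaction(1);

	if (IS_ERR(trans))	/* a !trans test here would miss the error */
		printf("join failed: %ld\n", PTR_ERR(trans));
	return 0;
}
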
@@ -493,7 +501,8 @@ again:
493 * and will submit them to the elevator. 501 * and will submit them to the elevator.
494 */ 502 */
495 add_async_extent(async_cow, start, num_bytes, 503 add_async_extent(async_cow, start, num_bytes,
496 total_compressed, pages, nr_pages_ret); 504 total_compressed, pages, nr_pages_ret,
505 compress_type);
497 506
498 if (start + num_bytes < end) { 507 if (start + num_bytes < end) {
499 start += num_bytes; 508 start += num_bytes;
@@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed:
515 __set_page_dirty_nobuffers(locked_page); 524 __set_page_dirty_nobuffers(locked_page);
516 /* unlocked later on in the async handlers */ 525 /* unlocked later on in the async handlers */
517 } 526 }
518 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); 527 add_async_extent(async_cow, start, end - start + 1,
528 0, NULL, 0, BTRFS_COMPRESS_NONE);
519 *num_added += 1; 529 *num_added += 1;
520 } 530 }
521 531
@@ -602,6 +612,7 @@ retry:
602 GFP_NOFS); 612 GFP_NOFS);
603 613
604 trans = btrfs_join_transaction(root, 1); 614 trans = btrfs_join_transaction(root, 1);
615 BUG_ON(IS_ERR(trans));
605 ret = btrfs_reserve_extent(trans, root, 616 ret = btrfs_reserve_extent(trans, root,
606 async_extent->compressed_size, 617 async_extent->compressed_size,
607 async_extent->compressed_size, 618 async_extent->compressed_size,
@@ -640,6 +651,7 @@ retry:
640 em->block_start = ins.objectid; 651 em->block_start = ins.objectid;
641 em->block_len = ins.offset; 652 em->block_len = ins.offset;
642 em->bdev = root->fs_info->fs_devices->latest_bdev; 653 em->bdev = root->fs_info->fs_devices->latest_bdev;
654 em->compress_type = async_extent->compress_type;
643 set_bit(EXTENT_FLAG_PINNED, &em->flags); 655 set_bit(EXTENT_FLAG_PINNED, &em->flags);
644 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 656 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
645 657
@@ -656,11 +668,13 @@ retry:
656 async_extent->ram_size - 1, 0); 668 async_extent->ram_size - 1, 0);
657 } 669 }
658 670
659 ret = btrfs_add_ordered_extent(inode, async_extent->start, 671 ret = btrfs_add_ordered_extent_compress(inode,
660 ins.objectid, 672 async_extent->start,
661 async_extent->ram_size, 673 ins.objectid,
662 ins.offset, 674 async_extent->ram_size,
663 BTRFS_ORDERED_COMPRESSED); 675 ins.offset,
676 BTRFS_ORDERED_COMPRESSED,
677 async_extent->compress_type);
664 BUG_ON(ret); 678 BUG_ON(ret);
665 679
666 /* 680 /*
@@ -758,7 +772,7 @@ static noinline int cow_file_range(struct inode *inode,
758 772
759 BUG_ON(root == root->fs_info->tree_root); 773 BUG_ON(root == root->fs_info->tree_root);
760 trans = btrfs_join_transaction(root, 1); 774 trans = btrfs_join_transaction(root, 1);
761 BUG_ON(!trans); 775 BUG_ON(IS_ERR(trans));
762 btrfs_set_trans_block_group(trans, inode); 776 btrfs_set_trans_block_group(trans, inode);
763 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 777 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
764 778
@@ -1036,7 +1050,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1036 } else { 1050 } else {
1037 trans = btrfs_join_transaction(root, 1); 1051 trans = btrfs_join_transaction(root, 1);
1038 } 1052 }
1039 BUG_ON(!trans); 1053 BUG_ON(IS_ERR(trans));
1040 1054
1041 cow_start = (u64)-1; 1055 cow_start = (u64)-1;
1042 cur_offset = start; 1056 cur_offset = start;
@@ -1544,6 +1558,7 @@ out:
1544out_page: 1558out_page:
1545 unlock_page(page); 1559 unlock_page(page);
1546 page_cache_release(page); 1560 page_cache_release(page);
1561 kfree(fixup);
1547} 1562}
1548 1563
1549/* 1564/*
@@ -1670,7 +1685,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1670 struct btrfs_ordered_extent *ordered_extent = NULL; 1685 struct btrfs_ordered_extent *ordered_extent = NULL;
1671 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1686 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1672 struct extent_state *cached_state = NULL; 1687 struct extent_state *cached_state = NULL;
1673 int compressed = 0; 1688 int compress_type = 0;
1674 int ret; 1689 int ret;
1675 bool nolock = false; 1690 bool nolock = false;
1676 1691
@@ -1690,7 +1705,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1690 trans = btrfs_join_transaction_nolock(root, 1); 1705 trans = btrfs_join_transaction_nolock(root, 1);
1691 else 1706 else
1692 trans = btrfs_join_transaction(root, 1); 1707 trans = btrfs_join_transaction(root, 1);
1693 BUG_ON(!trans); 1708 BUG_ON(IS_ERR(trans));
1694 btrfs_set_trans_block_group(trans, inode); 1709 btrfs_set_trans_block_group(trans, inode);
1695 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1710 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1696 ret = btrfs_update_inode(trans, root, inode); 1711 ret = btrfs_update_inode(trans, root, inode);
@@ -1707,13 +1722,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1707 trans = btrfs_join_transaction_nolock(root, 1); 1722 trans = btrfs_join_transaction_nolock(root, 1);
1708 else 1723 else
1709 trans = btrfs_join_transaction(root, 1); 1724 trans = btrfs_join_transaction(root, 1);
1725 BUG_ON(IS_ERR(trans));
1710 btrfs_set_trans_block_group(trans, inode); 1726 btrfs_set_trans_block_group(trans, inode);
1711 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1727 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1712 1728
1713 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1729 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1714 compressed = 1; 1730 compress_type = ordered_extent->compress_type;
1715 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1731 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1716 BUG_ON(compressed); 1732 BUG_ON(compress_type);
1717 ret = btrfs_mark_extent_written(trans, inode, 1733 ret = btrfs_mark_extent_written(trans, inode,
1718 ordered_extent->file_offset, 1734 ordered_extent->file_offset,
1719 ordered_extent->file_offset + 1735 ordered_extent->file_offset +
@@ -1727,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1727 ordered_extent->disk_len, 1743 ordered_extent->disk_len,
1728 ordered_extent->len, 1744 ordered_extent->len,
1729 ordered_extent->len, 1745 ordered_extent->len,
1730 compressed, 0, 0, 1746 compress_type, 0, 0,
1731 BTRFS_FILE_EXTENT_REG); 1747 BTRFS_FILE_EXTENT_REG);
1732 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1748 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1733 ordered_extent->file_offset, 1749 ordered_extent->file_offset,
@@ -1829,6 +1845,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1829 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 1845 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1830 logical = em->block_start; 1846 logical = em->block_start;
1831 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 1847 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1848 extent_set_compress_type(&failrec->bio_flags,
1849 em->compress_type);
1832 } 1850 }
1833 failrec->logical = logical; 1851 failrec->logical = logical;
1834 free_extent_map(em); 1852 free_extent_map(em);
@@ -2339,6 +2357,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2339 */ 2357 */
2340 if (is_bad_inode(inode)) { 2358 if (is_bad_inode(inode)) {
2341 trans = btrfs_start_transaction(root, 0); 2359 trans = btrfs_start_transaction(root, 0);
2360 BUG_ON(IS_ERR(trans));
2342 btrfs_orphan_del(trans, inode); 2361 btrfs_orphan_del(trans, inode);
2343 btrfs_end_transaction(trans, root); 2362 btrfs_end_transaction(trans, root);
2344 iput(inode); 2363 iput(inode);
@@ -2366,6 +2385,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2366 2385
2367 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2386 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2368 trans = btrfs_join_transaction(root, 1); 2387 trans = btrfs_join_transaction(root, 1);
2388 BUG_ON(IS_ERR(trans));
2369 btrfs_end_transaction(trans, root); 2389 btrfs_end_transaction(trans, root);
2370 } 2390 }
2371 2391
@@ -2626,7 +2646,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2626 path = btrfs_alloc_path(); 2646 path = btrfs_alloc_path();
2627 if (!path) { 2647 if (!path) {
2628 ret = -ENOMEM; 2648 ret = -ENOMEM;
2629 goto err; 2649 goto out;
2630 } 2650 }
2631 2651
2632 path->leave_spinning = 1; 2652 path->leave_spinning = 1;
@@ -2699,9 +2719,10 @@ static int check_path_shared(struct btrfs_root *root,
2699 struct extent_buffer *eb; 2719 struct extent_buffer *eb;
2700 int level; 2720 int level;
2701 u64 refs = 1; 2721 u64 refs = 1;
2702 int uninitialized_var(ret);
2703 2722
2704 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2723 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2724 int ret;
2725
2705 if (!path->nodes[level]) 2726 if (!path->nodes[level])
2706 break; 2727 break;
2707 eb = path->nodes[level]; 2728 eb = path->nodes[level];
@@ -2712,7 +2733,7 @@ static int check_path_shared(struct btrfs_root *root,
2712 if (refs > 1) 2733 if (refs > 1)
2713 return 1; 2734 return 1;
2714 } 2735 }
2715 return ret; /* XXX callers? */ 2736 return 0;
2716} 2737}
2717 2738
2718/* 2739/*
@@ -3671,8 +3692,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3671static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3692static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3672{ 3693{
3673 struct inode *inode = dentry->d_inode; 3694 struct inode *inode = dentry->d_inode;
3695 struct btrfs_root *root = BTRFS_I(inode)->root;
3674 int err; 3696 int err;
3675 3697
3698 if (btrfs_root_readonly(root))
3699 return -EROFS;
3700
3676 err = inode_change_ok(inode, attr); 3701 err = inode_change_ok(inode, attr);
3677 if (err) 3702 if (err)
3678 return err; 3703 return err;
@@ -4115,7 +4140,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4115 } 4140 }
4116 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4141 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4117 4142
4118 if (root != sub_root) { 4143 if (!IS_ERR(inode) && root != sub_root) {
4119 down_read(&root->fs_info->cleanup_work_sem); 4144 down_read(&root->fs_info->cleanup_work_sem);
4120 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4145 if (!(inode->i_sb->s_flags & MS_RDONLY))
4121 btrfs_orphan_cleanup(sub_root); 4146 btrfs_orphan_cleanup(sub_root);
@@ -4328,6 +4353,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4328 trans = btrfs_join_transaction_nolock(root, 1); 4353 trans = btrfs_join_transaction_nolock(root, 1);
4329 else 4354 else
4330 trans = btrfs_join_transaction(root, 1); 4355 trans = btrfs_join_transaction(root, 1);
4356 if (IS_ERR(trans))
4357 return PTR_ERR(trans);
4331 btrfs_set_trans_block_group(trans, inode); 4358 btrfs_set_trans_block_group(trans, inode);
4332 if (nolock) 4359 if (nolock)
4333 ret = btrfs_end_transaction_nolock(trans, root); 4360 ret = btrfs_end_transaction_nolock(trans, root);
@@ -4353,6 +4380,7 @@ void btrfs_dirty_inode(struct inode *inode)
4353 return; 4380 return;
4354 4381
4355 trans = btrfs_join_transaction(root, 1); 4382 trans = btrfs_join_transaction(root, 1);
4383 BUG_ON(IS_ERR(trans));
4356 btrfs_set_trans_block_group(trans, inode); 4384 btrfs_set_trans_block_group(trans, inode);
4357 4385
4358 ret = btrfs_update_inode(trans, root, inode); 4386 ret = btrfs_update_inode(trans, root, inode);
@@ -4928,8 +4956,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4928 size_t max_size; 4956 size_t max_size;
4929 unsigned long inline_size; 4957 unsigned long inline_size;
4930 unsigned long ptr; 4958 unsigned long ptr;
4959 int compress_type;
4931 4960
4932 WARN_ON(pg_offset != 0); 4961 WARN_ON(pg_offset != 0);
4962 compress_type = btrfs_file_extent_compression(leaf, item);
4933 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4963 max_size = btrfs_file_extent_ram_bytes(leaf, item);
4934 inline_size = btrfs_file_extent_inline_item_len(leaf, 4964 inline_size = btrfs_file_extent_inline_item_len(leaf,
4935 btrfs_item_nr(leaf, path->slots[0])); 4965 btrfs_item_nr(leaf, path->slots[0]));
@@ -4939,8 +4969,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4939 read_extent_buffer(leaf, tmp, ptr, inline_size); 4969 read_extent_buffer(leaf, tmp, ptr, inline_size);
4940 4970
4941 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4971 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4942 ret = btrfs_zlib_decompress(tmp, page, extent_offset, 4972 ret = btrfs_decompress(compress_type, tmp, page,
4943 inline_size, max_size); 4973 extent_offset, inline_size, max_size);
4944 if (ret) { 4974 if (ret) {
4945 char *kaddr = kmap_atomic(page, KM_USER0); 4975 char *kaddr = kmap_atomic(page, KM_USER0);
4946 unsigned long copy_size = min_t(u64, 4976 unsigned long copy_size = min_t(u64,
@@ -4982,7 +5012,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4982 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5012 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4983 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5013 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4984 struct btrfs_trans_handle *trans = NULL; 5014 struct btrfs_trans_handle *trans = NULL;
4985 int compressed; 5015 int compress_type;
4986 5016
4987again: 5017again:
4988 read_lock(&em_tree->lock); 5018 read_lock(&em_tree->lock);
@@ -5041,7 +5071,7 @@ again:
5041 5071
5042 found_type = btrfs_file_extent_type(leaf, item); 5072 found_type = btrfs_file_extent_type(leaf, item);
5043 extent_start = found_key.offset; 5073 extent_start = found_key.offset;
5044 compressed = btrfs_file_extent_compression(leaf, item); 5074 compress_type = btrfs_file_extent_compression(leaf, item);
5045 if (found_type == BTRFS_FILE_EXTENT_REG || 5075 if (found_type == BTRFS_FILE_EXTENT_REG ||
5046 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5076 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5047 extent_end = extent_start + 5077 extent_end = extent_start +
@@ -5087,8 +5117,9 @@ again:
5087 em->block_start = EXTENT_MAP_HOLE; 5117 em->block_start = EXTENT_MAP_HOLE;
5088 goto insert; 5118 goto insert;
5089 } 5119 }
5090 if (compressed) { 5120 if (compress_type != BTRFS_COMPRESS_NONE) {
5091 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5121 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5122 em->compress_type = compress_type;
5092 em->block_start = bytenr; 5123 em->block_start = bytenr;
5093 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5124 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
5094 item); 5125 item);
@@ -5122,12 +5153,14 @@ again:
5122 em->len = (copy_size + root->sectorsize - 1) & 5153 em->len = (copy_size + root->sectorsize - 1) &
5123 ~((u64)root->sectorsize - 1); 5154 ~((u64)root->sectorsize - 1);
5124 em->orig_start = EXTENT_MAP_INLINE; 5155 em->orig_start = EXTENT_MAP_INLINE;
5125 if (compressed) 5156 if (compress_type) {
5126 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5157 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5158 em->compress_type = compress_type;
5159 }
5127 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5160 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5128 if (create == 0 && !PageUptodate(page)) { 5161 if (create == 0 && !PageUptodate(page)) {
5129 if (btrfs_file_extent_compression(leaf, item) == 5162 if (btrfs_file_extent_compression(leaf, item) !=
5130 BTRFS_COMPRESS_ZLIB) { 5163 BTRFS_COMPRESS_NONE) {
5131 ret = uncompress_inline(path, inode, page, 5164 ret = uncompress_inline(path, inode, page,
5132 pg_offset, 5165 pg_offset,
5133 extent_offset, item); 5166 extent_offset, item);
@@ -5152,6 +5185,8 @@ again:
5152 em = NULL; 5185 em = NULL;
5153 btrfs_release_path(root, path); 5186 btrfs_release_path(root, path);
5154 trans = btrfs_join_transaction(root, 1); 5187 trans = btrfs_join_transaction(root, 1);
5188 if (IS_ERR(trans))
5189 return ERR_CAST(trans);
5155 goto again; 5190 goto again;
5156 } 5191 }
5157 map = kmap(page); 5192 map = kmap(page);
@@ -5256,8 +5291,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5256 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5291 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5257 5292
5258 trans = btrfs_join_transaction(root, 0); 5293 trans = btrfs_join_transaction(root, 0);
5259 if (!trans) 5294 if (IS_ERR(trans))
5260 return ERR_PTR(-ENOMEM); 5295 return ERR_CAST(trans);
5261 5296
5262 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5297 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5263 5298
@@ -5481,7 +5516,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5481 * while we look for nocow cross refs 5516 * while we look for nocow cross refs
5482 */ 5517 */
5483 trans = btrfs_join_transaction(root, 0); 5518 trans = btrfs_join_transaction(root, 0);
5484 if (!trans) 5519 if (IS_ERR(trans))
5485 goto must_cow; 5520 goto must_cow;
5486 5521
5487 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5522 if (can_nocow_odirect(trans, inode, start, len) == 1) {
@@ -5616,7 +5651,7 @@ again:
5616 BUG_ON(!ordered); 5651 BUG_ON(!ordered);
5617 5652
5618 trans = btrfs_join_transaction(root, 1); 5653 trans = btrfs_join_transaction(root, 1);
5619 if (!trans) { 5654 if (IS_ERR(trans)) {
5620 err = -ENOMEM; 5655 err = -ENOMEM;
5621 goto out; 5656 goto out;
5622 } 5657 }
@@ -6477,7 +6512,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6477 ei->ordered_data_close = 0; 6512 ei->ordered_data_close = 0;
6478 ei->orphan_meta_reserved = 0; 6513 ei->orphan_meta_reserved = 0;
6479 ei->dummy_inode = 0; 6514 ei->dummy_inode = 0;
6480 ei->force_compress = 0; 6515 ei->force_compress = BTRFS_COMPRESS_NONE;
6481 6516
6482 inode = &ei->vfs_inode; 6517 inode = &ei->vfs_inode;
6483 extent_map_tree_init(&ei->extent_tree, GFP_NOFS); 6518 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
@@ -7098,116 +7133,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
7098 min_size, actual_len, alloc_hint, trans); 7133 min_size, actual_len, alloc_hint, trans);
7099} 7134}
7100 7135
7101static long btrfs_fallocate(struct inode *inode, int mode,
7102 loff_t offset, loff_t len)
7103{
7104 struct extent_state *cached_state = NULL;
7105 u64 cur_offset;
7106 u64 last_byte;
7107 u64 alloc_start;
7108 u64 alloc_end;
7109 u64 alloc_hint = 0;
7110 u64 locked_end;
7111 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
7112 struct extent_map *em;
7113 int ret;
7114
7115 alloc_start = offset & ~mask;
7116 alloc_end = (offset + len + mask) & ~mask;
7117
7118 /* We only support the FALLOC_FL_KEEP_SIZE mode */
7119 if (mode && (mode != FALLOC_FL_KEEP_SIZE))
7120 return -EOPNOTSUPP;
7121
7122 /*
7123 * wait for ordered IO before we have any locks. We'll loop again
7124 * below with the locks held.
7125 */
7126 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
7127
7128 mutex_lock(&inode->i_mutex);
7129 ret = inode_newsize_ok(inode, alloc_end);
7130 if (ret)
7131 goto out;
7132
7133 if (alloc_start > inode->i_size) {
7134 ret = btrfs_cont_expand(inode, alloc_start);
7135 if (ret)
7136 goto out;
7137 }
7138
7139 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
7140 if (ret)
7141 goto out;
7142
7143 locked_end = alloc_end - 1;
7144 while (1) {
7145 struct btrfs_ordered_extent *ordered;
7146
7147 /* the extent lock is ordered inside the running
7148 * transaction
7149 */
7150 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
7151 locked_end, 0, &cached_state, GFP_NOFS);
7152 ordered = btrfs_lookup_first_ordered_extent(inode,
7153 alloc_end - 1);
7154 if (ordered &&
7155 ordered->file_offset + ordered->len > alloc_start &&
7156 ordered->file_offset < alloc_end) {
7157 btrfs_put_ordered_extent(ordered);
7158 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
7159 alloc_start, locked_end,
7160 &cached_state, GFP_NOFS);
7161 /*
7162 * we can't wait on the range with the transaction
7163 * running or with the extent lock held
7164 */
7165 btrfs_wait_ordered_range(inode, alloc_start,
7166 alloc_end - alloc_start);
7167 } else {
7168 if (ordered)
7169 btrfs_put_ordered_extent(ordered);
7170 break;
7171 }
7172 }
7173
7174 cur_offset = alloc_start;
7175 while (1) {
7176 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
7177 alloc_end - cur_offset, 0);
7178 BUG_ON(IS_ERR(em) || !em);
7179 last_byte = min(extent_map_end(em), alloc_end);
7180 last_byte = (last_byte + mask) & ~mask;
7181 if (em->block_start == EXTENT_MAP_HOLE ||
7182 (cur_offset >= inode->i_size &&
7183 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7184 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
7185 last_byte - cur_offset,
7186 1 << inode->i_blkbits,
7187 offset + len,
7188 &alloc_hint);
7189 if (ret < 0) {
7190 free_extent_map(em);
7191 break;
7192 }
7193 }
7194 free_extent_map(em);
7195
7196 cur_offset = last_byte;
7197 if (cur_offset >= alloc_end) {
7198 ret = 0;
7199 break;
7200 }
7201 }
7202 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
7203 &cached_state, GFP_NOFS);
7204
7205 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
7206out:
7207 mutex_unlock(&inode->i_mutex);
7208 return ret;
7209}
7210
7211static int btrfs_set_page_dirty(struct page *page) 7136static int btrfs_set_page_dirty(struct page *page)
7212{ 7137{
7213 return __set_page_dirty_nobuffers(page); 7138 return __set_page_dirty_nobuffers(page);
@@ -7215,6 +7140,10 @@ static int btrfs_set_page_dirty(struct page *page)
7215 7140
7216static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) 7141static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
7217{ 7142{
7143 struct btrfs_root *root = BTRFS_I(inode)->root;
7144
7145 if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
7146 return -EROFS;
7218 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7147 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
7219 return -EACCES; 7148 return -EACCES;
7220 return generic_permission(inode, mask, flags, btrfs_check_acl); 7149 return generic_permission(inode, mask, flags, btrfs_check_acl);
@@ -7310,7 +7239,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
7310 .listxattr = btrfs_listxattr, 7239 .listxattr = btrfs_listxattr,
7311 .removexattr = btrfs_removexattr, 7240 .removexattr = btrfs_removexattr,
7312 .permission = btrfs_permission, 7241 .permission = btrfs_permission,
7313 .fallocate = btrfs_fallocate,
7314 .fiemap = btrfs_fiemap, 7242 .fiemap = btrfs_fiemap,
7315}; 7243};
7316static const struct inode_operations btrfs_special_inode_operations = { 7244static const struct inode_operations btrfs_special_inode_operations = {
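
Together, the btrfs_setattr() and btrfs_permission() hooks above make a read-only subvolume refuse writes with EROFS even when the filesystem itself is mounted read-write. A hypothetical userspace view (the path is illustrative):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	int fd = open("/mnt/btrfs/ro-snap/file", O_WRONLY);

	if (fd < 0 && errno == EROFS)
		printf("subvolume is read-only: %s\n", strerror(errno));
	return 0;
}
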
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f87552a1d7ea..02d224e8c83f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
147 unsigned int flags, oldflags; 147 unsigned int flags, oldflags;
148 int ret; 148 int ret;
149 149
150 if (btrfs_root_readonly(root))
151 return -EROFS;
152
150 if (copy_from_user(&flags, arg, sizeof(flags))) 153 if (copy_from_user(&flags, arg, sizeof(flags)))
151 return -EFAULT; 154 return -EFAULT;
152 155
@@ -200,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
200 203
201 204
202 trans = btrfs_join_transaction(root, 1); 205 trans = btrfs_join_transaction(root, 1);
203 BUG_ON(!trans); 206 BUG_ON(IS_ERR(trans));
204 207
205 ret = btrfs_update_inode(trans, root, inode); 208 ret = btrfs_update_inode(trans, root, inode);
206 BUG_ON(ret); 209 BUG_ON(ret);
@@ -360,7 +363,8 @@ fail:
360} 363}
361 364
362static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 365static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
363 char *name, int namelen, u64 *async_transid) 366 char *name, int namelen, u64 *async_transid,
367 bool readonly)
364{ 368{
365 struct inode *inode; 369 struct inode *inode;
366 struct dentry *parent; 370 struct dentry *parent;
@@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
378 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 382 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 pending_snapshot->dentry = dentry; 383 pending_snapshot->dentry = dentry;
380 pending_snapshot->root = root; 384 pending_snapshot->root = root;
385 pending_snapshot->readonly = readonly;
381 386
382 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 387 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
383 if (IS_ERR(trans)) { 388 if (IS_ERR(trans)) {
@@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
509static noinline int btrfs_mksubvol(struct path *parent, 514static noinline int btrfs_mksubvol(struct path *parent,
510 char *name, int namelen, 515 char *name, int namelen,
511 struct btrfs_root *snap_src, 516 struct btrfs_root *snap_src,
512 u64 *async_transid) 517 u64 *async_transid, bool readonly)
513{ 518{
514 struct inode *dir = parent->dentry->d_inode; 519 struct inode *dir = parent->dentry->d_inode;
515 struct dentry *dentry; 520 struct dentry *dentry;
@@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
541 546
542 if (snap_src) { 547 if (snap_src) {
543 error = create_snapshot(snap_src, dentry, 548 error = create_snapshot(snap_src, dentry,
544 name, namelen, async_transid); 549 name, namelen, async_transid, readonly);
545 } else { 550 } else {
546 error = create_subvol(BTRFS_I(dir)->root, dentry, 551 error = create_subvol(BTRFS_I(dir)->root, dentry,
547 name, namelen, async_transid); 552 name, namelen, async_transid);
@@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file,
638 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 643 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
639 struct btrfs_ordered_extent *ordered; 644 struct btrfs_ordered_extent *ordered;
640 struct page *page; 645 struct page *page;
646 struct btrfs_super_block *disk_super;
641 unsigned long last_index; 647 unsigned long last_index;
642 unsigned long ra_pages = root->fs_info->bdi.ra_pages; 648 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
643 unsigned long total_read = 0; 649 unsigned long total_read = 0;
650 u64 features;
644 u64 page_start; 651 u64 page_start;
645 u64 page_end; 652 u64 page_end;
646 u64 last_len = 0; 653 u64 last_len = 0;
@@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file,
648 u64 defrag_end = 0; 655 u64 defrag_end = 0;
649 unsigned long i; 656 unsigned long i;
650 int ret; 657 int ret;
658 int compress_type = BTRFS_COMPRESS_ZLIB;
659
660 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
661 if (range->compress_type > BTRFS_COMPRESS_TYPES)
662 return -EINVAL;
663 if (range->compress_type)
664 compress_type = range->compress_type;
665 }
651 666
652 if (inode->i_size == 0) 667 if (inode->i_size == 0)
653 return 0; 668 return 0;
@@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file,
683 total_read++; 698 total_read++;
684 mutex_lock(&inode->i_mutex); 699 mutex_lock(&inode->i_mutex);
685 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 700 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
686 BTRFS_I(inode)->force_compress = 1; 701 BTRFS_I(inode)->force_compress = compress_type;
687 702
688 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 703 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
689 if (ret) 704 if (ret)
@@ -781,10 +796,17 @@ loop_unlock:
781 atomic_dec(&root->fs_info->async_submit_draining); 796 atomic_dec(&root->fs_info->async_submit_draining);
782 797
783 mutex_lock(&inode->i_mutex); 798 mutex_lock(&inode->i_mutex);
784 BTRFS_I(inode)->force_compress = 0; 799 BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
785 mutex_unlock(&inode->i_mutex); 800 mutex_unlock(&inode->i_mutex);
786 } 801 }
787 802
803 disk_super = &root->fs_info->super_copy;
804 features = btrfs_super_incompat_flags(disk_super);
805 if (range->compress_type == BTRFS_COMPRESS_LZO) {
806 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
807 btrfs_set_super_incompat_flags(disk_super, features);
808 }
809
788 return 0; 810 return 0;
789 811
790err_reservations: 812err_reservations:
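
With the compress_type field added to btrfs_ioctl_defrag_range_args (see the ioctl.h hunk below), userspace can ask for LZO recompression during a range defrag. A sketch, assuming the BTRFS_COMPRESS_* values from this series' compression.h are made visible to the caller:

#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"		/* btrfs_ioctl_defrag_range_args et al. */

#define BTRFS_COMPRESS_LZO	2	/* from compression.h in this series */

static int defrag_to_lzo(int fd)
{
	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.len = (__u64)-1;			/* whole file */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
	range.compress_type = BTRFS_COMPRESS_LZO; /* 0 keeps the zlib default */

	return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
}
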
@@ -885,6 +907,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
885 907
886 if (new_size > old_size) { 908 if (new_size > old_size) {
887 trans = btrfs_start_transaction(root, 0); 909 trans = btrfs_start_transaction(root, 0);
910 if (IS_ERR(trans)) {
911 ret = PTR_ERR(trans);
912 goto out_unlock;
913 }
888 ret = btrfs_grow_device(trans, device, new_size); 914 ret = btrfs_grow_device(trans, device, new_size);
889 btrfs_commit_transaction(trans, root); 915 btrfs_commit_transaction(trans, root);
890 } else { 916 } else {
@@ -901,7 +927,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
901 char *name, 927 char *name,
902 unsigned long fd, 928 unsigned long fd,
903 int subvol, 929 int subvol,
904 u64 *transid) 930 u64 *transid,
931 bool readonly)
905{ 932{
906 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 933 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
907 struct file *src_file; 934 struct file *src_file;
@@ -919,7 +946,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
919 946
920 if (subvol) { 947 if (subvol) {
921 ret = btrfs_mksubvol(&file->f_path, name, namelen, 948 ret = btrfs_mksubvol(&file->f_path, name, namelen,
922 NULL, transid); 949 NULL, transid, readonly);
923 } else { 950 } else {
924 struct inode *src_inode; 951 struct inode *src_inode;
925 src_file = fget(fd); 952 src_file = fget(fd);
@@ -938,7 +965,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
938 } 965 }
939 ret = btrfs_mksubvol(&file->f_path, name, namelen, 966 ret = btrfs_mksubvol(&file->f_path, name, namelen,
940 BTRFS_I(src_inode)->root, 967 BTRFS_I(src_inode)->root,
941 transid); 968 transid, readonly);
942 fput(src_file); 969 fput(src_file);
943 } 970 }
944out: 971out:
@@ -946,58 +973,139 @@ out:
946} 973}
947 974
948static noinline int btrfs_ioctl_snap_create(struct file *file, 975static noinline int btrfs_ioctl_snap_create(struct file *file,
949 void __user *arg, int subvol, 976 void __user *arg, int subvol)
950 int v2)
951{ 977{
952 struct btrfs_ioctl_vol_args *vol_args = NULL; 978 struct btrfs_ioctl_vol_args *vol_args;
953 struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
954 char *name;
955 u64 fd;
956 int ret; 979 int ret;
957 980
958 if (v2) { 981 vol_args = memdup_user(arg, sizeof(*vol_args));
959 u64 transid = 0; 982 if (IS_ERR(vol_args))
960 u64 *ptr = NULL; 983 return PTR_ERR(vol_args);
961 984 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
962 vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
963 if (IS_ERR(vol_args_v2))
964 return PTR_ERR(vol_args_v2);
965 985
966 if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { 986 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
967 ret = -EINVAL; 987 vol_args->fd, subvol,
968 goto out; 988 NULL, false);
969 }
970 989
971 name = vol_args_v2->name; 990 kfree(vol_args);
972 fd = vol_args_v2->fd; 991 return ret;
973 vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; 992}
974 993
975 if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) 994static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
976 ptr = &transid; 995 void __user *arg, int subvol)
996{
997 struct btrfs_ioctl_vol_args_v2 *vol_args;
998 int ret;
999 u64 transid = 0;
1000 u64 *ptr = NULL;
1001 bool readonly = false;
977 1002
978 ret = btrfs_ioctl_snap_create_transid(file, name, fd, 1003 vol_args = memdup_user(arg, sizeof(*vol_args));
979 subvol, ptr); 1004 if (IS_ERR(vol_args))
1005 return PTR_ERR(vol_args);
1006 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
980 1007
981 if (ret == 0 && ptr && 1008 if (vol_args->flags &
982 copy_to_user(arg + 1009 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
983 offsetof(struct btrfs_ioctl_vol_args_v2, 1010 ret = -EOPNOTSUPP;
984 transid), ptr, sizeof(*ptr))) 1011 goto out;
985 ret = -EFAULT;
986 } else {
987 vol_args = memdup_user(arg, sizeof(*vol_args));
988 if (IS_ERR(vol_args))
989 return PTR_ERR(vol_args);
990 name = vol_args->name;
991 fd = vol_args->fd;
992 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
993
994 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
995 subvol, NULL);
996 } 1012 }
1013
1014 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1015 ptr = &transid;
1016 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1017 readonly = true;
1018
1019 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1020 vol_args->fd, subvol,
1021 ptr, readonly);
1022
1023 if (ret == 0 && ptr &&
1024 copy_to_user(arg +
1025 offsetof(struct btrfs_ioctl_vol_args_v2,
1026 transid), ptr, sizeof(*ptr)))
1027 ret = -EFAULT;
997out: 1028out:
998 kfree(vol_args); 1029 kfree(vol_args);
999 kfree(vol_args_v2); 1030 return ret;
1031}
1032
1033static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1034 void __user *arg)
1035{
1036 struct inode *inode = fdentry(file)->d_inode;
1037 struct btrfs_root *root = BTRFS_I(inode)->root;
1038 int ret = 0;
1039 u64 flags = 0;
1040
1041 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1042 return -EINVAL;
1043
1044 down_read(&root->fs_info->subvol_sem);
1045 if (btrfs_root_readonly(root))
1046 flags |= BTRFS_SUBVOL_RDONLY;
1047 up_read(&root->fs_info->subvol_sem);
1048
1049 if (copy_to_user(arg, &flags, sizeof(flags)))
1050 ret = -EFAULT;
1051
1052 return ret;
1053}
1054
1055static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1056 void __user *arg)
1057{
1058 struct inode *inode = fdentry(file)->d_inode;
1059 struct btrfs_root *root = BTRFS_I(inode)->root;
1060 struct btrfs_trans_handle *trans;
1061 u64 root_flags;
1062 u64 flags;
1063 int ret = 0;
1064
1065 if (root->fs_info->sb->s_flags & MS_RDONLY)
1066 return -EROFS;
1067
1068 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1069 return -EINVAL;
1070
1071 if (copy_from_user(&flags, arg, sizeof(flags)))
1072 return -EFAULT;
1073
1074 if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
1075 return -EINVAL;
1076
1077 if (flags & ~BTRFS_SUBVOL_RDONLY)
1078 return -EOPNOTSUPP;
1079
1080 down_write(&root->fs_info->subvol_sem);
1000 1081
1082 /* nothing to do */
1083 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1084 goto out;
1085
1086 root_flags = btrfs_root_flags(&root->root_item);
1087 if (flags & BTRFS_SUBVOL_RDONLY)
1088 btrfs_set_root_flags(&root->root_item,
1089 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1090 else
1091 btrfs_set_root_flags(&root->root_item,
1092 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1093
1094 trans = btrfs_start_transaction(root, 1);
1095 if (IS_ERR(trans)) {
1096 ret = PTR_ERR(trans);
1097 goto out_reset;
1098 }
1099
1100 ret = btrfs_update_root(trans, root,
1101 &root->root_key, &root->root_item);
1102
1103 btrfs_commit_transaction(trans, root);
1104out_reset:
1105 if (ret)
1106 btrfs_set_root_flags(&root->root_item, root_flags);
1107out:
1108 up_write(&root->fs_info->subvol_sem);
1001 return ret; 1109 return ret;
1002} 1110}
1003 1111
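
A minimal userspace sketch of the new flag pair (fd must be an open descriptor on the subvolume's root directory; error handling trimmed):

#include <sys/ioctl.h>
#include "ioctl.h"		/* BTRFS_IOC_SUBVOL_{GET,SET}FLAGS */

static int subvol_set_ro(int fd, int ro)
{
	__u64 flags = 0;

	if (ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags) < 0)
		return -1;
	if (ro)
		flags |= BTRFS_SUBVOL_RDONLY;
	else
		flags &= ~BTRFS_SUBVOL_RDONLY;
	return ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &flags);
}
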
@@ -1509,6 +1617,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1509 struct btrfs_ioctl_defrag_range_args *range; 1617 struct btrfs_ioctl_defrag_range_args *range;
1510 int ret; 1618 int ret;
1511 1619
1620 if (btrfs_root_readonly(root))
1621 return -EROFS;
1622
1512 ret = mnt_want_write(file->f_path.mnt); 1623 ret = mnt_want_write(file->f_path.mnt);
1513 if (ret) 1624 if (ret)
1514 return ret; 1625 return ret;
@@ -1637,6 +1748,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1637 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 1748 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1638 return -EINVAL; 1749 return -EINVAL;
1639 1750
1751 if (btrfs_root_readonly(root))
1752 return -EROFS;
1753
1640 ret = mnt_want_write(file->f_path.mnt); 1754 ret = mnt_want_write(file->f_path.mnt);
1641 if (ret) 1755 if (ret)
1642 return ret; 1756 return ret;
@@ -1788,7 +1902,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1788 1902
1789 memcpy(&new_key, &key, sizeof(new_key)); 1903 memcpy(&new_key, &key, sizeof(new_key));
1790 new_key.objectid = inode->i_ino; 1904 new_key.objectid = inode->i_ino;
1791 new_key.offset = key.offset + destoff - off; 1905 if (off <= key.offset)
1906 new_key.offset = key.offset + destoff - off;
1907 else
1908 new_key.offset = destoff;
1792 1909
1793 trans = btrfs_start_transaction(root, 1); 1910 trans = btrfs_start_transaction(root, 1);
1794 if (IS_ERR(trans)) { 1911 if (IS_ERR(trans)) {
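
Worked example for the new_key.offset fix above, with hypothetical numbers: cloning from off = 8192 inside an extent whose item key sits at key.offset = 0, into destoff = 0. The old unconditional formula computed key.offset + destoff - off = 0 + 0 - 8192, which wraps around as u64 and corrupts the destination key; since off > key.offset means the clone starts mid-extent, the fixed code pins new_key.offset to destoff instead.
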
@@ -1958,6 +2075,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
1958 if (file->private_data) 2075 if (file->private_data)
1959 goto out; 2076 goto out;
1960 2077
2078 ret = -EROFS;
2079 if (btrfs_root_readonly(root))
2080 goto out;
2081
1961 ret = mnt_want_write(file->f_path.mnt); 2082 ret = mnt_want_write(file->f_path.mnt);
1962 if (ret) 2083 if (ret)
1963 goto out; 2084 goto out;
@@ -1968,7 +2089,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
1968 2089
1969 ret = -ENOMEM; 2090 ret = -ENOMEM;
1970 trans = btrfs_start_ioctl_transaction(root, 0); 2091 trans = btrfs_start_ioctl_transaction(root, 0);
1971 if (!trans) 2092 if (IS_ERR(trans))
1972 goto out_drop; 2093 goto out_drop;
1973 2094
1974 file->private_data = trans; 2095 file->private_data = trans;
@@ -2024,9 +2145,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2024 path->leave_spinning = 1; 2145 path->leave_spinning = 1;
2025 2146
2026 trans = btrfs_start_transaction(root, 1); 2147 trans = btrfs_start_transaction(root, 1);
2027 if (!trans) { 2148 if (IS_ERR(trans)) {
2028 btrfs_free_path(path); 2149 btrfs_free_path(path);
2029 return -ENOMEM; 2150 return PTR_ERR(trans);
2030 } 2151 }
2031 2152
2032 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2153 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
@@ -2220,6 +2341,8 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp
2220 u64 transid; 2341 u64 transid;
2221 2342
2222 trans = btrfs_start_transaction(root, 0); 2343 trans = btrfs_start_transaction(root, 0);
2344 if (IS_ERR(trans))
2345 return PTR_ERR(trans);
2223 transid = trans->transid; 2346 transid = trans->transid;
2224 btrfs_commit_transaction_async(trans, root, 0); 2347 btrfs_commit_transaction_async(trans, root, 0);
2225 2348
@@ -2257,13 +2380,17 @@ long btrfs_ioctl(struct file *file, unsigned int
2257 case FS_IOC_GETVERSION: 2380 case FS_IOC_GETVERSION:
2258 return btrfs_ioctl_getversion(file, argp); 2381 return btrfs_ioctl_getversion(file, argp);
2259 case BTRFS_IOC_SNAP_CREATE: 2382 case BTRFS_IOC_SNAP_CREATE:
2260 return btrfs_ioctl_snap_create(file, argp, 0, 0); 2383 return btrfs_ioctl_snap_create(file, argp, 0);
2261 case BTRFS_IOC_SNAP_CREATE_V2: 2384 case BTRFS_IOC_SNAP_CREATE_V2:
2262 return btrfs_ioctl_snap_create(file, argp, 0, 1); 2385 return btrfs_ioctl_snap_create_v2(file, argp, 0);
2263 case BTRFS_IOC_SUBVOL_CREATE: 2386 case BTRFS_IOC_SUBVOL_CREATE:
2264 return btrfs_ioctl_snap_create(file, argp, 1, 0); 2387 return btrfs_ioctl_snap_create(file, argp, 1);
2265 case BTRFS_IOC_SNAP_DESTROY: 2388 case BTRFS_IOC_SNAP_DESTROY:
2266 return btrfs_ioctl_snap_destroy(file, argp); 2389 return btrfs_ioctl_snap_destroy(file, argp);
2390 case BTRFS_IOC_SUBVOL_GETFLAGS:
2391 return btrfs_ioctl_subvol_getflags(file, argp);
2392 case BTRFS_IOC_SUBVOL_SETFLAGS:
2393 return btrfs_ioctl_subvol_setflags(file, argp);
2267 case BTRFS_IOC_DEFAULT_SUBVOL: 2394 case BTRFS_IOC_DEFAULT_SUBVOL:
2268 return btrfs_ioctl_default_subvol(file, argp); 2395 return btrfs_ioctl_default_subvol(file, argp);
2269 case BTRFS_IOC_DEFRAG: 2396 case BTRFS_IOC_DEFRAG:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index c344d12c646b..8fb382167b13 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args {
31}; 31};
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
34 35
35#define BTRFS_SUBVOL_NAME_MAX 4039 36#define BTRFS_SUBVOL_NAME_MAX 4039
36struct btrfs_ioctl_vol_args_v2 { 37struct btrfs_ioctl_vol_args_v2 {
@@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
133 */ 134 */
134 __u32 extent_thresh; 135 __u32 extent_thresh;
135 136
137 /*
138 * which compression method to use if turning on compression
139 * for this defrag operation. If unspecified, zlib will
140 * be used
141 */
142 __u32 compress_type;
143
136 /* spare for later */ 144 /* spare for later */
137 __u32 unused[5]; 145 __u32 unused[4];
138}; 146};
139 147
140struct btrfs_ioctl_space_info { 148struct btrfs_ioctl_space_info {
@@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args {
193#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 201#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
194#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ 202#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
195 struct btrfs_ioctl_vol_args_v2) 203 struct btrfs_ioctl_vol_args_v2)
204#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
205#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
196#endif 206#endif
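
Creating a read-only snapshot through the extended v2 ioctl then looks like this sketch (destdirfd is an open fd on the destination directory, srcfd on the source subvolume; error handling trimmed):

#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"

static int snapshot_ro(int destdirfd, int srcfd, const char *name)
{
	struct btrfs_ioctl_vol_args_v2 args;

	memset(&args, 0, sizeof(args));
	args.fd = srcfd;
	args.flags = BTRFS_SUBVOL_RDONLY;	/* omit for a writable snapshot */
	strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);

	return ioctl(destdirfd, BTRFS_IOC_SNAP_CREATE_V2, &args);
}
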
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..cc9b450399df
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,420 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/init.h>
23#include <linux/err.h>
24#include <linux/sched.h>
25#include <linux/pagemap.h>
26#include <linux/bio.h>
27#include <linux/lzo.h>
28#include "compression.h"
29
30#define LZO_LEN 4
31
32struct workspace {
33 void *mem;
34 void *buf; /* where compressed data goes */
35 void *cbuf; /* where decompressed data goes */
36 struct list_head list;
37};
38
39static void lzo_free_workspace(struct list_head *ws)
40{
41 struct workspace *workspace = list_entry(ws, struct workspace, list);
42
43 vfree(workspace->buf);
44 vfree(workspace->cbuf);
45 vfree(workspace->mem);
46 kfree(workspace);
47}
48
49static struct list_head *lzo_alloc_workspace(void)
50{
51 struct workspace *workspace;
52
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
54 if (!workspace)
55 return ERR_PTR(-ENOMEM);
56
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail;
62
63 INIT_LIST_HEAD(&workspace->list);
64
65 return &workspace->list;
66fail:
67 lzo_free_workspace(&workspace->list);
68 return ERR_PTR(-ENOMEM);
69}
70
71static inline void write_compress_length(char *buf, size_t len)
72{
73 __le32 dlen;
74
75 dlen = cpu_to_le32(len);
76 memcpy(buf, &dlen, LZO_LEN);
77}
78
79static inline size_t read_compress_length(char *buf)
80{
81 __le32 dlen;
82
83 memcpy(&dlen, buf, LZO_LEN);
84 return le32_to_cpu(dlen);
85}
86
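
These two helpers define the whole on-disk framing: the first 4 bytes of page 0 hold a little-endian u32 total length, followed by repeated (u32 segment length, segment payload) pairs; a 4-byte header is never split across pages (see the pg_bytes_left < LZO_LEN padding below). A userspace sketch of a parser over one contiguous buffer, ignoring that page padding:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define LZO_LEN 4

static uint32_t get_len(const unsigned char *p)
{
	uint32_t v;

	memcpy(&v, p, LZO_LEN);
	return v;		/* assumes a little-endian host for brevity */
}

/* Count compressed segments in buf: [tot_len][len0][seg0][len1][seg1]... */
static size_t count_segments(const unsigned char *buf)
{
	uint32_t tot_len = get_len(buf);
	size_t off = LZO_LEN, n = 0;

	while (off < tot_len) {
		uint32_t seg = get_len(buf + off);

		off += LZO_LEN + seg;
		n++;
	}
	return n;
}
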
87static int lzo_compress_pages(struct list_head *ws,
88 struct address_space *mapping,
89 u64 start, unsigned long len,
90 struct page **pages,
91 unsigned long nr_dest_pages,
92 unsigned long *out_pages,
93 unsigned long *total_in,
94 unsigned long *total_out,
95 unsigned long max_out)
96{
97 struct workspace *workspace = list_entry(ws, struct workspace, list);
98 int ret = 0;
99 char *data_in;
100 char *cpage_out;
101 int nr_pages = 0;
102 struct page *in_page = NULL;
103 struct page *out_page = NULL;
104 unsigned long bytes_left;
105
106 size_t in_len;
107 size_t out_len;
108 char *buf;
109 unsigned long tot_in = 0;
110 unsigned long tot_out = 0;
111 unsigned long pg_bytes_left;
112 unsigned long out_offset;
113 unsigned long bytes;
114
115 *out_pages = 0;
116 *total_out = 0;
117 *total_in = 0;
118
119 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
120 data_in = kmap(in_page);
121
122 /*
123 * store the size of all chunks of compressed data in
124 * the first 4 bytes
125 */
126 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
127 if (out_page == NULL) {
128 ret = -ENOMEM;
129 goto out;
130 }
131 cpage_out = kmap(out_page);
132 out_offset = LZO_LEN;
133 tot_out = LZO_LEN;
134 pages[0] = out_page;
135 nr_pages = 1;
136 pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
137
138 /* compress at most one page of data each time */
139 in_len = min(len, PAGE_CACHE_SIZE);
140 while (tot_in < len) {
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) {
 144 printk(KERN_DEBUG "btrfs lzo compress in loop returned %d\n",
145 ret);
146 ret = -1;
147 goto out;
148 }
149
150 /* store the size of this chunk of compressed data */
151 write_compress_length(cpage_out + out_offset, out_len);
152 tot_out += LZO_LEN;
153 out_offset += LZO_LEN;
154 pg_bytes_left -= LZO_LEN;
155
156 tot_in += in_len;
157 tot_out += out_len;
158
159 /* copy bytes from the working buffer into the pages */
160 buf = workspace->cbuf;
161 while (out_len) {
162 bytes = min_t(unsigned long, pg_bytes_left, out_len);
163
164 memcpy(cpage_out + out_offset, buf, bytes);
165
166 out_len -= bytes;
167 pg_bytes_left -= bytes;
168 buf += bytes;
169 out_offset += bytes;
170
171 /*
172 * we need another page for writing out.
173 *
174 * Note if there's less than 4 bytes left, we just
175 * skip to a new page.
176 */
177 if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
178 pg_bytes_left == 0) {
179 if (pg_bytes_left) {
180 memset(cpage_out + out_offset, 0,
181 pg_bytes_left);
182 tot_out += pg_bytes_left;
183 }
184
185 /* we're done, don't allocate new page */
186 if (out_len == 0 && tot_in >= len)
187 break;
188
189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL;
192 ret = -1;
193 goto out;
194 }
195
196 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
197 if (out_page == NULL) {
198 ret = -ENOMEM;
199 goto out;
200 }
201 cpage_out = kmap(out_page);
202 pages[nr_pages++] = out_page;
203
204 pg_bytes_left = PAGE_CACHE_SIZE;
205 out_offset = 0;
206 }
207 }
208
209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out)
211 goto out;
212
213 /* we're all done */
214 if (tot_in >= len)
215 break;
216
217 if (tot_out > max_out)
218 break;
219
220 bytes_left = len - tot_in;
221 kunmap(in_page);
222 page_cache_release(in_page);
223
224 start += PAGE_CACHE_SIZE;
225 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
226 data_in = kmap(in_page);
227 in_len = min(bytes_left, PAGE_CACHE_SIZE);
228 }
229
230 if (tot_out > tot_in)
231 goto out;
232
233 /* store the size of all chunks of compressed data */
234 cpage_out = kmap(pages[0]);
235 write_compress_length(cpage_out, tot_out);
236
237 kunmap(pages[0]);
238
239 ret = 0;
240 *total_out = tot_out;
241 *total_in = tot_in;
242out:
243 *out_pages = nr_pages;
244 if (out_page)
245 kunmap(out_page);
246
247 if (in_page) {
248 kunmap(in_page);
249 page_cache_release(in_page);
250 }
251
252 return ret;
253}
254
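A subtlety in lzo_compress_pages() worth spelling out: a 4-byte length header is never allowed to straddle a page boundary. If a segment finishes with fewer than LZO_LEN bytes left in the current output page, the tail is zero-filled (and still counted in tot_out) and the next header starts at offset 0 of a freshly allocated page. For example, with 3 bytes left when a segment completes, those 3 bytes become padding; lzo_decompress_biovec() below mirrors the rule with its own in_page_bytes_left < LZO_LEN check.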
255static int lzo_decompress_biovec(struct list_head *ws,
256 struct page **pages_in,
257 u64 disk_start,
258 struct bio_vec *bvec,
259 int vcnt,
260 size_t srclen)
261{
262 struct workspace *workspace = list_entry(ws, struct workspace, list);
263 int ret = 0, ret2;
264 char *data_in;
265 unsigned long page_in_index = 0;
266 unsigned long page_out_index = 0;
267 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
268 PAGE_CACHE_SIZE;
269 unsigned long buf_start;
270 unsigned long buf_offset = 0;
271 unsigned long bytes;
272 unsigned long working_bytes;
273 unsigned long pg_offset;
274
275 size_t in_len;
276 size_t out_len;
277 unsigned long in_offset;
278 unsigned long in_page_bytes_left;
279 unsigned long tot_in;
280 unsigned long tot_out;
281 unsigned long tot_len;
282 char *buf;
283
284 data_in = kmap(pages_in[0]);
285 tot_len = read_compress_length(data_in);
286
287 tot_in = LZO_LEN;
288 in_offset = LZO_LEN;
289 tot_len = min_t(size_t, srclen, tot_len);
290 in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
291
292 tot_out = 0;
293 pg_offset = 0;
294
295 while (tot_in < tot_len) {
296 in_len = read_compress_length(data_in + in_offset);
297 in_page_bytes_left -= LZO_LEN;
298 in_offset += LZO_LEN;
299 tot_in += LZO_LEN;
300
301 tot_in += in_len;
302 working_bytes = in_len;
303
304 /* fast path: avoid using the working buffer */
305 if (in_page_bytes_left >= in_len) {
306 buf = data_in + in_offset;
307 bytes = in_len;
308 goto cont;
309 }
310
311 /* copy bytes from the pages into the working buffer */
312 buf = workspace->cbuf;
313 buf_offset = 0;
314 while (working_bytes) {
315 bytes = min(working_bytes, in_page_bytes_left);
316
317 memcpy(buf + buf_offset, data_in + in_offset, bytes);
318 buf_offset += bytes;
319cont:
320 working_bytes -= bytes;
321 in_page_bytes_left -= bytes;
322 in_offset += bytes;
323
324 /* check if we need to pick another page */
325 if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
326 || in_page_bytes_left == 0) {
327 tot_in += in_page_bytes_left;
328
329 if (working_bytes == 0 && tot_in >= tot_len)
330 break;
331
332 kunmap(pages_in[page_in_index]);
333 page_in_index++;
334 if (page_in_index >= total_pages_in) {
335 ret = -1;
336 data_in = NULL;
337 goto done;
338 }
339 data_in = kmap(pages_in[page_in_index]);
340
341 in_page_bytes_left = PAGE_CACHE_SIZE;
342 in_offset = 0;
343 }
344 }
345
346 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
347 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
348 &out_len);
349 if (ret != LZO_E_OK) {
350 printk(KERN_WARNING "btrfs decompress failed\n");
351 ret = -1;
352 break;
353 }
354
355 buf_start = tot_out;
356 tot_out += out_len;
357
358 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
359 tot_out, disk_start,
360 bvec, vcnt,
361 &page_out_index, &pg_offset);
362 if (ret2 == 0)
363 break;
364 }
365done:
366 if (data_in)
367 kunmap(pages_in[page_in_index]);
368 return ret;
369}
370
371static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
372 struct page *dest_page,
373 unsigned long start_byte,
374 size_t srclen, size_t destlen)
375{
376 struct workspace *workspace = list_entry(ws, struct workspace, list);
377 size_t in_len;
378 size_t out_len;
379 size_t tot_len;
380 int ret = 0;
381 char *kaddr;
382 unsigned long bytes;
383
384 BUG_ON(srclen < LZO_LEN);
385
386 tot_len = read_compress_length(data_in);
387 data_in += LZO_LEN;
388
389 in_len = read_compress_length(data_in);
390 data_in += LZO_LEN;
391
392 out_len = PAGE_CACHE_SIZE;
393 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
394 if (ret != LZO_E_OK) {
395 printk(KERN_WARNING "btrfs decompress failed!\n");
396 ret = -1;
397 goto out;
398 }
399
400 if (out_len < start_byte) {
401 ret = -1;
402 goto out;
403 }
404
405 bytes = min_t(unsigned long, destlen, out_len - start_byte);
406
407 kaddr = kmap_atomic(dest_page, KM_USER0);
408 memcpy(kaddr, workspace->buf + start_byte, bytes);
409 kunmap_atomic(kaddr, KM_USER0);
410out:
411 return ret;
412}
413
414struct btrfs_compress_op btrfs_lzo_compress = {
415 .alloc_workspace = lzo_alloc_workspace,
416 .free_workspace = lzo_free_workspace,
417 .compress_pages = lzo_compress_pages,
418 .decompress_biovec = lzo_decompress_biovec,
419 .decompress = lzo_decompress,
420};
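This ops table is what plugs lzo.c into the generic compression framework (compression.c/compression.h) introduced alongside it. The dispatch side is not shown in this diff; a plausible sketch — the array name and the type - 1 indexing are assumptions here, based on BTRFS_COMPRESS_NONE being 0 and carrying no ops — would be:

/* compression.c (sketch): one ops table per algorithm */
static struct btrfs_compress_op *btrfs_compress_op[] = {
	&btrfs_zlib_compress,
	&btrfs_lzo_compress,
};

so a request for BTRFS_COMPRESS_LZO ultimately resolves to the lzo_* callbacks above.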
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ae7737e352c9..083a55477375 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
141 u64 file_offset) 141 u64 file_offset)
142{ 142{
143 struct rb_root *root = &tree->tree; 143 struct rb_root *root = &tree->tree;
144 struct rb_node *prev; 144 struct rb_node *prev = NULL;
145 struct rb_node *ret; 145 struct rb_node *ret;
146 struct btrfs_ordered_extent *entry; 146 struct btrfs_ordered_extent *entry;
147 147
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
172 */ 172 */
173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 u64 start, u64 len, u64 disk_len, 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio) 175 int type, int dio, int compress_type)
176{ 176{
177 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
178 struct rb_node *node; 178 struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
189 entry->disk_len = disk_len; 189 entry->disk_len = disk_len;
190 entry->bytes_left = len; 190 entry->bytes_left = len;
191 entry->inode = inode; 191 entry->inode = inode;
192 entry->compress_type = compress_type;
192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 193 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
193 set_bit(type, &entry->flags); 194 set_bit(type, &entry->flags);
194 195
@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type) 221 u64 start, u64 len, u64 disk_len, int type)
221{ 222{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 223 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0); 224 disk_len, type, 0,
225 BTRFS_COMPRESS_NONE);
224} 226}
225 227
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 228int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type) 229 u64 start, u64 len, u64 disk_len, int type)
228{ 230{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 231 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1); 232 disk_len, type, 1,
233 BTRFS_COMPRESS_NONE);
234}
235
236int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
237 u64 start, u64 len, u64 disk_len,
238 int type, int compress_type)
239{
240 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
241 disk_len, type, 0,
242 compress_type);
231} 243}
232 244
233/* 245/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 61dca83119dd..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
68 68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70 70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ 71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
93 /* flags (described above) */ 93 /* flags (described above) */
94 unsigned long flags; 94 unsigned long flags;
95 95
96 /* compression algorithm */
97 int compress_type;
98
96 /* reference count */ 99 /* reference count */
97 atomic_t refs; 100 atomic_t refs;
98 101
@@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
148 u64 start, u64 len, u64 disk_len, int type); 151 u64 start, u64 len, u64 disk_len, int type);
149int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
150 u64 start, u64 len, u64 disk_len, int type); 153 u64 start, u64 len, u64 disk_len, int type);
154int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, u64 disk_len,
156 int type, int compress_type);
151int btrfs_add_ordered_sum(struct inode *inode, 157int btrfs_add_ordered_sum(struct inode *inode,
152 struct btrfs_ordered_extent *entry, 158 struct btrfs_ordered_extent *entry,
153 struct btrfs_ordered_sum *sum); 159 struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0d126be22b63..fb2605d998e9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
260#else 260#else
261 BUG(); 261 BUG();
262#endif 262#endif
263 break;
263 case BTRFS_BLOCK_GROUP_ITEM_KEY: 264 case BTRFS_BLOCK_GROUP_ITEM_KEY:
264 bi = btrfs_item_ptr(l, i, 265 bi = btrfs_item_ptr(l, i,
265 struct btrfs_block_group_item); 266 struct btrfs_block_group_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 045c9c2b2d7e..1f5556acb530 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2028,6 +2028,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2028 2028
2029 while (1) { 2029 while (1) {
2030 trans = btrfs_start_transaction(root, 0); 2030 trans = btrfs_start_transaction(root, 0);
2031 BUG_ON(IS_ERR(trans));
2031 trans->block_rsv = rc->block_rsv; 2032 trans->block_rsv = rc->block_rsv;
2032 2033
2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2034 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
@@ -2147,6 +2148,12 @@ again:
2147 } 2148 }
2148 2149
2149 trans = btrfs_join_transaction(rc->extent_root, 1); 2150 trans = btrfs_join_transaction(rc->extent_root, 1);
2151 if (IS_ERR(trans)) {
2152 if (!err)
2153 btrfs_block_rsv_release(rc->extent_root,
2154 rc->block_rsv, num_bytes);
2155 return PTR_ERR(trans);
2156 }
2150 2157
2151 if (!err) { 2158 if (!err) {
2152 if (num_bytes != rc->merging_rsv_size) { 2159 if (num_bytes != rc->merging_rsv_size) {
@@ -3222,6 +3229,7 @@ truncate:
3222 trans = btrfs_join_transaction(root, 0); 3229 trans = btrfs_join_transaction(root, 0);
3223 if (IS_ERR(trans)) { 3230 if (IS_ERR(trans)) {
3224 btrfs_free_path(path); 3231 btrfs_free_path(path);
3232 ret = PTR_ERR(trans);
3225 goto out; 3233 goto out;
3226 } 3234 }
3227 3235
@@ -3628,6 +3636,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3628 set_reloc_control(rc); 3636 set_reloc_control(rc);
3629 3637
3630 trans = btrfs_join_transaction(rc->extent_root, 1); 3638 trans = btrfs_join_transaction(rc->extent_root, 1);
3639 BUG_ON(IS_ERR(trans));
3631 btrfs_commit_transaction(trans, rc->extent_root); 3640 btrfs_commit_transaction(trans, rc->extent_root);
3632 return 0; 3641 return 0;
3633} 3642}
@@ -3657,6 +3666,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3657 3666
3658 while (1) { 3667 while (1) {
3659 trans = btrfs_start_transaction(rc->extent_root, 0); 3668 trans = btrfs_start_transaction(rc->extent_root, 0);
3669 BUG_ON(IS_ERR(trans));
3660 3670
3661 if (update_backref_cache(trans, &rc->backref_cache)) { 3671 if (update_backref_cache(trans, &rc->backref_cache)) {
3662 btrfs_end_transaction(trans, rc->extent_root); 3672 btrfs_end_transaction(trans, rc->extent_root);
@@ -3804,7 +3814,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3804 3814
3805 /* get rid of pinned extents */ 3815 /* get rid of pinned extents */
3806 trans = btrfs_join_transaction(rc->extent_root, 1); 3816 trans = btrfs_join_transaction(rc->extent_root, 1);
3807 btrfs_commit_transaction(trans, rc->extent_root); 3817 if (IS_ERR(trans))
3818 err = PTR_ERR(trans);
3819 else
3820 btrfs_commit_transaction(trans, rc->extent_root);
3808out_free: 3821out_free:
3809 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); 3822 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3810 btrfs_free_path(path); 3823 btrfs_free_path(path);
@@ -4022,6 +4035,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4022 int ret; 4035 int ret;
4023 4036
4024 trans = btrfs_start_transaction(root->fs_info->tree_root, 0); 4037 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4038 BUG_ON(IS_ERR(trans));
4025 4039
4026 memset(&root->root_item.drop_progress, 0, 4040 memset(&root->root_item.drop_progress, 0,
4027 sizeof(root->root_item.drop_progress)); 4041 sizeof(root->root_item.drop_progress));
@@ -4125,6 +4139,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4125 set_reloc_control(rc); 4139 set_reloc_control(rc);
4126 4140
4127 trans = btrfs_join_transaction(rc->extent_root, 1); 4141 trans = btrfs_join_transaction(rc->extent_root, 1);
4142 if (IS_ERR(trans)) {
4143 unset_reloc_control(rc);
4144 err = PTR_ERR(trans);
4145 goto out_free;
4146 }
4128 4147
4129 rc->merge_reloc_tree = 1; 4148 rc->merge_reloc_tree = 1;
4130 4149
@@ -4154,9 +4173,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4154 unset_reloc_control(rc); 4173 unset_reloc_control(rc);
4155 4174
4156 trans = btrfs_join_transaction(rc->extent_root, 1); 4175 trans = btrfs_join_transaction(rc->extent_root, 1);
4157 btrfs_commit_transaction(trans, rc->extent_root); 4176 if (IS_ERR(trans))
4158out: 4177 err = PTR_ERR(trans);
4178 else
4179 btrfs_commit_transaction(trans, rc->extent_root);
4180out_free:
4159 kfree(rc); 4181 kfree(rc);
4182out:
4160 while (!list_empty(&reloc_roots)) { 4183 while (!list_empty(&reloc_roots)) {
4161 reloc_root = list_entry(reloc_roots.next, 4184 reloc_root = list_entry(reloc_roots.next,
4162 struct btrfs_root, root_list); 4185 struct btrfs_root, root_list);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 22acdaa78ce1..a004008f7d28 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,90 @@
54 54
55static const struct super_operations btrfs_super_ops; 55static const struct super_operations btrfs_super_ops;
56 56
57static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
58 char nbuf[16])
59{
60 char *errstr = NULL;
61
62 switch (errno) {
63 case -EIO:
64 errstr = "IO failure";
65 break;
66 case -ENOMEM:
67 errstr = "Out of memory";
68 break;
69 case -EROFS:
70 errstr = "Readonly filesystem";
71 break;
72 default:
73 if (nbuf) {
74 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
75 errstr = nbuf;
76 }
77 break;
78 }
79
80 return errstr;
81}
82
83static void __save_error_info(struct btrfs_fs_info *fs_info)
84{
85 /*
86	 * today we only save the error info in RAM. Long term we'll
87	 * also send it down to the disk.
88 */
89 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
90}
91
92/* NOTE:
93 * We move the write_super work to umount time in order to avoid a
94 * deadlock, because umount holds all the locks.
95 */
96static void save_error_info(struct btrfs_fs_info *fs_info)
97{
98 __save_error_info(fs_info);
99}
100
101/* btrfs handles an error by forcing the filesystem readonly */
102static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
103{
104 struct super_block *sb = fs_info->sb;
105
106 if (sb->s_flags & MS_RDONLY)
107 return;
108
109 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
110 sb->s_flags |= MS_RDONLY;
111 printk(KERN_INFO "btrfs is forced readonly\n");
112 }
113}
114
115/*
116 * __btrfs_std_error decodes expected errors from the caller and
117 * invokes the appropriate error response.
118 */
119void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
120 unsigned int line, int errno)
121{
122 struct super_block *sb = fs_info->sb;
123 char nbuf[16];
124 const char *errstr;
125
126 /*
127 * Special case: if the error is EROFS, and we're already
128 * under MS_RDONLY, then it is safe to ignore it here.
129 */
130 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
131 return;
132
133 errstr = btrfs_decode_error(fs_info, errno, nbuf);
134 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
135 sb->s_id, function, line, errstr);
136 save_error_info(fs_info);
137
138 btrfs_handle_error(fs_info);
139}
140
57static void btrfs_put_super(struct super_block *sb) 141static void btrfs_put_super(struct super_block *sb)
58{ 142{
59 struct btrfs_root *root = btrfs_sb(sb); 143 struct btrfs_root *root = btrfs_sb(sb);
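Callers are not expected to invoke __btrfs_std_error() directly; the same series adds a wrapper macro to ctree.h (outside this hunk, so the exact form below is an assumption) that captures the call site automatically:

/* ctree.h (sketch): record where the error happened, then react */
#define btrfs_std_error(fs_info, errno)				\
do {								\
	if ((errno))						\
		__btrfs_std_error((fs_info), __func__,		\
				  __LINE__, (errno));		\
} while (0)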
@@ -69,9 +153,9 @@ enum {
69 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 153 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
70 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 154 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
71 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
72 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
73 Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, 157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
74 Opt_user_subvol_rm_allowed, 158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
75}; 159};
76 160
77static match_table_t tokens = { 161static match_table_t tokens = {
@@ -86,7 +170,9 @@ static match_table_t tokens = {
86 {Opt_alloc_start, "alloc_start=%s"}, 170 {Opt_alloc_start, "alloc_start=%s"},
87 {Opt_thread_pool, "thread_pool=%d"}, 171 {Opt_thread_pool, "thread_pool=%d"},
88 {Opt_compress, "compress"}, 172 {Opt_compress, "compress"},
173 {Opt_compress_type, "compress=%s"},
89 {Opt_compress_force, "compress-force"}, 174 {Opt_compress_force, "compress-force"},
175 {Opt_compress_force_type, "compress-force=%s"},
90 {Opt_ssd, "ssd"}, 176 {Opt_ssd, "ssd"},
91 {Opt_ssd_spread, "ssd_spread"}, 177 {Opt_ssd_spread, "ssd_spread"},
92 {Opt_nossd, "nossd"}, 178 {Opt_nossd, "nossd"},
@@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
112 char *p, *num, *orig; 198 char *p, *num, *orig;
113 int intarg; 199 int intarg;
114 int ret = 0; 200 int ret = 0;
201 char *compress_type;
202 bool compress_force = false;
115 203
116 if (!options) 204 if (!options)
117 return 0; 205 return 0;
@@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
154 btrfs_set_opt(info->mount_opt, NODATACOW); 242 btrfs_set_opt(info->mount_opt, NODATACOW);
155 btrfs_set_opt(info->mount_opt, NODATASUM); 243 btrfs_set_opt(info->mount_opt, NODATASUM);
156 break; 244 break;
157 case Opt_compress:
158 printk(KERN_INFO "btrfs: use compression\n");
159 btrfs_set_opt(info->mount_opt, COMPRESS);
160 break;
161 case Opt_compress_force: 245 case Opt_compress_force:
162 printk(KERN_INFO "btrfs: forcing compression\n"); 246 case Opt_compress_force_type:
163 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 247 compress_force = true;
248 case Opt_compress:
249 case Opt_compress_type:
250 if (token == Opt_compress ||
251 token == Opt_compress_force ||
252 strcmp(args[0].from, "zlib") == 0) {
253 compress_type = "zlib";
254 info->compress_type = BTRFS_COMPRESS_ZLIB;
255 } else if (strcmp(args[0].from, "lzo") == 0) {
256 compress_type = "lzo";
257 info->compress_type = BTRFS_COMPRESS_LZO;
258 } else {
259 ret = -EINVAL;
260 goto out;
261 }
262
164 btrfs_set_opt(info->mount_opt, COMPRESS); 263 btrfs_set_opt(info->mount_opt, COMPRESS);
264 if (compress_force) {
265 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
266 pr_info("btrfs: force %s compression\n",
267 compress_type);
268 } else
269 pr_info("btrfs: use %s compression\n",
270 compress_type);
165 break; 271 break;
166 case Opt_ssd: 272 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 273 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
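The net effect of this hunk is that compress and compress-force now optionally take an algorithm name: mounting with -o compress=lzo selects BTRFS_COMPRESS_LZO, plain -o compress (or -o compress=zlib) keeps zlib, and -o compress-force=lzo additionally sets FORCE_COMPRESS. Note the intentional fall-through from the Opt_compress_force cases into the Opt_compress cases, which lets one block handle all four tokens.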
@@ -277,7 +383,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
277 struct btrfs_fs_devices **fs_devices) 383 struct btrfs_fs_devices **fs_devices)
278{ 384{
279 substring_t args[MAX_OPT_ARGS]; 385 substring_t args[MAX_OPT_ARGS];
280 char *opts, *p; 386 char *opts, *orig, *p;
281 int error = 0; 387 int error = 0;
282 int intarg; 388 int intarg;
283 389
@@ -291,6 +397,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
291 opts = kstrdup(options, GFP_KERNEL); 397 opts = kstrdup(options, GFP_KERNEL);
292 if (!opts) 398 if (!opts)
293 return -ENOMEM; 399 return -ENOMEM;
400 orig = opts;
294 401
295 while ((p = strsep(&opts, ",")) != NULL) { 402 while ((p = strsep(&opts, ",")) != NULL) {
296 int token; 403 int token;
@@ -326,7 +433,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
326 } 433 }
327 434
328 out_free_opts: 435 out_free_opts:
329 kfree(opts); 436 kfree(orig);
330 out: 437 out:
331 /* 438 /*
332 * If no subvolume name is specified we use the default one. Allocate 439 * If no subvolume name is specified we use the default one. Allocate
@@ -517,6 +624,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
517 btrfs_wait_ordered_extents(root, 0, 0); 624 btrfs_wait_ordered_extents(root, 0, 0);
518 625
519 trans = btrfs_start_transaction(root, 0); 626 trans = btrfs_start_transaction(root, 0);
627 if (IS_ERR(trans))
628 return PTR_ERR(trans);
520 ret = btrfs_commit_transaction(trans, root); 629 ret = btrfs_commit_transaction(trans, root);
521 return ret; 630 return ret;
522} 631}
@@ -655,6 +764,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
655 } 764 }
656 765
657 btrfs_close_devices(fs_devices); 766 btrfs_close_devices(fs_devices);
767 kfree(fs_info);
768 kfree(tree_root);
658 } else { 769 } else {
659 char b[BDEVNAME_SIZE]; 770 char b[BDEVNAME_SIZE];
660 771
@@ -753,6 +864,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
753 return 0; 864 return 0;
754} 865}
755 866
867/*
868 * Helper to calculate the free space on the devices that can be used to store
869 * file data.
870 */
871static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
872{
873 struct btrfs_fs_info *fs_info = root->fs_info;
874 struct btrfs_device_info *devices_info;
875 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
876 struct btrfs_device *device;
877 u64 skip_space;
878 u64 type;
879 u64 avail_space;
880 u64 used_space;
881 u64 min_stripe_size;
882 int min_stripes = 1;
883 int i = 0, nr_devices;
884 int ret;
885
886 nr_devices = fs_info->fs_devices->rw_devices;
887 BUG_ON(!nr_devices);
888
889 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
890 GFP_NOFS);
891 if (!devices_info)
892 return -ENOMEM;
893
894	/* calc the min stripe number for data space allocation */
895 type = btrfs_get_alloc_profile(root, 1);
896 if (type & BTRFS_BLOCK_GROUP_RAID0)
897 min_stripes = 2;
898 else if (type & BTRFS_BLOCK_GROUP_RAID1)
899 min_stripes = 2;
900 else if (type & BTRFS_BLOCK_GROUP_RAID10)
901 min_stripes = 4;
902
903 if (type & BTRFS_BLOCK_GROUP_DUP)
904 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
905 else
906 min_stripe_size = BTRFS_STRIPE_LEN;
907
908 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
909 if (!device->in_fs_metadata)
910 continue;
911
912 avail_space = device->total_bytes - device->bytes_used;
913
914 /* align with stripe_len */
915 do_div(avail_space, BTRFS_STRIPE_LEN);
916 avail_space *= BTRFS_STRIPE_LEN;
917
918 /*
919		 * In order to avoid overwriting the superblock on the drive,
920 * btrfs starts at an offset of at least 1MB when doing chunk
921 * allocation.
922 */
923 skip_space = 1024 * 1024;
924
925 /* user can set the offset in fs_info->alloc_start. */
926 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
927 device->total_bytes)
928 skip_space = max(fs_info->alloc_start, skip_space);
929
930 /*
931		 * btrfs can not use the free space in [0, skip_space - 1],
932		 * so we must subtract it from the total. In order to implement
933		 * this, we account the used space in this range first.
934 */
935 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
936 &used_space);
937 if (ret) {
938 kfree(devices_info);
939 return ret;
940 }
941
942 /* calc the free space in [0, skip_space - 1] */
943 skip_space -= used_space;
944
945 /*
946		 * we can not use the free space in [0, skip_space - 1], so
947		 * subtract it from the total.
948 */
949 if (avail_space && avail_space >= skip_space)
950 avail_space -= skip_space;
951 else
952 avail_space = 0;
953
954 if (avail_space < min_stripe_size)
955 continue;
956
957 devices_info[i].dev = device;
958 devices_info[i].max_avail = avail_space;
959
960 i++;
961 }
962
963 nr_devices = i;
964
965 btrfs_descending_sort_devices(devices_info, nr_devices);
966
967 i = nr_devices - 1;
968 avail_space = 0;
969 while (nr_devices >= min_stripes) {
970 if (devices_info[i].max_avail >= min_stripe_size) {
971 int j;
972 u64 alloc_size;
973
974 avail_space += devices_info[i].max_avail * min_stripes;
975 alloc_size = devices_info[i].max_avail;
976 for (j = i + 1 - min_stripes; j <= i; j++)
977 devices_info[j].max_avail -= alloc_size;
978 }
979 i--;
980 nr_devices--;
981 }
982
983 kfree(devices_info);
984 *free_bytes = avail_space;
985 return 0;
986}
987
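A worked example of the greedy loop above (illustrative numbers): with min_stripes = 2 (RAID1) and per-device free space of 8, 6 and 3 GiB sorted descending, the first pass takes the smallest entry and consumes 3 GiB on each of the two devices ending at slot i (the 6 GiB and 3 GiB ones), adding 3 * 2 = 6 GiB and leaving (8, 3, 0); the second pass pairs the remaining 3 GiB with the 8 GiB device for another 6 GiB, leaving (5, 0, 0). The loop stops once fewer than min_stripes devices remain, so the estimate is 12 GiB of raw allocatable space, with the final 5 GiB on the largest device correctly treated as unusable for RAID1.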
756static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 988static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
757{ 989{
758 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 990 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -760,17 +992,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
760 struct list_head *head = &root->fs_info->space_info; 992 struct list_head *head = &root->fs_info->space_info;
761 struct btrfs_space_info *found; 993 struct btrfs_space_info *found;
762 u64 total_used = 0; 994 u64 total_used = 0;
763 u64 total_used_data = 0; 995 u64 total_free_data = 0;
764 int bits = dentry->d_sb->s_blocksize_bits; 996 int bits = dentry->d_sb->s_blocksize_bits;
765 __be32 *fsid = (__be32 *)root->fs_info->fsid; 997 __be32 *fsid = (__be32 *)root->fs_info->fsid;
998 int ret;
766 999
1000	/* hold chunk_mutex to avoid allocating new chunks */
1001 mutex_lock(&root->fs_info->chunk_mutex);
767 rcu_read_lock(); 1002 rcu_read_lock();
768 list_for_each_entry_rcu(found, head, list) { 1003 list_for_each_entry_rcu(found, head, list) {
769 if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | 1004 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
770 BTRFS_BLOCK_GROUP_SYSTEM)) 1005 total_free_data += found->disk_total - found->disk_used;
771 total_used_data += found->disk_total; 1006 total_free_data -=
772 else 1007 btrfs_account_ro_block_groups_free_space(found);
773 total_used_data += found->disk_used; 1008 }
1009
774 total_used += found->disk_used; 1010 total_used += found->disk_used;
775 } 1011 }
776 rcu_read_unlock(); 1012 rcu_read_unlock();
@@ -778,9 +1014,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
778 buf->f_namelen = BTRFS_NAME_LEN; 1014 buf->f_namelen = BTRFS_NAME_LEN;
779 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1015 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
780 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1016 buf->f_bfree = buf->f_blocks - (total_used >> bits);
781 buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
782 buf->f_bsize = dentry->d_sb->s_blocksize; 1017 buf->f_bsize = dentry->d_sb->s_blocksize;
783 buf->f_type = BTRFS_SUPER_MAGIC; 1018 buf->f_type = BTRFS_SUPER_MAGIC;
1019 buf->f_bavail = total_free_data;
1020 ret = btrfs_calc_avail_data_space(root, &total_free_data);
1021 if (ret) {
1022 mutex_unlock(&root->fs_info->chunk_mutex);
1023 return ret;
1024 }
1025 buf->f_bavail += total_free_data;
1026 buf->f_bavail = buf->f_bavail >> bits;
1027 mutex_unlock(&root->fs_info->chunk_mutex);
784 1028
785 /* We treat it as constant endianness (it doesn't matter _which_) 1029 /* We treat it as constant endianness (it doesn't matter _which_)
786 because we want the fsid to come out the same whether mounted 1030 because we want the fsid to come out the same whether mounted
@@ -897,10 +1141,14 @@ static int __init init_btrfs_fs(void)
897 if (err) 1141 if (err)
898 return err; 1142 return err;
899 1143
900 err = btrfs_init_cachep(); 1144 err = btrfs_init_compress();
901 if (err) 1145 if (err)
902 goto free_sysfs; 1146 goto free_sysfs;
903 1147
1148 err = btrfs_init_cachep();
1149 if (err)
1150 goto free_compress;
1151
904 err = extent_io_init(); 1152 err = extent_io_init();
905 if (err) 1153 if (err)
906 goto free_cachep; 1154 goto free_cachep;
@@ -928,6 +1176,8 @@ free_extent_io:
928 extent_io_exit(); 1176 extent_io_exit();
929free_cachep: 1177free_cachep:
930 btrfs_destroy_cachep(); 1178 btrfs_destroy_cachep();
1179free_compress:
1180 btrfs_exit_compress();
931free_sysfs: 1181free_sysfs:
932 btrfs_exit_sysfs(); 1182 btrfs_exit_sysfs();
933 return err; 1183 return err;
@@ -942,7 +1192,7 @@ static void __exit exit_btrfs_fs(void)
942 unregister_filesystem(&btrfs_fs_type); 1192 unregister_filesystem(&btrfs_fs_type);
943 btrfs_exit_sysfs(); 1193 btrfs_exit_sysfs();
944 btrfs_cleanup_fs_uuids(); 1194 btrfs_cleanup_fs_uuids();
945 btrfs_zlib_exit(); 1195 btrfs_exit_compress();
946} 1196}
947 1197
948module_init(init_btrfs_fs) 1198module_init(init_btrfs_fs)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f50e931fc217..3d73c8d93bbb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 struct btrfs_trans_handle *h; 181 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans; 182 struct btrfs_transaction *cur_trans;
183 int ret; 183 int ret;
184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
186 return ERR_PTR(-EROFS);
184again: 187again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 188 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 189 if (!h)
@@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
910 u64 to_reserve = 0; 913 u64 to_reserve = 0;
911 u64 index = 0; 914 u64 index = 0;
912 u64 objectid; 915 u64 objectid;
916 u64 root_flags;
913 917
914 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 918 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
915 if (!new_root_item) { 919 if (!new_root_item) {
@@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
967 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 971 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
968 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 972 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
969 973
974 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly)
976 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
977 else
978 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
979 btrfs_set_root_flags(new_root_item, root_flags);
980
970 old = btrfs_lock_root_node(root); 981 old = btrfs_lock_root_node(root);
971 btrfs_cow_block(trans, root, old, NULL, 0, &old); 982 btrfs_cow_block(trans, root, old, NULL, 0, &old);
972 btrfs_set_lock_blocking(old); 983 btrfs_set_lock_blocking(old);
@@ -1150,6 +1161,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1150 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1161 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1151 ac->root = root; 1162 ac->root = root;
1152 ac->newtrans = btrfs_join_transaction(root, 0); 1163 ac->newtrans = btrfs_join_transaction(root, 0);
1164 if (IS_ERR(ac->newtrans)) {
1165 int err = PTR_ERR(ac->newtrans);
1166 kfree(ac);
1167 return err;
1168 }
1153 1169
1154 /* take transaction reference */ 1170 /* take transaction reference */
1155 mutex_lock(&root->fs_info->trans_mutex); 1171 mutex_lock(&root->fs_info->trans_mutex);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f104b57ad4ef..229a594cacd5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
62 struct btrfs_block_rsv block_rsv; 62 struct btrfs_block_rsv block_rsv;
63	/* extra metadata reservation for relocation */ 63	/* extra metadata reservation for relocation */
64 int error; 64 int error;
65 bool readonly;
65 struct list_head list; 66 struct list_head list;
66}; 67};
67 68
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 054744ac5719..a4bbb854dfd2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
338 } 338 }
339 dst_copy = kmalloc(item_size, GFP_NOFS); 339 dst_copy = kmalloc(item_size, GFP_NOFS);
340 src_copy = kmalloc(item_size, GFP_NOFS); 340 src_copy = kmalloc(item_size, GFP_NOFS);
341 if (!dst_copy || !src_copy) {
342 btrfs_release_path(root, path);
343 kfree(dst_copy);
344 kfree(src_copy);
345 return -ENOMEM;
346 }
341 347
342 read_extent_buffer(eb, src_copy, src_ptr, item_size); 348 read_extent_buffer(eb, src_copy, src_ptr, item_size);
343 349
@@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
665 btrfs_dir_item_key_to_cpu(leaf, di, &location); 671 btrfs_dir_item_key_to_cpu(leaf, di, &location);
666 name_len = btrfs_dir_name_len(leaf, di); 672 name_len = btrfs_dir_name_len(leaf, di);
667 name = kmalloc(name_len, GFP_NOFS); 673 name = kmalloc(name_len, GFP_NOFS);
674 if (!name)
675 return -ENOMEM;
676
668 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 677 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
669 btrfs_release_path(root, path); 678 btrfs_release_path(root, path);
670 679
@@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
744 int match = 0; 753 int match = 0;
745 754
746 path = btrfs_alloc_path(); 755 path = btrfs_alloc_path();
756 if (!path)
757 return -ENOMEM;
758
747 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 759 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
748 if (ret != 0) 760 if (ret != 0)
749 goto out; 761 goto out;
@@ -967,6 +979,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
967 key.offset = (u64)-1; 979 key.offset = (u64)-1;
968 980
969 path = btrfs_alloc_path(); 981 path = btrfs_alloc_path();
982 if (!path)
983 return -ENOMEM;
970 984
971 while (1) { 985 while (1) {
972 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 986 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1178,6 +1192,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1178 1192
1179 name_len = btrfs_dir_name_len(eb, di); 1193 name_len = btrfs_dir_name_len(eb, di);
1180 name = kmalloc(name_len, GFP_NOFS); 1194 name = kmalloc(name_len, GFP_NOFS);
1195 if (!name)
1196 return -ENOMEM;
1197
1181 log_type = btrfs_dir_type(eb, di); 1198 log_type = btrfs_dir_type(eb, di);
1182 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1199 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1183 name_len); 1200 name_len);
@@ -1692,6 +1709,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1692 root_owner = btrfs_header_owner(parent); 1709 root_owner = btrfs_header_owner(parent);
1693 1710
1694 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1711 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1712 if (!next)
1713 return -ENOMEM;
1695 1714
1696 if (*level == 1) { 1715 if (*level == 1) {
1697 wc->process_func(root, next, wc, ptr_gen); 1716 wc->process_func(root, next, wc, ptr_gen);
@@ -2032,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2032 wait_log_commit(trans, log_root_tree, 2051 wait_log_commit(trans, log_root_tree,
2033 log_root_tree->log_transid); 2052 log_root_tree->log_transid);
2034 mutex_unlock(&log_root_tree->log_mutex); 2053 mutex_unlock(&log_root_tree->log_mutex);
2054 ret = 0;
2035 goto out; 2055 goto out;
2036 } 2056 }
2037 atomic_set(&log_root_tree->log_commit[index2], 1); 2057 atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2096,7 +2116,7 @@ out:
2096 smp_mb(); 2116 smp_mb();
2097 if (waitqueue_active(&root->log_commit_wait[index1])) 2117 if (waitqueue_active(&root->log_commit_wait[index1]))
2098 wake_up(&root->log_commit_wait[index1]); 2118 wake_up(&root->log_commit_wait[index1]);
2099 return 0; 2119 return ret;
2100} 2120}
2101 2121
2102static void free_log_tree(struct btrfs_trans_handle *trans, 2122static void free_log_tree(struct btrfs_trans_handle *trans,
@@ -2194,6 +2214,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2194 2214
2195 log = root->log_root; 2215 log = root->log_root;
2196 path = btrfs_alloc_path(); 2216 path = btrfs_alloc_path();
2217 if (!path)
2218 return -ENOMEM;
2219
2197 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2220 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2198 name, name_len, -1); 2221 name, name_len, -1);
2199 if (IS_ERR(di)) { 2222 if (IS_ERR(di)) {
@@ -2594,6 +2617,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2594 2617
2595 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2618 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2596 nr * sizeof(u32), GFP_NOFS); 2619 nr * sizeof(u32), GFP_NOFS);
2620 if (!ins_data)
2621 return -ENOMEM;
2622
2597 ins_sizes = (u32 *)ins_data; 2623 ins_sizes = (u32 *)ins_data;
2598 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2624 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2599 2625
@@ -2725,7 +2751,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2725 log = root->log_root; 2751 log = root->log_root;
2726 2752
2727 path = btrfs_alloc_path(); 2753 path = btrfs_alloc_path();
2754 if (!path)
2755 return -ENOMEM;
2728 dst_path = btrfs_alloc_path(); 2756 dst_path = btrfs_alloc_path();
2757 if (!dst_path) {
2758 btrfs_free_path(path);
2759 return -ENOMEM;
2760 }
2729 2761
2730 min_key.objectid = inode->i_ino; 2762 min_key.objectid = inode->i_ino;
2731 min_key.type = BTRFS_INODE_ITEM_KEY; 2763 min_key.type = BTRFS_INODE_ITEM_KEY;
@@ -3080,6 +3112,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3080 BUG_ON(!path); 3112 BUG_ON(!path);
3081 3113
3082 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3114 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3115 BUG_ON(IS_ERR(trans));
3083 3116
3084 wc.trans = trans; 3117 wc.trans = trans;
3085 wc.pin = 1; 3118 wc.pin = 1;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1718e1a5c320..2636a051e4b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include "compat.h" 27#include "compat.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -600,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
600 set_blocksize(bdev, 4096); 601 set_blocksize(bdev, 4096);
601 602
602 bh = btrfs_read_dev_super(bdev); 603 bh = btrfs_read_dev_super(bdev);
603 if (!bh) 604 if (!bh) {
605 ret = -EINVAL;
604 goto error_close; 606 goto error_close;
607 }
605 608
606 disk_super = (struct btrfs_super_block *)bh->b_data; 609 disk_super = (struct btrfs_super_block *)bh->b_data;
607 devid = btrfs_stack_device_id(&disk_super->dev_item); 610 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -703,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
703 goto error_close; 706 goto error_close;
704 bh = btrfs_read_dev_super(bdev); 707 bh = btrfs_read_dev_super(bdev);
705 if (!bh) { 708 if (!bh) {
706 ret = -EIO; 709 ret = -EINVAL;
707 goto error_close; 710 goto error_close;
708 } 711 }
709 disk_super = (struct btrfs_super_block *)bh->b_data; 712 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -729,59 +732,167 @@ error:
729 return ret; 732 return ret;
730} 733}
731 734
735/* helper to account the used device space in the range */
736int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
737 u64 end, u64 *length)
738{
739 struct btrfs_key key;
740 struct btrfs_root *root = device->dev_root;
741 struct btrfs_dev_extent *dev_extent;
742 struct btrfs_path *path;
743 u64 extent_end;
744 int ret;
745 int slot;
746 struct extent_buffer *l;
747
748 *length = 0;
749
750 if (start >= device->total_bytes)
751 return 0;
752
753 path = btrfs_alloc_path();
754 if (!path)
755 return -ENOMEM;
756 path->reada = 2;
757
758 key.objectid = device->devid;
759 key.offset = start;
760 key.type = BTRFS_DEV_EXTENT_KEY;
761
762 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
763 if (ret < 0)
764 goto out;
765 if (ret > 0) {
766 ret = btrfs_previous_item(root, path, key.objectid, key.type);
767 if (ret < 0)
768 goto out;
769 }
770
771 while (1) {
772 l = path->nodes[0];
773 slot = path->slots[0];
774 if (slot >= btrfs_header_nritems(l)) {
775 ret = btrfs_next_leaf(root, path);
776 if (ret == 0)
777 continue;
778 if (ret < 0)
779 goto out;
780
781 break;
782 }
783 btrfs_item_key_to_cpu(l, &key, slot);
784
785 if (key.objectid < device->devid)
786 goto next;
787
788 if (key.objectid > device->devid)
789 break;
790
791 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
792 goto next;
793
794 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
795 extent_end = key.offset + btrfs_dev_extent_length(l,
796 dev_extent);
797 if (key.offset <= start && extent_end > end) {
798 *length = end - start + 1;
799 break;
800 } else if (key.offset <= start && extent_end > start)
801 *length += extent_end - start;
802 else if (key.offset > start && extent_end <= end)
803 *length += extent_end - key.offset;
804 else if (key.offset > start && key.offset <= end) {
805 *length += end - key.offset + 1;
806 break;
807 } else if (key.offset > end)
808 break;
809
810next:
811 path->slots[0]++;
812 }
813 ret = 0;
814out:
815 btrfs_free_path(path);
816 return ret;
817}
818
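The overlap branches above simply clip each dev extent to the query window [start, end]. For instance (illustrative numbers), accounting over [0, 10M - 1] with dev extents at [2M, 5M) and [8M, 12M) adds extent_end - key.offset = 3M for the first (fully inside the window) and end - key.offset + 1 = 2M for the second (straddling the right edge), then breaks out, for *length = 5M.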
732/* 819/*
820 * find_free_dev_extent - find free space in the specified device
821 * @trans: transaction handle
822 * @device: the device in which we search for the free space
823 * @num_bytes: the size of the free space that we need
824 * @start: store the start of the free space.
825 * @len: the size of the free space that we find, or the size of the max
826 * free space if we don't find suitable free space
827 *
733 * this uses a pretty simple search, the expectation is that it is 828 * this uses a pretty simple search, the expectation is that it is
734 * called very infrequently and that a given device has a small number 829 * called very infrequently and that a given device has a small number
735 * of extents 830 * of extents
831 *
832 * @start is used to store the start of the free space if we find it. But if we
833 * don't find suitable free space, it will be used to store the start position
834 * of the max free space.
835 *
836 * @len is used to store the size of the free space that we find.
837 * But if we don't find suitable free space, it is used to store the size of
838 * the max free space.
736 */ 839 */
737int find_free_dev_extent(struct btrfs_trans_handle *trans, 840int find_free_dev_extent(struct btrfs_trans_handle *trans,
738 struct btrfs_device *device, u64 num_bytes, 841 struct btrfs_device *device, u64 num_bytes,
739 u64 *start, u64 *max_avail) 842 u64 *start, u64 *len)
740{ 843{
741 struct btrfs_key key; 844 struct btrfs_key key;
742 struct btrfs_root *root = device->dev_root; 845 struct btrfs_root *root = device->dev_root;
743 struct btrfs_dev_extent *dev_extent = NULL; 846 struct btrfs_dev_extent *dev_extent;
744 struct btrfs_path *path; 847 struct btrfs_path *path;
745 u64 hole_size = 0; 848 u64 hole_size;
746 u64 last_byte = 0; 849 u64 max_hole_start;
747 u64 search_start = 0; 850 u64 max_hole_size;
851 u64 extent_end;
852 u64 search_start;
748 u64 search_end = device->total_bytes; 853 u64 search_end = device->total_bytes;
749 int ret; 854 int ret;
750 int slot = 0; 855 int slot;
751 int start_found;
752 struct extent_buffer *l; 856 struct extent_buffer *l;
753 857
754 path = btrfs_alloc_path();
755 if (!path)
756 return -ENOMEM;
757 path->reada = 2;
758 start_found = 0;
759
760 /* FIXME use last free of some kind */ 858 /* FIXME use last free of some kind */
761 859
762 /* we don't want to overwrite the superblock on the drive, 860 /* we don't want to overwrite the superblock on the drive,
763 * so we make sure to start at an offset of at least 1MB 861 * so we make sure to start at an offset of at least 1MB
764 */ 862 */
765 search_start = max((u64)1024 * 1024, search_start); 863 search_start = 1024 * 1024;
766 864
767 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 865 if (root->fs_info->alloc_start + num_bytes <= search_end)
768 search_start = max(root->fs_info->alloc_start, search_start); 866 search_start = max(root->fs_info->alloc_start, search_start);
769 867
868 max_hole_start = search_start;
869 max_hole_size = 0;
870
871 if (search_start >= search_end) {
872 ret = -ENOSPC;
873 goto error;
874 }
875
876 path = btrfs_alloc_path();
877 if (!path) {
878 ret = -ENOMEM;
879 goto error;
880 }
881 path->reada = 2;
882
770 key.objectid = device->devid; 883 key.objectid = device->devid;
771 key.offset = search_start; 884 key.offset = search_start;
772 key.type = BTRFS_DEV_EXTENT_KEY; 885 key.type = BTRFS_DEV_EXTENT_KEY;
886
773 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 887 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
774 if (ret < 0) 888 if (ret < 0)
775 goto error; 889 goto out;
776 if (ret > 0) { 890 if (ret > 0) {
777 ret = btrfs_previous_item(root, path, key.objectid, key.type); 891 ret = btrfs_previous_item(root, path, key.objectid, key.type);
778 if (ret < 0) 892 if (ret < 0)
779 goto error; 893 goto out;
780 if (ret > 0)
781 start_found = 1;
782 } 894 }
783 l = path->nodes[0]; 895
784 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
785 while (1) { 896 while (1) {
786 l = path->nodes[0]; 897 l = path->nodes[0];
787 slot = path->slots[0]; 898 slot = path->slots[0];
@@ -790,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
790 if (ret == 0) 901 if (ret == 0)
791 continue; 902 continue;
792 if (ret < 0) 903 if (ret < 0)
793 goto error; 904 goto out;
794no_more_items: 905
795 if (!start_found) { 906 break;
796 if (search_start >= search_end) {
797 ret = -ENOSPC;
798 goto error;
799 }
800 *start = search_start;
801 start_found = 1;
802 goto check_pending;
803 }
804 *start = last_byte > search_start ?
805 last_byte : search_start;
806 if (search_end <= *start) {
807 ret = -ENOSPC;
808 goto error;
809 }
810 goto check_pending;
811 } 907 }
812 btrfs_item_key_to_cpu(l, &key, slot); 908 btrfs_item_key_to_cpu(l, &key, slot);
813 909
@@ -815,48 +911,62 @@ no_more_items:
815 goto next; 911 goto next;
816 912
817 if (key.objectid > device->devid) 913 if (key.objectid > device->devid)
818 goto no_more_items; 914 break;
819 915
820 if (key.offset >= search_start && key.offset > last_byte && 916 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
821 start_found) { 917 goto next;
822 if (last_byte < search_start)
823 last_byte = search_start;
824 hole_size = key.offset - last_byte;
825 918
826 if (hole_size > *max_avail) 919 if (key.offset > search_start) {
827 *max_avail = hole_size; 920 hole_size = key.offset - search_start;
828 921
829 if (key.offset > last_byte && 922 if (hole_size > max_hole_size) {
830 hole_size >= num_bytes) { 923 max_hole_start = search_start;
831 *start = last_byte; 924 max_hole_size = hole_size;
832 goto check_pending; 925 }
926
927 /*
928			 * If this free space is greater than what we need,
929 * it must be the max free space that we have found
930 * until now, so max_hole_start must point to the start
931 * of this free space and the length of this free space
932 * is stored in max_hole_size. Thus, we return
933 * max_hole_start and max_hole_size and go back to the
934 * caller.
935 */
936 if (hole_size >= num_bytes) {
937 ret = 0;
938 goto out;
833 } 939 }
834 } 940 }
835 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
836 goto next;
837 941
838 start_found = 1;
839 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 942 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
840 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 943 extent_end = key.offset + btrfs_dev_extent_length(l,
944 dev_extent);
945 if (extent_end > search_start)
946 search_start = extent_end;
841next: 947next:
842 path->slots[0]++; 948 path->slots[0]++;
843 cond_resched(); 949 cond_resched();
844 } 950 }
845check_pending:
846 /* we have to make sure we didn't find an extent that has already
847 * been allocated by the map tree or the original allocation
848 */
849 BUG_ON(*start < search_start);
850 951
851	if (*start + num_bytes > search_end) { 952	hole_size = search_end - search_start;
852 ret = -ENOSPC; 953 if (hole_size > max_hole_size) {
853 goto error; 954 max_hole_start = search_start;
955 max_hole_size = hole_size;
854 } 956 }
855 /* check for pending inserts here */
856 ret = 0;
857 957
858error: 958 /* See above. */
959 if (hole_size < num_bytes)
960 ret = -ENOSPC;
961 else
962 ret = 0;
963
964out:
859 btrfs_free_path(path); 965 btrfs_free_path(path);
966error:
967 *start = max_hole_start;
968 if (len)
969 *len = max_hole_size;
860 return ret; 970 return ret;
861} 971}
862 972
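To see the new hole tracking in action (illustrative numbers): on a 20 MiB device with dev extents at [1M, 5M) and [9M, 12M), search_start advances past each extent, recording the 4 MiB hole at 5M as the running maximum, and the trailing hole search_end - search_start = 8 MiB at 12M then replaces it. A request for 6 MiB therefore returns *start = 12M with ret = 0; a request for 10 MiB returns -ENOSPC but still reports the 8 MiB maximum through *start and *len, letting callers such as the chunk allocator retry with a smaller size.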
@@ -1103,6 +1213,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1103 return -ENOMEM; 1213 return -ENOMEM;
1104 1214
1105 trans = btrfs_start_transaction(root, 0); 1215 trans = btrfs_start_transaction(root, 0);
1216 if (IS_ERR(trans)) {
1217 btrfs_free_path(path);
1218 return PTR_ERR(trans);
1219 }
1106 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1220 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1107 key.type = BTRFS_DEV_ITEM_KEY; 1221 key.type = BTRFS_DEV_ITEM_KEY;
1108 key.offset = device->devid; 1222 key.offset = device->devid;
@@ -1196,7 +1310,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1196 set_blocksize(bdev, 4096); 1310 set_blocksize(bdev, 4096);
1197 bh = btrfs_read_dev_super(bdev); 1311 bh = btrfs_read_dev_super(bdev);
1198 if (!bh) { 1312 if (!bh) {
1199 ret = -EIO; 1313 ret = -EINVAL;
1200 goto error_close; 1314 goto error_close;
1201 } 1315 }
1202 disk_super = (struct btrfs_super_block *)bh->b_data; 1316 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1496,6 +1610,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1496 } 1610 }
1497 1611
1498 trans = btrfs_start_transaction(root, 0); 1612 trans = btrfs_start_transaction(root, 0);
1613 if (IS_ERR(trans)) {
1614 kfree(device);
1615 ret = PTR_ERR(trans);
1616 goto error;
1617 }
1618
1499 lock_chunks(root); 1619 lock_chunks(root);
1500 1620
1501 device->writeable = 1; 1621 device->writeable = 1;
@@ -1763,7 +1883,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1763 return ret; 1883 return ret;
1764 1884
1765 trans = btrfs_start_transaction(root, 0); 1885 trans = btrfs_start_transaction(root, 0);
1766 BUG_ON(!trans); 1886 BUG_ON(IS_ERR(trans));
1767 1887
1768 lock_chunks(root); 1888 lock_chunks(root);
1769 1889
@@ -1916,6 +2036,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1916 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2036 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1917 return -EROFS; 2037 return -EROFS;
1918 2038
2039 if (!capable(CAP_SYS_ADMIN))
2040 return -EPERM;
2041
1919 mutex_lock(&dev_root->fs_info->volume_mutex); 2042 mutex_lock(&dev_root->fs_info->volume_mutex);
1920 dev_root = dev_root->fs_info->dev_root; 2043 dev_root = dev_root->fs_info->dev_root;
1921 2044
@@ -1934,7 +2057,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1934 BUG_ON(ret); 2057 BUG_ON(ret);
1935 2058
1936 trans = btrfs_start_transaction(dev_root, 0); 2059 trans = btrfs_start_transaction(dev_root, 0);
1937 BUG_ON(!trans); 2060 BUG_ON(IS_ERR(trans));
1938 2061
1939 ret = btrfs_grow_device(trans, device, old_size); 2062 ret = btrfs_grow_device(trans, device, old_size);
1940 BUG_ON(ret); 2063 BUG_ON(ret);
@@ -2100,6 +2223,11 @@ again:
2100 2223
2101 /* Shrinking succeeded, else we would be at "done". */ 2224 /* Shrinking succeeded, else we would be at "done". */
2102 trans = btrfs_start_transaction(root, 0); 2225 trans = btrfs_start_transaction(root, 0);
2226 if (IS_ERR(trans)) {
2227 ret = PTR_ERR(trans);
2228 goto done;
2229 }
2230
2103 lock_chunks(root); 2231 lock_chunks(root);
2104 2232
2105 device->disk_total_bytes = new_size; 2233 device->disk_total_bytes = new_size;
@@ -2154,66 +2282,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2154 return calc_size * num_stripes; 2282 return calc_size * num_stripes;
2155} 2283}
2156 2284
2157static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2285/* Used to sort the devices by max_avail(descending sort) */
2158 struct btrfs_root *extent_root, 2286int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2159 struct map_lookup **map_ret,
2160 u64 *num_bytes, u64 *stripe_size,
2161 u64 start, u64 type)
2162{ 2287{
2163 struct btrfs_fs_info *info = extent_root->fs_info; 2288 if (((struct btrfs_device_info *)dev_info1)->max_avail >
2164 struct btrfs_device *device = NULL; 2289 ((struct btrfs_device_info *)dev_info2)->max_avail)
2165 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2290 return -1;
2166 struct list_head *cur; 2291 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2167 struct map_lookup *map = NULL; 2292 ((struct btrfs_device_info *)dev_info2)->max_avail)
2168 struct extent_map_tree *em_tree; 2293 return 1;
2169 struct extent_map *em; 2294 else
2170 struct list_head private_devs; 2295 return 0;
2171 int min_stripe_size = 1 * 1024 * 1024; 2296}
2172 u64 calc_size = 1024 * 1024 * 1024;
2173 u64 max_chunk_size = calc_size;
2174 u64 min_free;
2175 u64 avail;
2176 u64 max_avail = 0;
2177 u64 dev_offset;
2178 int num_stripes = 1;
2179 int min_stripes = 1;
2180 int sub_stripes = 0;
2181 int looped = 0;
2182 int ret;
2183 int index;
2184 int stripe_len = 64 * 1024;
2185 2297
2186 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2298static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2187 (type & BTRFS_BLOCK_GROUP_DUP)) { 2299 int *num_stripes, int *min_stripes,
2188 WARN_ON(1); 2300 int *sub_stripes)
2189 type &= ~BTRFS_BLOCK_GROUP_DUP; 2301{
2190 } 2302 *num_stripes = 1;
2191 if (list_empty(&fs_devices->alloc_list)) 2303 *min_stripes = 1;
2192 return -ENOSPC; 2304 *sub_stripes = 0;
2193 2305
2194 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2306 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2195 num_stripes = fs_devices->rw_devices; 2307 *num_stripes = fs_devices->rw_devices;
2196 min_stripes = 2; 2308 *min_stripes = 2;
2197 } 2309 }
2198 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2310 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2199 num_stripes = 2; 2311 *num_stripes = 2;
2200 min_stripes = 2; 2312 *min_stripes = 2;
2201 } 2313 }
2202 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2314 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2203 if (fs_devices->rw_devices < 2) 2315 if (fs_devices->rw_devices < 2)
2204 return -ENOSPC; 2316 return -ENOSPC;
2205 num_stripes = 2; 2317 *num_stripes = 2;
2206 min_stripes = 2; 2318 *min_stripes = 2;
2207 } 2319 }
2208 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2320 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2209 num_stripes = fs_devices->rw_devices; 2321 *num_stripes = fs_devices->rw_devices;
2210 if (num_stripes < 4) 2322 if (*num_stripes < 4)
2211 return -ENOSPC; 2323 return -ENOSPC;
2212 num_stripes &= ~(u32)1; 2324 *num_stripes &= ~(u32)1;
2213 sub_stripes = 2; 2325 *sub_stripes = 2;
2214 min_stripes = 4; 2326 *min_stripes = 4;
2215 } 2327 }
2216 2328
2329 return 0;
2330}
2331
2332static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2333 u64 proposed_size, u64 type,
2334 int num_stripes, int small_stripe)
2335{
2336 int min_stripe_size = 1 * 1024 * 1024;
2337 u64 calc_size = proposed_size;
2338 u64 max_chunk_size = calc_size;
2339 int ncopies = 1;
2340
2341 if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2342 BTRFS_BLOCK_GROUP_DUP |
2343 BTRFS_BLOCK_GROUP_RAID10))
2344 ncopies = 2;
2345
2217 if (type & BTRFS_BLOCK_GROUP_DATA) { 2346 if (type & BTRFS_BLOCK_GROUP_DATA) {
2218 max_chunk_size = 10 * calc_size; 2347 max_chunk_size = 10 * calc_size;
2219 min_stripe_size = 64 * 1024 * 1024; 2348 min_stripe_size = 64 * 1024 * 1024;
@@ -2230,51 +2359,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2230 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2359 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2231 max_chunk_size); 2360 max_chunk_size);
2232 2361
2233again: 2362 if (calc_size * num_stripes > max_chunk_size * ncopies) {
2234 max_avail = 0; 2363 calc_size = max_chunk_size * ncopies;
2235 if (!map || map->num_stripes != num_stripes) {
2236 kfree(map);
2237 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2238 if (!map)
2239 return -ENOMEM;
2240 map->num_stripes = num_stripes;
2241 }
2242
2243 if (calc_size * num_stripes > max_chunk_size) {
2244 calc_size = max_chunk_size;
2245 do_div(calc_size, num_stripes); 2364 do_div(calc_size, num_stripes);
2246 do_div(calc_size, stripe_len); 2365 do_div(calc_size, BTRFS_STRIPE_LEN);
2247 calc_size *= stripe_len; 2366 calc_size *= BTRFS_STRIPE_LEN;
2248 } 2367 }
2249 2368
2250 /* we don't want tiny stripes */ 2369 /* we don't want tiny stripes */
2251 if (!looped) 2370 if (!small_stripe)
2252 calc_size = max_t(u64, min_stripe_size, calc_size); 2371 calc_size = max_t(u64, min_stripe_size, calc_size);
2253 2372
2254 /* 2373 /*
2255 * we're about to do_div by the stripe_len so lets make sure 2374 * we're about to do_div by BTRFS_STRIPE_LEN so let's make sure
2256 * we end up with something bigger than a stripe 2375 * we end up with something bigger than a stripe
2257 */ 2376 */
2258 calc_size = max_t(u64, calc_size, stripe_len * 4); 2377 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2378
2379 do_div(calc_size, BTRFS_STRIPE_LEN);
2380 calc_size *= BTRFS_STRIPE_LEN;
2381
2382 return calc_size;
2383}
2384
2385static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2386 int num_stripes)
2387{
2388 struct map_lookup *new;
2389 size_t len = map_lookup_size(num_stripes);
2390
2391 BUG_ON(map->num_stripes < num_stripes);
2392
2393 if (map->num_stripes == num_stripes)
2394 return map;
2395
2396 new = kmalloc(len, GFP_NOFS);
2397 if (!new) {
2398 /* just change map->num_stripes */
2399 map->num_stripes = num_stripes;
2400 return map;
2401 }
2402
2403 memcpy(new, map, len);
2404 new->num_stripes = num_stripes;
2405 kfree(map);
2406 return new;
2407}
2408
2409/*
2410 * helper to allocate device space from btrfs_device_info, in which we store
2411 * the max free space of every device. It is used when we cannot allocate
2412 * chunks of the default size.
2413 *
2414 * With this helper, we can allocate a new chunk that is as large as possible.
2415 */
2416static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2417 struct btrfs_fs_devices *fs_devices,
2418 struct btrfs_device_info *devices,
2419 int nr_device, u64 type,
2420 struct map_lookup **map_lookup,
2421 int min_stripes, u64 *stripe_size)
2422{
2423 int i, index, sort_again = 0;
2424 int min_devices = min_stripes;
2425 u64 max_avail, min_free;
2426 struct map_lookup *map = *map_lookup;
2427 int ret;
2428
2429 if (nr_device < min_stripes)
2430 return -ENOSPC;
2431
2432 btrfs_descending_sort_devices(devices, nr_device);
2433
2434 max_avail = devices[0].max_avail;
2435 if (!max_avail)
2436 return -ENOSPC;
2437
2438 for (i = 0; i < nr_device; i++) {
2439		/*
2440		 * if dev_offset is 0, the free space of this device was less
2441		 * than what we needed, and we have not yet searched for the
2442		 * max avail extent on this device, so do that now.
2443		 */
2444 if (!devices[i].dev_offset) {
2445 ret = find_free_dev_extent(trans, devices[i].dev,
2446 max_avail,
2447 &devices[i].dev_offset,
2448 &devices[i].max_avail);
2449 if (ret != 0 && ret != -ENOSPC)
2450 return ret;
2451 sort_again = 1;
2452 }
2453 }
2454
2455	/* we updated the max avail free extent of each device, so sort again */
2456 if (sort_again)
2457 btrfs_descending_sort_devices(devices, nr_device);
2458
2459 if (type & BTRFS_BLOCK_GROUP_DUP)
2460 min_devices = 1;
2461
2462 if (!devices[min_devices - 1].max_avail)
2463 return -ENOSPC;
2464
2465 max_avail = devices[min_devices - 1].max_avail;
2466 if (type & BTRFS_BLOCK_GROUP_DUP)
2467 do_div(max_avail, 2);
2468
2469 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2470 min_stripes, 1);
2471 if (type & BTRFS_BLOCK_GROUP_DUP)
2472 min_free = max_avail * 2;
2473 else
2474 min_free = max_avail;
2475
2476 if (min_free > devices[min_devices - 1].max_avail)
2477 return -ENOSPC;
2478
2479 map = __shrink_map_lookup_stripes(map, min_stripes);
2480 *stripe_size = max_avail;
2481
2482 index = 0;
2483 for (i = 0; i < min_stripes; i++) {
2484 map->stripes[i].dev = devices[index].dev;
2485 map->stripes[i].physical = devices[index].dev_offset;
2486 if (type & BTRFS_BLOCK_GROUP_DUP) {
2487 i++;
2488 map->stripes[i].dev = devices[index].dev;
2489 map->stripes[i].physical = devices[index].dev_offset +
2490 max_avail;
2491 }
2492 index++;
2493 }
2494 *map_lookup = map;
2259 2495
2260 do_div(calc_size, stripe_len); 2496 return 0;
2261 calc_size *= stripe_len; 2497}
2498
2499static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2500 struct btrfs_root *extent_root,
2501 struct map_lookup **map_ret,
2502 u64 *num_bytes, u64 *stripe_size,
2503 u64 start, u64 type)
2504{
2505 struct btrfs_fs_info *info = extent_root->fs_info;
2506 struct btrfs_device *device = NULL;
2507 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2508 struct list_head *cur;
2509 struct map_lookup *map;
2510 struct extent_map_tree *em_tree;
2511 struct extent_map *em;
2512 struct btrfs_device_info *devices_info;
2513 struct list_head private_devs;
2514 u64 calc_size = 1024 * 1024 * 1024;
2515 u64 min_free;
2516 u64 avail;
2517 u64 dev_offset;
2518 int num_stripes;
2519 int min_stripes;
2520 int sub_stripes;
2521 int min_devices; /* the min number of devices we need */
2522 int i;
2523 int ret;
2524 int index;
2525
2526 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2527 (type & BTRFS_BLOCK_GROUP_DUP)) {
2528 WARN_ON(1);
2529 type &= ~BTRFS_BLOCK_GROUP_DUP;
2530 }
2531 if (list_empty(&fs_devices->alloc_list))
2532 return -ENOSPC;
2533
2534 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2535 &min_stripes, &sub_stripes);
2536 if (ret)
2537 return ret;
2538
2539 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2540 GFP_NOFS);
2541 if (!devices_info)
2542 return -ENOMEM;
2543
2544 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2545 if (!map) {
2546 ret = -ENOMEM;
2547 goto error;
2548 }
2549 map->num_stripes = num_stripes;
2262 2550
2263 cur = fs_devices->alloc_list.next; 2551 cur = fs_devices->alloc_list.next;
2264 index = 0; 2552 index = 0;
2553 i = 0;
2265 2554
2266 if (type & BTRFS_BLOCK_GROUP_DUP) 2555 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2556 num_stripes, 0);
2557
2558 if (type & BTRFS_BLOCK_GROUP_DUP) {
2267 min_free = calc_size * 2; 2559 min_free = calc_size * 2;
2268 else 2560 min_devices = 1;
2561 } else {
2269 min_free = calc_size; 2562 min_free = calc_size;
2270 2563 min_devices = min_stripes;
2271 /* 2564 }
2272 * we add 1MB because we never use the first 1MB of the device, unless
2273 * we've looped, then we are likely allocating the maximum amount of
2274 * space left already
2275 */
2276 if (!looped)
2277 min_free += 1024 * 1024;
2278 2565
2279 INIT_LIST_HEAD(&private_devs); 2566 INIT_LIST_HEAD(&private_devs);
2280 while (index < num_stripes) { 2567 while (index < num_stripes) {
@@ -2287,27 +2574,39 @@ again:
2287 cur = cur->next; 2574 cur = cur->next;
2288 2575
2289 if (device->in_fs_metadata && avail >= min_free) { 2576 if (device->in_fs_metadata && avail >= min_free) {
2290 ret = find_free_dev_extent(trans, device, 2577 ret = find_free_dev_extent(trans, device, min_free,
2291 min_free, &dev_offset, 2578 &devices_info[i].dev_offset,
2292 &max_avail); 2579 &devices_info[i].max_avail);
2293 if (ret == 0) { 2580 if (ret == 0) {
2294 list_move_tail(&device->dev_alloc_list, 2581 list_move_tail(&device->dev_alloc_list,
2295 &private_devs); 2582 &private_devs);
2296 map->stripes[index].dev = device; 2583 map->stripes[index].dev = device;
2297 map->stripes[index].physical = dev_offset; 2584 map->stripes[index].physical =
2585 devices_info[i].dev_offset;
2298 index++; 2586 index++;
2299 if (type & BTRFS_BLOCK_GROUP_DUP) { 2587 if (type & BTRFS_BLOCK_GROUP_DUP) {
2300 map->stripes[index].dev = device; 2588 map->stripes[index].dev = device;
2301 map->stripes[index].physical = 2589 map->stripes[index].physical =
2302 dev_offset + calc_size; 2590 devices_info[i].dev_offset +
2591 calc_size;
2303 index++; 2592 index++;
2304 } 2593 }
2305 } 2594 } else if (ret != -ENOSPC)
2306 } else if (device->in_fs_metadata && avail > max_avail) 2595 goto error;
2307 max_avail = avail; 2596
2597 devices_info[i].dev = device;
2598 i++;
2599 } else if (device->in_fs_metadata &&
2600 avail >= BTRFS_STRIPE_LEN) {
2601 devices_info[i].dev = device;
2602 devices_info[i].max_avail = avail;
2603 i++;
2604 }
2605
2308 if (cur == &fs_devices->alloc_list) 2606 if (cur == &fs_devices->alloc_list)
2309 break; 2607 break;
2310 } 2608 }
2609
2311 list_splice(&private_devs, &fs_devices->alloc_list); 2610 list_splice(&private_devs, &fs_devices->alloc_list);
2312 if (index < num_stripes) { 2611 if (index < num_stripes) {
2313 if (index >= min_stripes) { 2612 if (index >= min_stripes) {
@@ -2316,34 +2615,36 @@ again:
2316 num_stripes /= sub_stripes; 2615 num_stripes /= sub_stripes;
2317 num_stripes *= sub_stripes; 2616 num_stripes *= sub_stripes;
2318 } 2617 }
2319 looped = 1; 2618
2320 goto again; 2619 map = __shrink_map_lookup_stripes(map, num_stripes);
2321 } 2620 } else if (i >= min_devices) {
2322 if (!looped && max_avail > 0) { 2621 ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2323 looped = 1; 2622 devices_info, i, type,
2324 calc_size = max_avail; 2623 &map, min_stripes,
2325 goto again; 2624 &calc_size);
2625 if (ret)
2626 goto error;
2627 } else {
2628 ret = -ENOSPC;
2629 goto error;
2326 } 2630 }
2327 kfree(map);
2328 return -ENOSPC;
2329 } 2631 }
2330 map->sector_size = extent_root->sectorsize; 2632 map->sector_size = extent_root->sectorsize;
2331 map->stripe_len = stripe_len; 2633 map->stripe_len = BTRFS_STRIPE_LEN;
2332 map->io_align = stripe_len; 2634 map->io_align = BTRFS_STRIPE_LEN;
2333 map->io_width = stripe_len; 2635 map->io_width = BTRFS_STRIPE_LEN;
2334 map->type = type; 2636 map->type = type;
2335 map->num_stripes = num_stripes;
2336 map->sub_stripes = sub_stripes; 2637 map->sub_stripes = sub_stripes;
2337 2638
2338 *map_ret = map; 2639 *map_ret = map;
2339 *stripe_size = calc_size; 2640 *stripe_size = calc_size;
2340 *num_bytes = chunk_bytes_by_type(type, calc_size, 2641 *num_bytes = chunk_bytes_by_type(type, calc_size,
2341 num_stripes, sub_stripes); 2642 map->num_stripes, sub_stripes);
2342 2643
2343 em = alloc_extent_map(GFP_NOFS); 2644 em = alloc_extent_map(GFP_NOFS);
2344 if (!em) { 2645 if (!em) {
2345 kfree(map); 2646 ret = -ENOMEM;
2346 return -ENOMEM; 2647 goto error;
2347 } 2648 }
2348 em->bdev = (struct block_device *)map; 2649 em->bdev = (struct block_device *)map;
2349 em->start = start; 2650 em->start = start;
@@ -2376,7 +2677,13 @@ again:
2376 index++; 2677 index++;
2377 } 2678 }
2378 2679
2680 kfree(devices_info);
2379 return 0; 2681 return 0;
2682
2683error:
2684 kfree(map);
2685 kfree(devices_info);
2686 return ret;
2380} 2687}
2381 2688
2382static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2689static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
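
The new __btrfs_calc_stripe_size() rounds the proposed chunk size down to a
whole number of stripes with a divide-then-multiply on BTRFS_STRIPE_LEN. A
minimal userspace sketch of that idiom, with plain 64-bit division standing
in for do_div() and an illustrative input value:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_STRIPE_LEN	(64 * 1024)

static uint64_t round_down_to_stripe(uint64_t calc_size)
{
	calc_size /= BTRFS_STRIPE_LEN;	/* do_div(calc_size, BTRFS_STRIPE_LEN) */
	calc_size *= BTRFS_STRIPE_LEN;
	return calc_size;
}

int main(void)
{
	/* 1 GiB + 5000 bytes comes back as an exact stripe multiple */
	printf("%llu\n",
	       (unsigned long long)round_down_to_stripe((1ULL << 30) + 5000));
	return 0;
}
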
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1be781079450..7fb59d45fe8c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -136,6 +139,30 @@ struct btrfs_multi_bio {
136 struct btrfs_bio_stripe stripes[]; 139 struct btrfs_bio_stripe stripes[];
137}; 140};
138 141
142struct btrfs_device_info {
143 struct btrfs_device *dev;
144 u64 dev_offset;
145 u64 max_avail;
146};
147
148/* Used to sort the devices by max_avail (descending sort) */
149int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
150
151/*
152 * sort the devices by max_avail, which holds the max free extent size of
153 * each device (descending sort)
154 */
155static inline void btrfs_descending_sort_devices(
156 struct btrfs_device_info *devices,
157 size_t nr_devices)
158{
159 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
160 btrfs_cmp_device_free_bytes, NULL);
161}
162
163int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
164 u64 end, u64 *length);
165
139#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 166#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
140 (sizeof(struct btrfs_bio_stripe) * (n))) 167 (sizeof(struct btrfs_bio_stripe) * (n)))
141 168
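
btrfs_descending_sort_devices() hands btrfs_cmp_device_free_bytes() to the
kernel's sort(); returning a negative value when the first entry has more
free space is what makes the order descending. A userspace sketch of the same
comparator contract, using qsort() in place of sort() (the struct is a
stand-in for struct btrfs_device_info):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct device_info_demo {
	uint64_t max_avail;
};

/* negative when the first entry has more free space -> descending order */
static int cmp_free_bytes(const void *a, const void *b)
{
	const struct device_info_demo *d1 = a, *d2 = b;

	if (d1->max_avail > d2->max_avail)
		return -1;
	if (d1->max_avail < d2->max_avail)
		return 1;
	return 0;
}

int main(void)
{
	struct device_info_demo devs[] = { {10}, {300}, {42} };

	qsort(devs, 3, sizeof(devs[0]), cmp_free_bytes);
	printf("%llu %llu %llu\n",	/* 300 42 10 */
	       (unsigned long long)devs[0].max_avail,
	       (unsigned long long)devs[1].max_avail,
	       (unsigned long long)devs[2].max_avail);
	return 0;
}
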
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739c..a5776531dc2b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, 316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
317 size_t size, int flags) 317 size_t size, int flags)
318{ 318{
319 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
320
321 /*
322	 * Permission for security.* and system.* attributes is not
323	 * checked by permission().
324 */
325 if (btrfs_root_readonly(root))
326 return -EROFS;
327
319 /* 328 /*
320 * If this is a request for a synthetic attribute in the system.* 329 * If this is a request for a synthetic attribute in the system.*
321 * namespace use the generic infrastructure to resolve a handler 330 * namespace use the generic infrastructure to resolve a handler
@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
336 345
337int btrfs_removexattr(struct dentry *dentry, const char *name) 346int btrfs_removexattr(struct dentry *dentry, const char *name)
338{ 347{
348 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
349
350 /*
351	 * Permission for security.* and system.* attributes is not
352	 * checked by permission().
353 */
354 if (btrfs_root_readonly(root))
355 return -EROFS;
356
339 /* 357 /*
340 * If this is a request for a synthetic attribute in the system.* 358 * If this is a request for a synthetic attribute in the system.*
341 * namespace use the generic infrastructure to resolve a handler 359 * namespace use the generic infrastructure to resolve a handler
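
Both xattr hunks add the same guard: security.* and system.* writes bypass
the generic permission() check, so the handler itself must refuse a read-only
root before dispatching. A minimal sketch of that ordering, with a
hypothetical readonly flag standing in for btrfs_root_readonly():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_root {
	bool readonly;	/* stand-in for the btrfs root-readonly state */
};

static int demo_setxattr(struct demo_root *root, const char *name)
{
	/* reject writes before any handler dispatch, as the hunks do */
	if (root->readonly)
		return -EROFS;
	printf("would set %s\n", name);
	return 0;
}

int main(void)
{
	struct demo_root ro = { .readonly = true };

	printf("%d\n", demo_setxattr(&ro, "user.test"));	/* -EROFS */
	return 0;
}
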
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b9cd5445f71c..f5ec2d44150d 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include "compression.h" 33#include "compression.h"
34 34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace { 35struct workspace {
45 z_stream inf_strm; 36 z_stream inf_strm;
46 z_stream def_strm; 37 z_stream def_strm;
@@ -48,152 +39,51 @@ struct workspace {
48 struct list_head list; 39 struct list_head list;
49}; 40};
50 41
51static LIST_HEAD(idle_workspace); 42static void zlib_free_workspace(struct list_head *ws)
52static DEFINE_SPINLOCK(workspace_lock); 43{
53static unsigned long num_workspace; 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56 45
57/* 46 vfree(workspace->def_strm.workspace);
58 * this finds an available zlib workspace or allocates a new one 47 vfree(workspace->inf_strm.workspace);
59 * NULL or an ERR_PTR is returned if things go bad. 48 kfree(workspace->buf);
60 */ 49 kfree(workspace);
61static struct workspace *find_zlib_workspace(void) 50}
51
52static struct list_head *zlib_alloc_workspace(void)
62{ 53{
63 struct workspace *workspace; 54 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76 55
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) { 57 if (!workspace)
90 ret = -ENOMEM; 58 return ERR_PTR(-ENOMEM);
91 goto fail;
92 }
93 59
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) { 63 if (!workspace->def_strm.workspace ||
106 ret = -ENOMEM; 64 !workspace->inf_strm.workspace || !workspace->buf)
107 goto fail_kmalloc; 65 goto fail;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142 66
143 atomic_dec(&alloc_workspace); 67 INIT_LIST_HEAD(&workspace->list);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148 68
149/* 69 return &workspace->list;
150 * cleanup function for module exit 70fail:
151 */ 71 zlib_free_workspace(&workspace->list);
152static void free_workspaces(void) 72 return ERR_PTR(-ENOMEM);
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165} 73}
166 74
167/* 75static int zlib_compress_pages(struct list_head *ws,
168 * given an address space and start/len, compress the bytes. 76 struct address_space *mapping,
169 * 77 u64 start, unsigned long len,
170 * pages are allocated to hold the compressed result and stored 78 struct page **pages,
171 * in 'pages' 79 unsigned long nr_dest_pages,
172 * 80 unsigned long *out_pages,
173 * out_pages is used to return the number of pages allocated. There 81 unsigned long *total_in,
174 * may be pages allocated even if we return an error 82 unsigned long *total_out,
175 * 83 unsigned long max_out)
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller then len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{ 84{
85 struct workspace *workspace = list_entry(ws, struct workspace, list);
195 int ret; 86 int ret;
196 struct workspace *workspace;
197 char *data_in; 87 char *data_in;
198 char *cpage_out; 88 char *cpage_out;
199 int nr_pages = 0; 89 int nr_pages = 0;
@@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
205 *total_out = 0; 95 *total_out = 0;
206 *total_in = 0; 96 *total_in = 0;
207 97
208 workspace = find_zlib_workspace();
209 if (IS_ERR(workspace))
210 return -1;
211
212 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 98 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
213 printk(KERN_WARNING "deflateInit failed\n"); 99 printk(KERN_WARNING "deflateInit failed\n");
214 ret = -1; 100 ret = -1;
@@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
222 data_in = kmap(in_page); 108 data_in = kmap(in_page);
223 109
224 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 110 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
111 if (out_page == NULL) {
112 ret = -1;
113 goto out;
114 }
225 cpage_out = kmap(out_page); 115 cpage_out = kmap(out_page);
226 pages[0] = out_page; 116 pages[0] = out_page;
227 nr_pages = 1; 117 nr_pages = 1;
@@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
260 goto out; 150 goto out;
261 } 151 }
262 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 152 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
153 if (out_page == NULL) {
154 ret = -1;
155 goto out;
156 }
263 cpage_out = kmap(out_page); 157 cpage_out = kmap(out_page);
264 pages[nr_pages] = out_page; 158 pages[nr_pages] = out_page;
265 nr_pages++; 159 nr_pages++;
@@ -314,55 +208,26 @@ out:
314 kunmap(in_page); 208 kunmap(in_page);
315 page_cache_release(in_page); 209 page_cache_release(in_page);
316 } 210 }
317 free_workspace(workspace);
318 return ret; 211 return ret;
319} 212}
320 213
321/* 214static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
322 * pages_in is an array of pages with compressed data. 215 u64 disk_start,
323 * 216 struct bio_vec *bvec,
324 * disk_start is the starting logical offset of this array in the file 217 int vcnt,
325 * 218 size_t srclen)
326 * bvec is a bio_vec of pages from the file that we want to decompress into
327 *
328 * vcnt is the count of pages in the biovec
329 *
330 * srclen is the number of bytes in pages_in
331 *
332 * The basic idea is that we have a bio that was created by readpages.
333 * The pages in the bio are for the uncompressed data, and they may not
334 * be contiguous. They all correspond to the range of bytes covered by
335 * the compressed extent.
336 */
337int btrfs_zlib_decompress_biovec(struct page **pages_in,
338 u64 disk_start,
339 struct bio_vec *bvec,
340 int vcnt,
341 size_t srclen)
342{ 219{
343 int ret = 0; 220 struct workspace *workspace = list_entry(ws, struct workspace, list);
221 int ret = 0, ret2;
344 int wbits = MAX_WBITS; 222 int wbits = MAX_WBITS;
345 struct workspace *workspace;
346 char *data_in; 223 char *data_in;
347 size_t total_out = 0; 224 size_t total_out = 0;
348 unsigned long page_bytes_left;
349 unsigned long page_in_index = 0; 225 unsigned long page_in_index = 0;
350 unsigned long page_out_index = 0; 226 unsigned long page_out_index = 0;
351 struct page *page_out;
352 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 227 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
353 PAGE_CACHE_SIZE; 228 PAGE_CACHE_SIZE;
354 unsigned long buf_start; 229 unsigned long buf_start;
355 unsigned long buf_offset;
356 unsigned long bytes;
357 unsigned long working_bytes;
358 unsigned long pg_offset; 230 unsigned long pg_offset;
359 unsigned long start_byte;
360 unsigned long current_buf_start;
361 char *kaddr;
362
363 workspace = find_zlib_workspace();
364 if (IS_ERR(workspace))
365 return -ENOMEM;
366 231
367 data_in = kmap(pages_in[page_in_index]); 232 data_in = kmap(pages_in[page_in_index]);
368 workspace->inf_strm.next_in = data_in; 233 workspace->inf_strm.next_in = data_in;
@@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
372 workspace->inf_strm.total_out = 0; 237 workspace->inf_strm.total_out = 0;
373 workspace->inf_strm.next_out = workspace->buf; 238 workspace->inf_strm.next_out = workspace->buf;
374 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 239 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
375 page_out = bvec[page_out_index].bv_page;
376 page_bytes_left = PAGE_CACHE_SIZE;
377 pg_offset = 0; 240 pg_offset = 0;
378 241
379 /* If it's deflate, and it's got no preset dictionary, then 242 /* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
389 252
390 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 253 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
391 printk(KERN_WARNING "inflateInit failed\n"); 254 printk(KERN_WARNING "inflateInit failed\n");
392 ret = -1; 255 return -1;
393 goto out;
394 } 256 }
395 while (workspace->inf_strm.total_in < srclen) { 257 while (workspace->inf_strm.total_in < srclen) {
396 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 258 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
397 if (ret != Z_OK && ret != Z_STREAM_END) 259 if (ret != Z_OK && ret != Z_STREAM_END)
398 break; 260 break;
399 /*
400 * buf start is the byte offset we're of the start of
401 * our workspace buffer
402 */
403 buf_start = total_out;
404 261
405 /* total_out is the last byte of the workspace buffer */ 262 buf_start = total_out;
406 total_out = workspace->inf_strm.total_out; 263 total_out = workspace->inf_strm.total_out;
407 264
408 working_bytes = total_out - buf_start; 265 /* we didn't make progress in this inflate call, we're done */
409 266 if (buf_start == total_out)
410 /*
411 * start byte is the first byte of the page we're currently
412 * copying into relative to the start of the compressed data.
413 */
414 start_byte = page_offset(page_out) - disk_start;
415
416 if (working_bytes == 0) {
417 /* we didn't make progress in this inflate
418 * call, we're done
419 */
420 if (ret != Z_STREAM_END)
421 ret = -1;
422 break; 267 break;
423 }
424 268
425 /* we haven't yet hit data corresponding to this page */ 269 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
426 if (total_out <= start_byte) 270 total_out, disk_start,
427 goto next; 271 bvec, vcnt,
428 272 &page_out_index, &pg_offset);
429 /* 273 if (ret2 == 0) {
430 * the start of the data we care about is offset into 274 ret = 0;
431 * the middle of our working buffer 275 goto done;
432 */
433 if (total_out > start_byte && buf_start < start_byte) {
434 buf_offset = start_byte - buf_start;
435 working_bytes -= buf_offset;
436 } else {
437 buf_offset = 0;
438 }
439 current_buf_start = buf_start;
440
441 /* copy bytes from the working buffer into the pages */
442 while (working_bytes > 0) {
443 bytes = min(PAGE_CACHE_SIZE - pg_offset,
444 PAGE_CACHE_SIZE - buf_offset);
445 bytes = min(bytes, working_bytes);
446 kaddr = kmap_atomic(page_out, KM_USER0);
447 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
448 bytes);
449 kunmap_atomic(kaddr, KM_USER0);
450 flush_dcache_page(page_out);
451
452 pg_offset += bytes;
453 page_bytes_left -= bytes;
454 buf_offset += bytes;
455 working_bytes -= bytes;
456 current_buf_start += bytes;
457
458 /* check if we need to pick another page */
459 if (page_bytes_left == 0) {
460 page_out_index++;
461 if (page_out_index >= vcnt) {
462 ret = 0;
463 goto done;
464 }
465
466 page_out = bvec[page_out_index].bv_page;
467 pg_offset = 0;
468 page_bytes_left = PAGE_CACHE_SIZE;
469 start_byte = page_offset(page_out) - disk_start;
470
471 /*
472 * make sure our new page is covered by this
473 * working buffer
474 */
475 if (total_out <= start_byte)
476 goto next;
477
478 /* the next page in the biovec might not
479 * be adjacent to the last page, but it
480 * might still be found inside this working
481 * buffer. bump our offset pointer
482 */
483 if (total_out > start_byte &&
484 current_buf_start < start_byte) {
485 buf_offset = start_byte - buf_start;
486 working_bytes = total_out - start_byte;
487 current_buf_start = buf_start +
488 buf_offset;
489 }
490 }
491 } 276 }
492next: 277
493 workspace->inf_strm.next_out = workspace->buf; 278 workspace->inf_strm.next_out = workspace->buf;
494 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 279 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
495 280
@@ -516,35 +301,21 @@ done:
516 zlib_inflateEnd(&workspace->inf_strm); 301 zlib_inflateEnd(&workspace->inf_strm);
517 if (data_in) 302 if (data_in)
518 kunmap(pages_in[page_in_index]); 303 kunmap(pages_in[page_in_index]);
519out:
520 free_workspace(workspace);
521 return ret; 304 return ret;
522} 305}
523 306
524/* 307static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
525 * a less complex decompression routine. Our compressed data fits in a 308 struct page *dest_page,
526 * single page, and we want to read a single page out of it. 309 unsigned long start_byte,
527 * start_byte tells us the offset into the compressed data we're interested in 310 size_t srclen, size_t destlen)
528 */
529int btrfs_zlib_decompress(unsigned char *data_in,
530 struct page *dest_page,
531 unsigned long start_byte,
532 size_t srclen, size_t destlen)
533{ 311{
312 struct workspace *workspace = list_entry(ws, struct workspace, list);
534 int ret = 0; 313 int ret = 0;
535 int wbits = MAX_WBITS; 314 int wbits = MAX_WBITS;
536 struct workspace *workspace;
537 unsigned long bytes_left = destlen; 315 unsigned long bytes_left = destlen;
538 unsigned long total_out = 0; 316 unsigned long total_out = 0;
539 char *kaddr; 317 char *kaddr;
540 318
541 if (destlen > PAGE_CACHE_SIZE)
542 return -ENOMEM;
543
544 workspace = find_zlib_workspace();
545 if (IS_ERR(workspace))
546 return -ENOMEM;
547
548 workspace->inf_strm.next_in = data_in; 319 workspace->inf_strm.next_in = data_in;
549 workspace->inf_strm.avail_in = srclen; 320 workspace->inf_strm.avail_in = srclen;
550 workspace->inf_strm.total_in = 0; 321 workspace->inf_strm.total_in = 0;
@@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
565 336
566 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 337 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
567 printk(KERN_WARNING "inflateInit failed\n"); 338 printk(KERN_WARNING "inflateInit failed\n");
568 ret = -1; 339 return -1;
569 goto out;
570 } 340 }
571 341
572 while (bytes_left > 0) { 342 while (bytes_left > 0) {
@@ -616,12 +386,13 @@ next:
616 ret = 0; 386 ret = 0;
617 387
618 zlib_inflateEnd(&workspace->inf_strm); 388 zlib_inflateEnd(&workspace->inf_strm);
619out:
620 free_workspace(workspace);
621 return ret; 389 return ret;
622} 390}
623 391
624void btrfs_zlib_exit(void) 392struct btrfs_compress_op btrfs_zlib_compress = {
625{ 393 .alloc_workspace = zlib_alloc_workspace,
626 free_workspaces(); 394 .free_workspace = zlib_free_workspace,
627} 395 .compress_pages = zlib_compress_pages,
396 .decompress_biovec = zlib_decompress_biovec,
397 .decompress = zlib_decompress,
398};
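
The zlib code moves from ad-hoc exported entry points to a btrfs_compress_op
table of function pointers, letting generic code in compression.c dispatch to
any backend (zlib or the new lzo). A small sketch of that ops-table pattern;
the names are illustrative and the compress callback just copies:

#include <stdio.h>

struct demo_compress_op {
	const char *name;
	int (*compress)(const char *in, char *out);
};

static int zlib_like_compress(const char *in, char *out)
{
	/* a real backend would run zlib_deflate(); copy for the demo */
	while ((*out++ = *in++))
		;
	return 0;
}

static const struct demo_compress_op demo_zlib_op = {
	.name		= "zlib",
	.compress	= zlib_like_compress,
};

int main(void)
{
	char buf[16];

	demo_zlib_op.compress("hi", buf);	/* dispatch via the table */
	printf("%s via %s\n", buf, demo_zlib_op.name);
	return 0;
}
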
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 60d27bc9eb83..6b61ded701e1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1560,9 +1560,10 @@ retry_locked:
1560 /* NOTE: no side-effects allowed, until we take s_mutex */ 1560 /* NOTE: no side-effects allowed, until we take s_mutex */
1561 1561
1562 revoking = cap->implemented & ~cap->issued; 1562 revoking = cap->implemented & ~cap->issued;
1563 if (revoking) 1563 dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
1564 dout(" mds%d revoking %s\n", cap->mds, 1564 cap->mds, cap, ceph_cap_string(cap->issued),
1565 ceph_cap_string(revoking)); 1565 ceph_cap_string(cap->implemented),
1566 ceph_cap_string(revoking));
1566 1567
1567 if (cap == ci->i_auth_cap && 1568 if (cap == ci->i_auth_cap &&
1568 (cap->issued & CEPH_CAP_FILE_WR)) { 1569 (cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1658,6 +1659,8 @@ ack:
1658 1659
1659 if (cap == ci->i_auth_cap && ci->i_dirty_caps) 1660 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1660 flushing = __mark_caps_flushing(inode, session); 1661 flushing = __mark_caps_flushing(inode, session);
1662 else
1663 flushing = 0;
1661 1664
1662 mds = cap->mds; /* remember mds, so we don't repeat */ 1665 mds = cap->mds; /* remember mds, so we don't repeat */
1663 sent++; 1666 sent++;
@@ -1940,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1940 } 1943 }
1941} 1944}
1942 1945
1946static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1947 struct ceph_mds_session *session,
1948 struct inode *inode)
1949{
1950 struct ceph_inode_info *ci = ceph_inode(inode);
1951 struct ceph_cap *cap;
1952 int delayed = 0;
1953
1954 spin_lock(&inode->i_lock);
1955 cap = ci->i_auth_cap;
1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
1958 __ceph_flush_snaps(ci, &session, 1);
1959 if (ci->i_flushing_caps) {
1960 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1961 __ceph_caps_used(ci),
1962 __ceph_caps_wanted(ci),
1963 cap->issued | cap->implemented,
1964 ci->i_flushing_caps, NULL);
1965 if (delayed) {
1966 spin_lock(&inode->i_lock);
1967 __cap_delay_requeue(mdsc, ci);
1968 spin_unlock(&inode->i_lock);
1969 }
1970 } else {
1971 spin_unlock(&inode->i_lock);
1972 }
1973}
1974
1943 1975
1944/* 1976/*
1945 * Take references to capabilities we hold, so that we don't release 1977 * Take references to capabilities we hold, so that we don't release
@@ -2687,7 +2719,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2687 ceph_add_cap(inode, session, cap_id, -1, 2719 ceph_add_cap(inode, session, cap_id, -1,
2688 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, 2720 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2689 NULL /* no caps context */); 2721 NULL /* no caps context */);
2690 try_flush_caps(inode, session, NULL); 2722 kick_flushing_inode_caps(mdsc, session, inode);
2691 up_read(&mdsc->snap_rwsem); 2723 up_read(&mdsc->snap_rwsem);
2692 2724
2693 /* make sure we re-request max_size, if necessary */ 2725 /* make sure we re-request max_size, if necessary */
@@ -2785,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2785 case CEPH_CAP_OP_IMPORT: 2817 case CEPH_CAP_OP_IMPORT:
2786 handle_cap_import(mdsc, inode, h, session, 2818 handle_cap_import(mdsc, inode, h, session,
2787 snaptrace, snaptrace_len); 2819 snaptrace, snaptrace_len);
2788 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, 2820 ceph_check_caps(ceph_inode(inode), 0, session);
2789 session);
2790 goto done_unlocked; 2821 goto done_unlocked;
2791 } 2822 }
2792 2823
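
The "else flushing = 0" hunk in caps.c closes an uninitialized-variable
hazard: when the auth cap had no dirty caps, the old code read `flushing`
without ever assigning it. A sketch of the hazard and the fix, with
illustrative names:

#include <stdbool.h>
#include <stdio.h>

static int check_caps_demo(bool auth_cap_dirty)
{
	int flushing;

	if (auth_cap_dirty)
		flushing = 1;	/* __mark_caps_flushing() in the real code */
	else
		flushing = 0;	/* the added branch; without it, undefined */

	return flushing;
}

int main(void)
{
	printf("%d %d\n", check_caps_demo(true), check_caps_demo(false));
	return 0;
}
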
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e835eff551e3..5625463aa479 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -710,10 +710,6 @@ static int fill_inode(struct inode *inode,
710 ci->i_ceph_flags |= CEPH_I_COMPLETE; 710 ci->i_ceph_flags |= CEPH_I_COMPLETE;
711 ci->i_max_offset = 2; 711 ci->i_max_offset = 2;
712 } 712 }
713
714 /* it may be better to set st_size in getattr instead? */
715 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
716 inode->i_size = ci->i_rbytes;
717 break; 713 break;
718 default: 714 default:
719 pr_err("fill_inode %llx.%llx BAD mode 0%o\n", 715 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -1819,7 +1815,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1819 else 1815 else
1820 stat->dev = 0; 1816 stat->dev = 0;
1821 if (S_ISDIR(inode->i_mode)) { 1817 if (S_ISDIR(inode->i_mode)) {
1822 stat->size = ci->i_rbytes; 1818 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
1819 RBYTES))
1820 stat->size = ci->i_rbytes;
1821 else
1822 stat->size = ci->i_files + ci->i_subdirs;
1823 stat->blocks = 0; 1823 stat->blocks = 0;
1824 stat->blksize = 65536; 1824 stat->blksize = 65536;
1825 } 1825 }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1e30d194a8e3..a1ee8fa3a8e7 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -693,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
693 dout("choose_mds %p %llx.%llx " 693 dout("choose_mds %p %llx.%llx "
694 "frag %u mds%d (%d/%d)\n", 694 "frag %u mds%d (%d/%d)\n",
695 inode, ceph_vinop(inode), 695 inode, ceph_vinop(inode),
696 frag.frag, frag.mds, 696 frag.frag, mds,
697 (int)r, frag.ndist); 697 (int)r, frag.ndist);
698 return mds; 698 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
699 CEPH_MDS_STATE_ACTIVE)
700 return mds;
699 } 701 }
700 702
701 /* since this file/dir wasn't known to be 703 /* since this file/dir wasn't known to be
@@ -708,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
708 dout("choose_mds %p %llx.%llx " 710 dout("choose_mds %p %llx.%llx "
709 "frag %u mds%d (auth)\n", 711 "frag %u mds%d (auth)\n",
710 inode, ceph_vinop(inode), frag.frag, mds); 712 inode, ceph_vinop(inode), frag.frag, mds);
711 return mds; 713 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
714 CEPH_MDS_STATE_ACTIVE)
715 return mds;
712 } 716 }
713 } 717 }
714 } 718 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bf6f0f34082a..9c5085465a63 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -290,6 +290,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
290 290
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
294 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
293 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; 295 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
294 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 296 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
295 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 297 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 6e12a6ba5f79..8c9eba6ef9df 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -219,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 struct rb_node **p; 219 struct rb_node **p;
220 struct rb_node *parent = NULL; 220 struct rb_node *parent = NULL;
221 struct ceph_inode_xattr *xattr = NULL; 221 struct ceph_inode_xattr *xattr = NULL;
222 int name_len = strlen(name);
222 int c; 223 int c;
223 224
224 p = &ci->i_xattrs.index.rb_node; 225 p = &ci->i_xattrs.index.rb_node;
@@ -226,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
226 parent = *p; 227 parent = *p;
227 xattr = rb_entry(parent, struct ceph_inode_xattr, node); 228 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
228 c = strncmp(name, xattr->name, xattr->name_len); 229 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c == 0 && name_len > xattr->name_len)
231 c = 1;
229 if (c < 0) 232 if (c < 0)
230 p = &(*p)->rb_left; 233 p = &(*p)->rb_left;
231 else if (c > 0) 234 else if (c > 0)
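
The ceph xattr hunk fixes an rbtree-lookup bug: strncmp() limited to the
stored name's length reports a match for any longer key that shares the
prefix, so "user.foobar" would collide with a stored "user.foo". The added
length comparison breaks the tie. A sketch of the broken and fixed compares:

#include <stdio.h>
#include <string.h>

static int xattr_cmp_fixed(const char *name, const char *stored)
{
	size_t stored_len = strlen(stored);
	int c = strncmp(name, stored, stored_len);

	if (c == 0 && strlen(name) > stored_len)
		c = 1;	/* longer key with a matching prefix sorts after */
	return c;
}

int main(void)
{
	/* prefix-limited strncmp() claims a match; the fix does not */
	printf("%d\n", strncmp("user.foobar", "user.foo", strlen("user.foo")));
	printf("%d\n", xattr_cmp_fixed("user.foobar", "user.foo"));
	return 0;
}
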
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index ee45648b0d1a..7cb0f7f847e4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -3,6 +3,7 @@ config CIFS
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select CRYPTO 5 select CRYPTO
6 select CRYPTO_MD4
6 select CRYPTO_MD5 7 select CRYPTO_MD5
7 select CRYPTO_HMAC 8 select CRYPTO_HMAC
8 select CRYPTO_ARC4 9 select CRYPTO_ARC4
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 43b19dd39191..d87558448e3d 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
5 5
6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ 6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ 7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ 8 cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
9 readdir.o ioctl.o sess.o export.o 9 readdir.o ioctl.o sess.o export.o
10 10
11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o 11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
diff --git a/fs/cifs/README b/fs/cifs/README
index 46af99ab3614..fe1683590828 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -452,6 +452,11 @@ A partial list of the supported mount options follows:
452 if oplock (caching token) is granted and held. Note that 452 if oplock (caching token) is granted and held. Note that
453 direct allows write operations larger than page size 453 direct allows write operations larger than page size
454 to be sent to the server. 454 to be sent to the server.
 455	strictcache Use for switching on strict cache mode. In this mode the
 456			client reads from the cache whenever it holds an Oplock Level II,
 457			and otherwise reads from the server. All written data are stored
 458			in the cache, but if the client does not hold an Exclusive Oplock,
 459			it also writes the data to the server.
455 acl Allow setfacl and getfacl to manage posix ACLs if server 460 acl Allow setfacl and getfacl to manage posix ACLs if server
456 supports them. (default) 461 supports them. (default)
457 noacl Do not allow setfacl and getfacl calls on this mount 462 noacl Do not allow setfacl and getfacl calls on this mount
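
A usage sketch for the new option, assuming a hypothetical server and share
name:

	mount -t cifs //server/share /mnt -o strictcache
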
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ede98300a8cd..65829d32128c 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->callback_data,
87 mid_entry->mid); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
@@ -218,11 +218,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
218 mid_entry = list_entry(tmp3, struct mid_q_entry, 218 mid_entry = list_entry(tmp3, struct mid_q_entry,
219 qhead); 219 qhead);
220 seq_printf(m, "\tState: %d com: %d pid:" 220 seq_printf(m, "\tState: %d com: %d pid:"
221 " %d tsk: %p mid %d\n", 221 " %d cbdata: %p mid %d\n",
222 mid_entry->midState, 222 mid_entry->midState,
223 (int)mid_entry->command, 223 (int)mid_entry->command,
224 mid_entry->pid, 224 mid_entry->pid,
225 mid_entry->tsk, 225 mid_entry->callback_data,
226 mid_entry->mid); 226 mid_entry->mid);
227 } 227 }
228 spin_unlock(&GlobalMid_Lock); 228 spin_unlock(&GlobalMid_Lock);
@@ -331,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
331 atomic_read(&totSmBufAllocCount)); 331 atomic_read(&totSmBufAllocCount));
332#endif /* CONFIG_CIFS_STATS2 */ 332#endif /* CONFIG_CIFS_STATS2 */
333 333
334 seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); 334 seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
335 seq_printf(m, 335 seq_printf(m,
336 "\n%d session %d share reconnects\n", 336 "\n%d session %d share reconnects\n",
337 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); 337 tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
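
The stats hunk replaces direct access to midCount.counter with atomic_read(),
keeping reads behind the accessor the atomic type provides. The userspace C11
analogue of that discipline:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int mid_count;	/* stands in for atomic_t midCount */

int main(void)
{
	atomic_fetch_add(&mid_count, 3);
	/* read through the accessor, never through the raw representation */
	printf("Operations (MIDs): %d\n", atomic_load(&mid_count));
	return 0;
}
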
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index c68a056f27fd..0a265ad9e426 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -255,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
255 255
256} 256}
257 257
258static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
259 struct list_head *mntlist)
260{
261 /* stolen from afs code */
262 int err;
263
264 mntget(newmnt);
265 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
266 switch (err) {
267 case 0:
268 path_put(&nd->path);
269 nd->path.mnt = newmnt;
270 nd->path.dentry = dget(newmnt->mnt_root);
271 schedule_delayed_work(&cifs_dfs_automount_task,
272 cifs_dfs_mountpoint_expiry_timeout);
273 break;
274 case -EBUSY:
275 /* someone else made a mount here whilst we were busy */
276 while (d_mountpoint(nd->path.dentry) &&
277 follow_down(&nd->path))
278 ;
279 err = 0;
280 default:
281 mntput(newmnt);
282 break;
283 }
284 return err;
285}
286
287static void dump_referral(const struct dfs_info3_param *ref) 258static void dump_referral(const struct dfs_info3_param *ref)
288{ 259{
289 cFYI(1, "DFS: ref path: %s", ref->path_name); 260 cFYI(1, "DFS: ref path: %s", ref->path_name);
@@ -293,27 +264,23 @@ static void dump_referral(const struct dfs_info3_param *ref)
293 ref->path_consumed); 264 ref->path_consumed);
294} 265}
295 266
296 267/*
297static void* 268 * Create a vfsmount that we can automount
298cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) 269 */
270static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
299{ 271{
300 struct dfs_info3_param *referrals = NULL; 272 struct dfs_info3_param *referrals = NULL;
301 unsigned int num_referrals = 0; 273 unsigned int num_referrals = 0;
302 struct cifs_sb_info *cifs_sb; 274 struct cifs_sb_info *cifs_sb;
303 struct cifsSesInfo *ses; 275 struct cifsSesInfo *ses;
304 char *full_path = NULL; 276 char *full_path;
305 int xid, i; 277 int xid, i;
306 int rc = 0; 278 int rc;
307 struct vfsmount *mnt = ERR_PTR(-ENOENT); 279 struct vfsmount *mnt;
308 struct tcon_link *tlink; 280 struct tcon_link *tlink;
309 281
310 cFYI(1, "in %s", __func__); 282 cFYI(1, "in %s", __func__);
311 BUG_ON(IS_ROOT(dentry)); 283 BUG_ON(IS_ROOT(mntpt));
312
313 xid = GetXid();
314
315 dput(nd->path.dentry);
316 nd->path.dentry = dget(dentry);
317 284
318 /* 285 /*
319 * The MSDFS spec states that paths in DFS referral requests and 286 * The MSDFS spec states that paths in DFS referral requests and
@@ -321,66 +288,83 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
321 * the double backslashes usually used in the UNC. This function 288 * the double backslashes usually used in the UNC. This function
322 * gives us the latter, so we must adjust the result. 289 * gives us the latter, so we must adjust the result.
323 */ 290 */
324 full_path = build_path_from_dentry(dentry); 291 mnt = ERR_PTR(-ENOMEM);
325 if (full_path == NULL) { 292 full_path = build_path_from_dentry(mntpt);
326 rc = -ENOMEM; 293 if (full_path == NULL)
327 goto out_err; 294 goto cdda_exit;
328 }
329 295
330 cifs_sb = CIFS_SB(dentry->d_inode->i_sb); 296 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
331 tlink = cifs_sb_tlink(cifs_sb); 297 tlink = cifs_sb_tlink(cifs_sb);
332 if (IS_ERR(tlink)) { 298 if (IS_ERR(tlink)) {
333 rc = PTR_ERR(tlink); 299 mnt = ERR_CAST(tlink);
334 goto out_err; 300 goto free_full_path;
335 } 301 }
336 ses = tlink_tcon(tlink)->ses; 302 ses = tlink_tcon(tlink)->ses;
337 303
304 xid = GetXid();
338 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, 305 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
339 &num_referrals, &referrals, 306 &num_referrals, &referrals,
340 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 307 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
308 FreeXid(xid);
341 309
342 cifs_put_tlink(tlink); 310 cifs_put_tlink(tlink);
343 311
312 mnt = ERR_PTR(-ENOENT);
344 for (i = 0; i < num_referrals; i++) { 313 for (i = 0; i < num_referrals; i++) {
345 int len; 314 int len;
346 dump_referral(referrals+i); 315 dump_referral(referrals + i);
347 /* connect to a node */ 316 /* connect to a node */
348 len = strlen(referrals[i].node_name); 317 len = strlen(referrals[i].node_name);
349 if (len < 2) { 318 if (len < 2) {
350 cERROR(1, "%s: Net Address path too short: %s", 319 cERROR(1, "%s: Net Address path too short: %s",
351 __func__, referrals[i].node_name); 320 __func__, referrals[i].node_name);
352 rc = -EINVAL; 321 mnt = ERR_PTR(-EINVAL);
353 goto out_err; 322 break;
354 } 323 }
355 mnt = cifs_dfs_do_refmount(cifs_sb, 324 mnt = cifs_dfs_do_refmount(cifs_sb,
356 full_path, referrals + i); 325 full_path, referrals + i);
357 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 326 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
358 referrals[i].node_name, mnt); 327 referrals[i].node_name, mnt);
359
360 /* complete mount procedure if we accured submount */
361 if (!IS_ERR(mnt)) 328 if (!IS_ERR(mnt))
362 break; 329 goto success;
363 } 330 }
364 331
365 /* we need it cause for() above could exit without valid submount */ 332 /* no valid submounts were found; return error from get_dfs_path() by
366 rc = PTR_ERR(mnt); 333 * preference */
367 if (IS_ERR(mnt)) 334 if (rc != 0)
368 goto out_err; 335 mnt = ERR_PTR(rc);
369 336
370 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); 337success:
371
372out:
373 FreeXid(xid);
374 free_dfs_info_array(referrals, num_referrals); 338 free_dfs_info_array(referrals, num_referrals);
339free_full_path:
375 kfree(full_path); 340 kfree(full_path);
341cdda_exit:
376 cFYI(1, "leaving %s" , __func__); 342 cFYI(1, "leaving %s" , __func__);
377 return ERR_PTR(rc); 343 return mnt;
378out_err: 344}
379 path_put(&nd->path); 345
380 goto out; 346/*
347 * Attempt to automount the referral
348 */
349struct vfsmount *cifs_dfs_d_automount(struct path *path)
350{
351 struct vfsmount *newmnt;
352
353 cFYI(1, "in %s", __func__);
354
355 newmnt = cifs_dfs_do_automount(path->dentry);
356 if (IS_ERR(newmnt)) {
357 cFYI(1, "leaving %s [automount failed]" , __func__);
358 return newmnt;
359 }
360
361 mntget(newmnt); /* prevent immediate expiration */
362 mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
363 schedule_delayed_work(&cifs_dfs_automount_task,
364 cifs_dfs_mountpoint_expiry_timeout);
365 cFYI(1, "leaving %s [ok]" , __func__);
366 return newmnt;
381} 367}
382 368
383const struct inode_operations cifs_dfs_referral_inode_operations = { 369const struct inode_operations cifs_dfs_referral_inode_operations = {
384 .follow_link = cifs_dfs_follow_mountpoint,
385}; 370};
386
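
cifs_dfs_do_automount() now walks the referral list, returns the first
vfsmount that succeeds, and falls back to the get_dfs_path() error only when
no referral mounted. A sketch of that control flow, with stand-in names and a
pretend mount that only succeeds for one node:

#include <stdio.h>

static int try_mount(int node)
{
	return node == 2 ? 0 : -1;	/* pretend only node 2 mounts */
}

static int do_automount_demo(const int *nodes, int n, int lookup_rc)
{
	int i;

	for (i = 0; i < n; i++)
		if (try_mount(nodes[i]) == 0)
			return nodes[i];	/* success: use this mount */

	/* no valid submounts; prefer the lookup error if there was one */
	return lookup_rc ? lookup_rc : -2;
}

int main(void)
{
	int nodes[] = { 1, 2, 3 };

	printf("%d\n", do_automount_demo(nodes, 3, 0));	/* prints 2 */
	return 0;
}
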
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 7852cd677051..ac51cd2d33ae 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -40,6 +40,7 @@
40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ 40#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */ 41#define CIFS_MOUNT_MF_SYMLINKS 0x10000 /* Minshall+French Symlinks enabled */
42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ 42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
43#define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */
43 44
44struct cifs_sb_info { 45struct cifs_sb_info {
45 struct rb_root tlink_tree; 46 struct rb_root tlink_tree;
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 430f510a1720..fc0fd4fde306 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,10 +44,14 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
44 int charlen, outlen = 0; 44 int charlen, outlen = 0;
45 int maxwords = maxbytes / 2; 45 int maxwords = maxbytes / 2;
46 char tmp[NLS_MAX_CHARSET_SIZE]; 46 char tmp[NLS_MAX_CHARSET_SIZE];
47 __u16 ftmp;
47 48
48 for (i = 0; i < maxwords && from[i]; i++) { 49 for (i = 0; i < maxwords; i++) {
49 charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, 50 ftmp = get_unaligned_le16(&from[i]);
50 NLS_MAX_CHARSET_SIZE); 51 if (ftmp == 0)
52 break;
53
54 charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
51 if (charlen > 0) 55 if (charlen > 0)
52 outlen += charlen; 56 outlen += charlen;
53 else 57 else
@@ -58,9 +62,9 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
58} 62}
59 63
60/* 64/*
61 * cifs_mapchar - convert a little-endian char to proper char in codepage 65 * cifs_mapchar - convert a host-endian char to proper char in codepage
62 * @target - where converted character should be copied 66 * @target - where converted character should be copied
63 * @src_char - 2 byte little-endian source character 67 * @src_char - 2 byte host-endian source character
64 * @cp - codepage to which character should be converted 68 * @cp - codepage to which character should be converted
65 * @mapchar - should character be mapped according to mapchars mount option? 69 * @mapchar - should character be mapped according to mapchars mount option?
66 * 70 *
@@ -69,7 +73,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
69 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). 73 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
70 */ 74 */
71static int 75static int
72cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp, 76cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
73 bool mapchar) 77 bool mapchar)
74{ 78{
75 int len = 1; 79 int len = 1;
@@ -82,7 +86,7 @@ cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
82 * build_path_from_dentry are modified, as they use slash as 86 * build_path_from_dentry are modified, as they use slash as
83 * separator. 87 * separator.
84 */ 88 */
85 switch (le16_to_cpu(src_char)) { 89 switch (src_char) {
86 case UNI_COLON: 90 case UNI_COLON:
87 *target = ':'; 91 *target = ':';
88 break; 92 break;
@@ -109,8 +113,7 @@ out:
109 return len; 113 return len;
110 114
111cp_convert: 115cp_convert:
112 len = cp->uni2char(le16_to_cpu(src_char), target, 116 len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
113 NLS_MAX_CHARSET_SIZE);
114 if (len <= 0) { 117 if (len <= 0) {
115 *target = '?'; 118 *target = '?';
116 len = 1; 119 len = 1;
@@ -149,6 +152,7 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
149 int nullsize = nls_nullsize(codepage); 152 int nullsize = nls_nullsize(codepage);
150 int fromwords = fromlen / 2; 153 int fromwords = fromlen / 2;
151 char tmp[NLS_MAX_CHARSET_SIZE]; 154 char tmp[NLS_MAX_CHARSET_SIZE];
155 __u16 ftmp;
152 156
153 /* 157 /*
154 * because the chars can be of varying widths, we need to take care 158 * because the chars can be of varying widths, we need to take care
@@ -158,19 +162,23 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
158 */ 162 */
159 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); 163 safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
160 164
161 for (i = 0; i < fromwords && from[i]; i++) { 165 for (i = 0; i < fromwords; i++) {
166 ftmp = get_unaligned_le16(&from[i]);
167 if (ftmp == 0)
168 break;
169
162 /* 170 /*
163 * check to see if converting this character might make the 171 * check to see if converting this character might make the
164 * conversion bleed into the null terminator 172 * conversion bleed into the null terminator
165 */ 173 */
166 if (outlen >= safelen) { 174 if (outlen >= safelen) {
167 charlen = cifs_mapchar(tmp, from[i], codepage, mapchar); 175 charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar);
168 if ((outlen + charlen) > (tolen - nullsize)) 176 if ((outlen + charlen) > (tolen - nullsize))
169 break; 177 break;
170 } 178 }
171 179
172 /* put converted char into 'to' buffer */ 180 /* put converted char into 'to' buffer */
173 charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar); 181 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
174 outlen += charlen; 182 outlen += charlen;
175 } 183 }
176 184
@@ -193,24 +201,21 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
193{ 201{
194 int charlen; 202 int charlen;
195 int i; 203 int i;
196 wchar_t *wchar_to = (wchar_t *)to; /* needed to quiet sparse */ 204 wchar_t wchar_to; /* needed to quiet sparse */
197 205
198 for (i = 0; len && *from; i++, from += charlen, len -= charlen) { 206 for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
199 207 charlen = codepage->char2uni(from, len, &wchar_to);
200 /* works for 2.4.0 kernel or later */
201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
202 if (charlen < 1) { 208 if (charlen < 1) {
203 cERROR(1, "strtoUCS: char2uni of %d returned %d", 209 cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
204 (int)*from, charlen); 210 *from, charlen);
205 /* A question mark */ 211 /* A question mark */
206 to[i] = cpu_to_le16(0x003f); 212 wchar_to = 0x003f;
207 charlen = 1; 213 charlen = 1;
208 } else 214 }
209 to[i] = cpu_to_le16(wchar_to[i]); 215 put_unaligned_le16(wchar_to, &to[i]);
210
211 } 216 }
212 217
213 to[i] = 0; 218 put_unaligned_le16(0, &to[i]);
214 return i; 219 return i;
215} 220}
216 221
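
Both loops above stop dereferencing __le16 words in place and go through get_unaligned_le16()/put_unaligned_le16() instead. SMB string buffers sit at arbitrary, often odd, offsets inside the packet, so a direct 16-bit load through the pointer can fault or trap on architectures without hardware unaligned access. A self-contained sketch of the pattern (host-order scratch value, little-endian wire buffer):

	#include <asm/unaligned.h>

	static int count_ucs2_words(const __le16 *from, int maxwords)
	{
		int i;
		__u16 ftmp;	/* host-endian scratch, as in cifs_ucs2_bytes() */

		for (i = 0; i < maxwords; i++) {
			ftmp = get_unaligned_le16(&from[i]); /* safe at any alignment */
			if (ftmp == 0)
				break;
		}
		return i;
	}
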
@@ -252,3 +257,79 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
252 return dst; 257 return dst;
253} 258}
254 259
260/*
261 * Convert 16 bit Unicode pathname to wire format from string in current code
262 * page. Conversion may involve remapping the six characters that are
263 * only legal in a POSIX-like OS (if they are present in the string). Path
264 * names are little endian 16 bit Unicode on the wire
265 */
266int
267cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
268 const struct nls_table *cp, int mapChars)
269{
270 int i, j, charlen;
271 int len_remaining = maxlen;
272 char src_char;
273 __u16 temp;
274
275 if (!mapChars)
276 return cifs_strtoUCS(target, source, PATH_MAX, cp);
277
278 for (i = 0, j = 0; i < maxlen; j++) {
279 src_char = source[i];
280 switch (src_char) {
281 case 0:
282 put_unaligned_le16(0, &target[j]);
283 goto ctoUCS_out;
284 case ':':
285 temp = UNI_COLON;
286 break;
287 case '*':
288 temp = UNI_ASTERIK;
289 break;
290 case '?':
291 temp = UNI_QUESTION;
292 break;
293 case '<':
294 temp = UNI_LESSTHAN;
295 break;
296 case '>':
297 temp = UNI_GRTRTHAN;
298 break;
299 case '|':
300 temp = UNI_PIPE;
301 break;
302 /*
303 * FIXME: We can not handle remapping backslash (UNI_SLASH)
304 * until all the calls to build_path_from_dentry are modified,
305 * as they use backslash as separator.
306 */
307 default:
308 charlen = cp->char2uni(source+i, len_remaining,
309 &temp);
310 /*
311 * if no match, use question mark, which at least in
312 * some cases serves as wild card
313 */
314 if (charlen < 1) {
315 temp = 0x003f;
316 charlen = 1;
317 }
318 len_remaining -= charlen;
319 /*
320 * character may take more than one byte in the source
321 * string, but will take exactly two bytes in the
322 * target string
323 */
324 i += charlen;
325 continue;
326 }
327 put_unaligned_le16(temp, &target[j]);
328 i++; /* move to next char in source string */
329 len_remaining--;
330 }
331
332ctoUCS_out:
333 return i;
334}
335
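
cifsConvertToUCS() above is the outbound counterpart of cifs_mapchar(): with mapChars set, the six characters legal on POSIX but reserved on the wire (: * ? < > |) are substituted with their UNI_* codepoints, and everything else goes through the codepage's char2uni(). A hypothetical caller, where "nls" stands in for the mount's negotiated codepage:

	__le16 wire[16];	/* illustrative size; real callers size from the path */
	int len;

	/* ':' and '*' leave as UNI_COLON and UNI_ASTERIK on the wire */
	len = cifsConvertToUCS(wire, "a:b*c", 16, nls, 1 /* mapChars */);
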
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index a437ec391a01..beeebf194234 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -41,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
41; 41;
42 42
43 43
44/* security id for everyone */ 44/* security id for everyone/world system group */
45static const struct cifs_sid sid_everyone = { 45static const struct cifs_sid sid_everyone = {
46 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; 46 1, 1, {0, 0, 0, 0, 0, 1}, {0} };
47/* security id for Authenticated Users system group */
48static const struct cifs_sid sid_authusers = {
49 1, 1, {0, 0, 0, 0, 0, 5}, {11} };
47/* group users */ 50/* group users */
48static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 51static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
49 52
@@ -365,10 +368,14 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
365 if (num_aces > 0) { 368 if (num_aces > 0) {
366 umode_t user_mask = S_IRWXU; 369 umode_t user_mask = S_IRWXU;
367 umode_t group_mask = S_IRWXG; 370 umode_t group_mask = S_IRWXG;
368 umode_t other_mask = S_IRWXO; 371 umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
369 372
370 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), 373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
371 GFP_KERNEL); 374 GFP_KERNEL);
375 if (!ppace) {
376 cERROR(1, "DACL memory allocation error");
377 return;
378 }
372 379
373 for (i = 0; i < num_aces; ++i) { 380 for (i = 0; i < num_aces; ++i) {
374 ppace[i] = (struct cifs_ace *) (acl_base + acl_size); 381 ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
@@ -390,6 +397,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
390 ppace[i]->type, 397 ppace[i]->type,
391 &fattr->cf_mode, 398 &fattr->cf_mode,
392 &other_mask); 399 &other_mask);
400 if (compare_sids(&(ppace[i]->sid), &sid_authusers))
401 access_flags_to_mode(ppace[i]->access_req,
402 ppace[i]->type,
403 &fattr->cf_mode,
404 &other_mask);
405
393 406
394/* memcpy((void *)(&(cifscred->aces[i])), 407/* memcpy((void *)(&(cifscred->aces[i])),
395 (void *)ppace[i], 408 (void *)ppace[i],
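
Two fixes land in parse_dacl() here. First, the kmalloc() of the ACE pointer array finally gets a failure check instead of being dereferenced unconditionally. Second, ACEs granted to Authenticated Users now feed into the mode bits the way Everyone ACEs already did, and the widened other_mask lets such a grant light up bits in all three mode classes. The new SID initializer is the well-known S-1-5-11 spelled out as a struct cifs_sid, annotated:

	static const struct cifs_sid sid_authusers = {
		1,			/* revision */
		1,			/* one subauthority */
		{0, 0, 0, 0, 0, 5},	/* NT authority (the 5 in S-1-5-...) */
		{11}			/* ...-11: Authenticated Users RID */
	};
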
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 66f3d50d0676..a51585f9852b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -24,7 +24,6 @@
24#include "cifspdu.h" 24#include "cifspdu.h"
25#include "cifsglob.h" 25#include "cifsglob.h"
26#include "cifs_debug.h" 26#include "cifs_debug.h"
27#include "md5.h"
28#include "cifs_unicode.h" 27#include "cifs_unicode.h"
29#include "cifsproto.h" 28#include "cifsproto.h"
30#include "ntlmssp.h" 29#include "ntlmssp.h"
@@ -37,11 +36,6 @@
37/* Note that the smb header signature field on input contains the 36/* Note that the smb header signature field on input contains the
38 sequence number before this function is called */ 37 sequence number before this function is called */
39 38
40extern void mdfour(unsigned char *out, unsigned char *in, int n);
41extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
42extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
43 unsigned char *p24);
44
45static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 39static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
46 struct TCP_Server_Info *server, char *signature) 40 struct TCP_Server_Info *server, char *signature)
47{ 41{
@@ -234,6 +228,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
234/* first calculate 24 bytes ntlm response and then 16 byte session key */ 228/* first calculate 24 bytes ntlm response and then 16 byte session key */
235int setup_ntlm_response(struct cifsSesInfo *ses) 229int setup_ntlm_response(struct cifsSesInfo *ses)
236{ 230{
231 int rc = 0;
237 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; 232 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
238 char temp_key[CIFS_SESS_KEY_SIZE]; 233 char temp_key[CIFS_SESS_KEY_SIZE];
239 234
@@ -247,13 +242,26 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
247 } 242 }
248 ses->auth_key.len = temp_len; 243 ses->auth_key.len = temp_len;
249 244
250 SMBNTencrypt(ses->password, ses->server->cryptkey, 245 rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
251 ses->auth_key.response + CIFS_SESS_KEY_SIZE); 246 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
247 if (rc) {
248 cFYI(1, "%s Can't generate NTLM response, error: %d",
249 __func__, rc);
250 return rc;
251 }
252 252
253 E_md4hash(ses->password, temp_key); 253 rc = E_md4hash(ses->password, temp_key);
254 mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); 254 if (rc) {
255 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
256 return rc;
257 }
255 258
256 return 0; 259 rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
260 if (rc)
261 cFYI(1, "%s Can't generate NTLM session key, error: %d",
262 __func__, rc);
263
264 return rc;
257} 265}
258 266
259#ifdef CONFIG_CIFS_WEAK_PW_HASH 267#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -649,9 +657,10 @@ calc_seckey(struct cifsSesInfo *ses)
649 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); 657 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
650 658
651 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); 659 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
652 if (!tfm_arc4 || IS_ERR(tfm_arc4)) { 660 if (IS_ERR(tfm_arc4)) {
661 rc = PTR_ERR(tfm_arc4);
653 cERROR(1, "could not allocate crypto API arc4\n"); 662 cERROR(1, "could not allocate crypto API arc4\n");
654 return PTR_ERR(tfm_arc4); 663 return rc;
655 } 664 }
656 665
657 desc.tfm = tfm_arc4; 666 desc.tfm = tfm_arc4;
@@ -700,14 +709,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
700 unsigned int size; 709 unsigned int size;
701 710
702 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); 711 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
703 if (!server->secmech.hmacmd5 || 712 if (IS_ERR(server->secmech.hmacmd5)) {
704 IS_ERR(server->secmech.hmacmd5)) {
705 cERROR(1, "could not allocate crypto hmacmd5\n"); 713 cERROR(1, "could not allocate crypto hmacmd5\n");
706 return PTR_ERR(server->secmech.hmacmd5); 714 return PTR_ERR(server->secmech.hmacmd5);
707 } 715 }
708 716
709 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); 717 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
710 if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) { 718 if (IS_ERR(server->secmech.md5)) {
711 cERROR(1, "could not allocate crypto md5\n"); 719 cERROR(1, "could not allocate crypto md5\n");
712 rc = PTR_ERR(server->secmech.md5); 720 rc = PTR_ERR(server->secmech.md5);
713 goto crypto_allocate_md5_fail; 721 goto crypto_allocate_md5_fail;
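
The error-handling changes in this file share one theme: propagate failures instead of ignoring them. SMBNTencrypt(), E_md4hash() and mdfour() now return int and setup_ntlm_response() checks each call; and the crypto allocation tests drop their `!ptr ||` half because crypto_alloc_blkcipher()/crypto_alloc_shash() return either a valid transform or an ERR_PTR()-encoded errno, never NULL, so the NULL test was dead code. The canonical pattern:

	struct crypto_shash *md5;

	md5 = crypto_alloc_shash("md5", 0, 0);
	if (IS_ERR(md5))
		return PTR_ERR(md5); /* e.g. -ENOMEM, or -ENOENT if md5 isn't built */
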
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
deleted file mode 100644
index 15d2ec006474..000000000000
--- a/fs/cifs/cifsencrypt.h
+++ /dev/null
@@ -1,33 +0,0 @@
1/*
2 * fs/cifs/cifsencrypt.h
3 *
4 * Copyright (c) International Business Machines Corp., 2005
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * Externs for misc. small encryption routines
8 * so we do not have to put them in cifsproto.h
9 *
10 * This library is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser General Public License as published
12 * by the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
18 * the GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25/* md4.c */
26extern void mdfour(unsigned char *out, unsigned char *in, int n);
27/* smbdes.c */
28extern void E_P16(unsigned char *p14, unsigned char *p16);
29extern void E_P24(unsigned char *p21, const unsigned char *c8,
30 unsigned char *p24);
31
32
33
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d9f652a522a6..f2970136d17d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -77,7 +77,11 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
77module_param(cifs_max_pending, int, 0); 77module_param(cifs_max_pending, int, 0);
78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 78MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
79 "Default: 50 Range: 2 to 256"); 79 "Default: 50 Range: 2 to 256");
80 80unsigned short echo_retries = 5;
81module_param(echo_retries, ushort, 0644);
82MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
83 "reconnecting server. Default: 5. 0 means "
84 "never reconnect.");
81extern mempool_t *cifs_sm_req_poolp; 85extern mempool_t *cifs_sm_req_poolp;
82extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
83extern mempool_t *cifs_mid_poolp; 87extern mempool_t *cifs_mid_poolp;
@@ -596,10 +600,17 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
596{ 600{
597 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 601 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
598 ssize_t written; 602 ssize_t written;
603 int rc;
599 604
600 written = generic_file_aio_write(iocb, iov, nr_segs, pos); 605 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
601 if (!CIFS_I(inode)->clientCanCacheAll) 606
602 filemap_fdatawrite(inode->i_mapping); 607 if (CIFS_I(inode)->clientCanCacheAll)
608 return written;
609
610 rc = filemap_fdatawrite(inode->i_mapping);
611 if (rc)
612 cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
613
603 return written; 614 return written;
604} 615}
605 616
@@ -729,6 +740,25 @@ const struct file_operations cifs_file_ops = {
729 .setlease = cifs_setlease, 740 .setlease = cifs_setlease,
730}; 741};
731 742
743const struct file_operations cifs_file_strict_ops = {
744 .read = do_sync_read,
745 .write = do_sync_write,
746 .aio_read = cifs_strict_readv,
747 .aio_write = cifs_strict_writev,
748 .open = cifs_open,
749 .release = cifs_close,
750 .lock = cifs_lock,
751 .fsync = cifs_strict_fsync,
752 .flush = cifs_flush,
753 .mmap = cifs_file_strict_mmap,
754 .splice_read = generic_file_splice_read,
755 .llseek = cifs_llseek,
756#ifdef CONFIG_CIFS_POSIX
757 .unlocked_ioctl = cifs_ioctl,
758#endif /* CONFIG_CIFS_POSIX */
759 .setlease = cifs_setlease,
760};
761
732const struct file_operations cifs_file_direct_ops = { 762const struct file_operations cifs_file_direct_ops = {
733 /* no aio, no readv - 763 /* no aio, no readv -
734 BB reevaluate whether they can be done with directio, no cache */ 764 BB reevaluate whether they can be done with directio, no cache */
@@ -747,6 +777,7 @@ const struct file_operations cifs_file_direct_ops = {
747 .llseek = cifs_llseek, 777 .llseek = cifs_llseek,
748 .setlease = cifs_setlease, 778 .setlease = cifs_setlease,
749}; 779};
780
750const struct file_operations cifs_file_nobrl_ops = { 781const struct file_operations cifs_file_nobrl_ops = {
751 .read = do_sync_read, 782 .read = do_sync_read,
752 .write = do_sync_write, 783 .write = do_sync_write,
@@ -765,6 +796,24 @@ const struct file_operations cifs_file_nobrl_ops = {
765 .setlease = cifs_setlease, 796 .setlease = cifs_setlease,
766}; 797};
767 798
799const struct file_operations cifs_file_strict_nobrl_ops = {
800 .read = do_sync_read,
801 .write = do_sync_write,
802 .aio_read = cifs_strict_readv,
803 .aio_write = cifs_strict_writev,
804 .open = cifs_open,
805 .release = cifs_close,
806 .fsync = cifs_strict_fsync,
807 .flush = cifs_flush,
808 .mmap = cifs_file_strict_mmap,
809 .splice_read = generic_file_splice_read,
810 .llseek = cifs_llseek,
811#ifdef CONFIG_CIFS_POSIX
812 .unlocked_ioctl = cifs_ioctl,
813#endif /* CONFIG_CIFS_POSIX */
814 .setlease = cifs_setlease,
815};
816
768const struct file_operations cifs_file_direct_nobrl_ops = { 817const struct file_operations cifs_file_direct_nobrl_ops = {
769 /* no mmap, no aio, no readv - 818 /* no mmap, no aio, no readv -
770 BB reevaluate whether they can be done with directio, no cache */ 819 BB reevaluate whether they can be done with directio, no cache */
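
With strict cache mode there are now six file_operations tables (plain, direct, strict, each with a no-brlock variant). Which one an inode gets is decided from the mount flags when the inode is set up; a sketch of the shape of that decision, assuming the existing direct-io flag name (the real logic lives in fs/cifs/inode.c):

	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
		inode->i_fop = &cifs_file_direct_ops;
	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
		inode->i_fop = &cifs_file_strict_ops;
	else
		inode->i_fop = &cifs_file_ops;
	/* each arm picks its _nobrl variant when byte-range locks are off */
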
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 897b2b2b28b5..4a3330235d55 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,6 +61,7 @@ extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
61 struct dentry *); 61 struct dentry *);
62extern int cifs_revalidate_file(struct file *filp); 62extern int cifs_revalidate_file(struct file *filp);
63extern int cifs_revalidate_dentry(struct dentry *); 63extern int cifs_revalidate_dentry(struct dentry *);
64extern void cifs_invalidate_mapping(struct inode *inode);
64extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
65extern int cifs_setattr(struct dentry *, struct iattr *); 66extern int cifs_setattr(struct dentry *, struct iattr *);
66 67
@@ -72,19 +73,27 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations;
72/* Functions related to files and directories */ 73/* Functions related to files and directories */
73extern const struct file_operations cifs_file_ops; 74extern const struct file_operations cifs_file_ops;
74extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ 75extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
75extern const struct file_operations cifs_file_nobrl_ops; 76extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
76extern const struct file_operations cifs_file_direct_nobrl_ops; /* no brlocks */ 77extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */
78extern const struct file_operations cifs_file_direct_nobrl_ops;
79extern const struct file_operations cifs_file_strict_nobrl_ops;
77extern int cifs_open(struct inode *inode, struct file *file); 80extern int cifs_open(struct inode *inode, struct file *file);
78extern int cifs_close(struct inode *inode, struct file *file); 81extern int cifs_close(struct inode *inode, struct file *file);
79extern int cifs_closedir(struct inode *inode, struct file *file); 82extern int cifs_closedir(struct inode *inode, struct file *file);
80extern ssize_t cifs_user_read(struct file *file, char __user *read_data, 83extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
81 size_t read_size, loff_t *poffset); 84 size_t read_size, loff_t *poffset);
85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
86 unsigned long nr_segs, loff_t pos);
82extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
83 size_t write_size, loff_t *poffset); 88 size_t write_size, loff_t *poffset);
89extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
90 unsigned long nr_segs, loff_t pos);
84extern int cifs_lock(struct file *, int, struct file_lock *); 91extern int cifs_lock(struct file *, int, struct file_lock *);
85extern int cifs_fsync(struct file *, int); 92extern int cifs_fsync(struct file *, int);
93extern int cifs_strict_fsync(struct file *, int);
86extern int cifs_flush(struct file *, fl_owner_t id); 94extern int cifs_flush(struct file *, fl_owner_t id);
87extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 95extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
96extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
88extern const struct file_operations cifs_dir_ops; 97extern const struct file_operations cifs_dir_ops;
89extern int cifs_dir_open(struct inode *inode, struct file *file); 98extern int cifs_dir_open(struct inode *inode, struct file *file);
90extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); 99extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
@@ -93,6 +102,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
93extern const struct dentry_operations cifs_dentry_ops; 102extern const struct dentry_operations cifs_dentry_ops;
94extern const struct dentry_operations cifs_ci_dentry_ops; 103extern const struct dentry_operations cifs_ci_dentry_ops;
95 104
105#ifdef CONFIG_CIFS_DFS_UPCALL
106extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
107#else
108#define cifs_dfs_d_automount NULL
109#endif
110
96/* Functions related to symlinks */ 111/* Functions related to symlinks */
97extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); 112extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
98extern void cifs_put_link(struct dentry *direntry, 113extern void cifs_put_link(struct dentry *direntry,
@@ -112,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
112extern const struct export_operations cifs_export_ops; 127extern const struct export_operations cifs_export_ops;
113#endif /* EXPERIMENTAL */ 128#endif /* EXPERIMENTAL */
114 129
115#define CIFS_VERSION "1.68" 130#define CIFS_VERSION "1.70"
116#endif /* _CIFSFS_H */ 131#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 606ca8bb7102..17afb0fbcaed 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -161,46 +161,41 @@ struct TCP_Server_Info {
161 int srv_count; /* reference counter */ 161 int srv_count; /* reference counter */
162 /* 15 character server name + 0x20 16th byte indicating type = srv */ 162 /* 15 character server name + 0x20 16th byte indicating type = srv */
163 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 163 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
164 enum statusEnum tcpStatus; /* what we think the status is */
164 char *hostname; /* hostname portion of UNC string */ 165 char *hostname; /* hostname portion of UNC string */
165 struct socket *ssocket; 166 struct socket *ssocket;
166 struct sockaddr_storage dstaddr; 167 struct sockaddr_storage dstaddr;
167 struct sockaddr_storage srcaddr; /* locally bind to this IP */ 168 struct sockaddr_storage srcaddr; /* locally bind to this IP */
169#ifdef CONFIG_NET_NS
170 struct net *net;
171#endif
168 wait_queue_head_t response_q; 172 wait_queue_head_t response_q;
169 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 173 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
170 struct list_head pending_mid_q; 174 struct list_head pending_mid_q;
171 void *Server_NlsInfo; /* BB - placeholder for future NLS info */
172 unsigned short server_codepage; /* codepage for the server */
173 enum protocolEnum protocolType;
174 char versionMajor;
175 char versionMinor;
176 bool svlocal:1; /* local server or remote */
177 bool noblocksnd; /* use blocking sendmsg */ 175 bool noblocksnd; /* use blocking sendmsg */
178 bool noautotune; /* do not autotune send buf sizes */ 176 bool noautotune; /* do not autotune send buf sizes */
179 bool tcp_nodelay; 177 bool tcp_nodelay;
180 atomic_t inFlight; /* number of requests on the wire to server */ 178 atomic_t inFlight; /* number of requests on the wire to server */
181#ifdef CONFIG_CIFS_STATS2
182 atomic_t inSend; /* requests trying to send */
183 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
184#endif
185 enum statusEnum tcpStatus; /* what we think the status is */
186 struct mutex srv_mutex; 179 struct mutex srv_mutex;
187 struct task_struct *tsk; 180 struct task_struct *tsk;
188 char server_GUID[16]; 181 char server_GUID[16];
189 char secMode; 182 char secMode;
183 bool session_estab; /* mark when very first sess is established */
184 u16 dialect; /* dialect index that server chose */
190 enum securityEnum secType; 185 enum securityEnum secType;
191 unsigned int maxReq; /* Clients should submit no more */ 186 unsigned int maxReq; /* Clients should submit no more */
192 /* than maxReq distinct unanswered SMBs to the server when using */ 187 /* than maxReq distinct unanswered SMBs to the server when using */
193 /* multiplexed reads or writes */ 188 /* multiplexed reads or writes */
194 unsigned int maxBuf; /* maxBuf specifies the maximum */ 189 unsigned int maxBuf; /* maxBuf specifies the maximum */
195 /* message size the server can send or receive for non-raw SMBs */ 190 /* message size the server can send or receive for non-raw SMBs */
191 /* maxBuf is returned by SMB NegotiateProtocol, so it is only 0 */
192 /* while the socket is set up (and during reconnect) before NegProt is sent */
196 unsigned int max_rw; /* maxRw specifies the maximum */ 193 unsigned int max_rw; /* maxRw specifies the maximum */
197 /* message size the server can send or receive for */ 194 /* message size the server can send or receive for */
198 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ 195 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
199 unsigned int max_vcs; /* maximum number of smb sessions, at least 196 unsigned int max_vcs; /* maximum number of smb sessions, at least
200 those that can be specified uniquely with 197 those that can be specified uniquely with
201 vcnumbers */ 198 vcnumbers */
202 char sessid[4]; /* unique token id for this session */
203 /* (returned on Negotiate */
204 int capabilities; /* allow selective disabling of caps by smb sess */ 199 int capabilities; /* allow selective disabling of caps by smb sess */
205 int timeAdj; /* Adjust for difference in server time zone in sec */ 200 int timeAdj; /* Adjust for difference in server time zone in sec */
206 __u16 CurrentMid; /* multiplex id - rotating counter */ 201 __u16 CurrentMid; /* multiplex id - rotating counter */
@@ -210,20 +205,53 @@ struct TCP_Server_Info {
210 __u32 sequence_number; /* for signing, protected by srv_mutex */ 205 __u32 sequence_number; /* for signing, protected by srv_mutex */
211 struct session_key session_key; 206 struct session_key session_key;
212 unsigned long lstrp; /* when we got last response from this server */ 207 unsigned long lstrp; /* when we got last response from this server */
213 u16 dialect; /* dialect index that server chose */
214 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ 208 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
215 /* extended security flavors that server supports */ 209 /* extended security flavors that server supports */
210 bool sec_ntlmssp; /* supports NTLMSSP */
211 bool sec_kerberosu2u; /* supports U2U Kerberos */
216 bool sec_kerberos; /* supports plain Kerberos */ 212 bool sec_kerberos; /* supports plain Kerberos */
217 bool sec_mskerberos; /* supports legacy MS Kerberos */ 213 bool sec_mskerberos; /* supports legacy MS Kerberos */
218 bool sec_kerberosu2u; /* supports U2U Kerberos */ 214 struct delayed_work echo; /* echo ping workqueue job */
219 bool sec_ntlmssp; /* supports NTLMSSP */
220 bool session_estab; /* mark when very first sess is established */
221#ifdef CONFIG_CIFS_FSCACHE 215#ifdef CONFIG_CIFS_FSCACHE
222 struct fscache_cookie *fscache; /* client index cache cookie */ 216 struct fscache_cookie *fscache; /* client index cache cookie */
223#endif 217#endif
218#ifdef CONFIG_CIFS_STATS2
219 atomic_t inSend; /* requests trying to send */
220 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
221#endif
224}; 222};
225 223
226/* 224/*
225 * Macros to allow the TCP_Server_Info->net field and related code to drop out
226 * when CONFIG_NET_NS isn't set.
227 */
228
229#ifdef CONFIG_NET_NS
230
231static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
232{
233 return srv->net;
234}
235
236static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
237{
238 srv->net = net;
239}
240
241#else
242
243static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
244{
245 return &init_net;
246}
247
248static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
249{
250}
251
252#endif
253
254/*
227 * Session structure. One of these for each uid session with a particular host 255 * Session structure. One of these for each uid session with a particular host
228 */ 256 */
229struct cifsSesInfo { 257struct cifsSesInfo {
@@ -446,11 +474,11 @@ struct cifsInodeInfo {
446 /* BB add in lists for dirty pages i.e. write caching info for oplock */ 474 /* BB add in lists for dirty pages i.e. write caching info for oplock */
447 struct list_head openFileList; 475 struct list_head openFileList;
448 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ 476 __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
449 unsigned long time; /* jiffies of last update/check of inode */ 477 bool clientCanCacheRead; /* read oplock */
450 bool clientCanCacheRead:1; /* read oplock */ 478 bool clientCanCacheAll; /* read and writebehind oplock */
451 bool clientCanCacheAll:1; /* read and writebehind oplock */ 479 bool delete_pending; /* DELETE_ON_CLOSE is set */
452 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 480 bool invalid_mapping; /* pagecache is invalid */
453 bool invalid_mapping:1; /* pagecache is invalid */ 481 unsigned long time; /* jiffies of last update of inode */
454 u64 server_eof; /* current file size on server */ 482 u64 server_eof; /* current file size on server */
455 u64 uniqueid; /* server inode number */ 483 u64 uniqueid; /* server inode number */
456 u64 createtime; /* creation time on server */ 484 u64 createtime; /* creation time on server */
@@ -508,6 +536,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
508 536
509#endif 537#endif
510 538
539struct mid_q_entry;
540
541/*
542 * This is the prototype for the mid callback function. When creating one,
543 * take special care to avoid deadlocks. Things to bear in mind:
544 *
545 * - it will be called by cifsd
546 * - the GlobalMid_Lock will be held
547 * - the mid will be removed from the pending_mid_q list
548 */
549typedef void (mid_callback_t)(struct mid_q_entry *mid);
550
511/* one of these for every pending CIFS request to the server */ 551/* one of these for every pending CIFS request to the server */
512struct mid_q_entry { 552struct mid_q_entry {
513 struct list_head qhead; /* mids waiting on reply from this server */ 553 struct list_head qhead; /* mids waiting on reply from this server */
@@ -519,7 +559,8 @@ struct mid_q_entry {
519 unsigned long when_sent; /* time when smb send finished */ 559 unsigned long when_sent; /* time when smb send finished */
520 unsigned long when_received; /* when demux complete (taken off wire) */ 560 unsigned long when_received; /* when demux complete (taken off wire) */
521#endif 561#endif
522 struct task_struct *tsk; /* task waiting for response */ 562 mid_callback_t *callback; /* call completion callback */
563 void *callback_data; /* general purpose pointer for callback */
523 struct smb_hdr *resp_buf; /* response buffer */ 564 struct smb_hdr *resp_buf; /* response buffer */
524 int midState; /* wish this were enum but can not pass to wait_event */ 565 int midState; /* wish this were enum but can not pass to wait_event */
525 __u8 command; /* smb command code */ 566 __u8 command; /* smb command code */
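
The mid_callback_t typedef is the pivot of this series: instead of cifsd waking a task sleeping in SendReceive(), the demultiplex thread completes each mid by calling its callback, so a request no longer needs a thread parked on it (the SMB echo below is the first fully async user). A minimal callback obeying the constraints listed in the comment, modeled on cifs_echo_callback() later in this diff:

	static void my_mid_callback(struct mid_q_entry *mid)
	{
		struct TCP_Server_Info *server = mid->callback_data;

		/* called from cifsd with GlobalMid_Lock held and the mid
		 * already unlinked from pending_mid_q -- must not sleep */
		DeleteMidQEntry(mid);
		atomic_dec(&server->inFlight);	/* release the request slot */
		wake_up(&server->request_q);	/* let a blocked sender proceed */
	}
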
@@ -613,7 +654,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
613#define MID_REQUEST_SUBMITTED 2 654#define MID_REQUEST_SUBMITTED 2
614#define MID_RESPONSE_RECEIVED 4 655#define MID_RESPONSE_RECEIVED 4
615#define MID_RETRY_NEEDED 8 /* session closed while this request out */ 656#define MID_RETRY_NEEDED 8 /* session closed while this request out */
616#define MID_NO_RESP_NEEDED 0x10 657#define MID_RESPONSE_MALFORMED 0x10
617 658
618/* Types of response buffer returned from SendReceive2 */ 659/* Types of response buffer returned from SendReceive2 */
619#define CIFS_NO_BUFFER 0 /* Response buffer not returned */ 660#define CIFS_NO_BUFFER 0 /* Response buffer not returned */
@@ -622,12 +663,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
622#define CIFS_IOVEC 4 /* array of response buffers */ 663#define CIFS_IOVEC 4 /* array of response buffers */
623 664
624/* Type of Request to SendReceive2 */ 665/* Type of Request to SendReceive2 */
625#define CIFS_STD_OP 0 /* normal request timeout */ 666#define CIFS_BLOCKING_OP 1 /* operation can block */
626#define CIFS_LONG_OP 1 /* long op (up to 45 sec, oplock time) */ 667#define CIFS_ASYNC_OP 2 /* do not wait for response */
627#define CIFS_VLONG_OP 2 /* sloow op - can take up to 180 seconds */ 668#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */
628#define CIFS_BLOCKING_OP 4 /* operation can block */
629#define CIFS_ASYNC_OP 8 /* do not wait for response */
630#define CIFS_TIMEOUT_MASK 0x00F /* only one of 5 above set in req */
631#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */ 669#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */
632#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */ 670#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */
633#define CIFS_NO_RESP 0x040 /* no response buffer required */ 671#define CIFS_NO_RESP 0x040 /* no response buffer required */
@@ -790,6 +828,9 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
790GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ 828GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
791GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ 829GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
792 830
831/* reconnect after this many failed echo attempts */
832GLOBAL_EXTERN unsigned short echo_retries;
833
793void cifs_oplock_break(struct work_struct *work); 834void cifs_oplock_break(struct work_struct *work);
794void cifs_oplock_break_get(struct cifsFileInfo *cfile); 835void cifs_oplock_break_get(struct cifsFileInfo *cfile);
795void cifs_oplock_break_put(struct cifsFileInfo *cfile); 836void cifs_oplock_break_put(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de36b09763a8..b5c8cc5d7a7f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -23,6 +23,7 @@
23#define _CIFSPDU_H 23#define _CIFSPDU_H
24 24
25#include <net/sock.h> 25#include <net/sock.h>
26#include <asm/unaligned.h>
26#include "smbfsctl.h" 27#include "smbfsctl.h"
27 28
28#ifdef CONFIG_CIFS_WEAK_PW_HASH 29#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -50,6 +51,7 @@
50#define SMB_COM_SETATTR 0x09 /* trivial response */ 51#define SMB_COM_SETATTR 0x09 /* trivial response */
51#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */ 52#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */
52#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/ 53#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/
54#define SMB_COM_ECHO 0x2B /* echo request */
53#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */ 55#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */
54#define SMB_COM_READ_ANDX 0x2E 56#define SMB_COM_READ_ANDX 0x2E
55#define SMB_COM_WRITE_ANDX 0x2F 57#define SMB_COM_WRITE_ANDX 0x2F
@@ -425,11 +427,49 @@ struct smb_hdr {
425 __u16 Mid; 427 __u16 Mid;
426 __u8 WordCount; 428 __u8 WordCount;
427} __attribute__((packed)); 429} __attribute__((packed));
428/* given a pointer to an smb_hdr retrieve the value of byte count */ 430
429#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 431/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
430#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount))) 432#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
433 (2 * (smb_var)->WordCount))
434
431/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 435/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
432#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2) 436#define pByteArea(smb_var) (BCC(smb_var) + 2)
437
438/* get the converted ByteCount for an SMB packet and return it */
439static inline __u16
440get_bcc(struct smb_hdr *hdr)
441{
442 __u16 *bc_ptr = (__u16 *)BCC(hdr);
443
444 return get_unaligned(bc_ptr);
445}
446
447/* get the unconverted ByteCount for an SMB packet and return it */
448static inline __u16
449get_bcc_le(struct smb_hdr *hdr)
450{
451 __le16 *bc_ptr = (__le16 *)BCC(hdr);
452
453 return get_unaligned_le16(bc_ptr);
454}
455
456/* set the ByteCount for an SMB packet in host-byte order */
457static inline void
458put_bcc(__u16 count, struct smb_hdr *hdr)
459{
460 __u16 *bc_ptr = (__u16 *)BCC(hdr);
461
462 put_unaligned(count, bc_ptr);
463}
464
465/* set the ByteCount for an SMB packet in little-endian byte order */
466static inline void
467put_bcc_le(__u16 count, struct smb_hdr *hdr)
468{
469 __le16 *bc_ptr = (__le16 *)BCC(hdr);
470
471 put_unaligned_le16(count, bc_ptr);
472}
433 473
434/* 474/*
435 * Computer Name Length (since Netbios name was length 16 with last byte 0x20) 475 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
@@ -760,6 +800,20 @@ typedef struct smb_com_tconx_rsp_ext {
760 * 800 *
761 */ 801 */
762 802
803typedef struct smb_com_echo_req {
804 struct smb_hdr hdr;
805 __le16 EchoCount;
806 __le16 ByteCount;
807 char Data[1];
808} __attribute__((packed)) ECHO_REQ;
809
810typedef struct smb_com_echo_rsp {
811 struct smb_hdr hdr;
812 __le16 SequenceNumber;
813 __le16 ByteCount;
814 char Data[1];
815} __attribute__((packed)) ECHO_RSP;
816
763typedef struct smb_com_logoff_andx_req { 817typedef struct smb_com_logoff_andx_req {
764 struct smb_hdr hdr; /* wct = 2 */ 818 struct smb_hdr hdr; /* wct = 2 */
765 __u8 AndXCommand; 819 __u8 AndXCommand;
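
BCC() is demoted from an lvalue you could assign through to a plain byte pointer, and the get_bcc()/put_bcc() inlines (plus their _le siblings) take over the actual 16-bit access via asm/unaligned.h, since the byte count sits at a WordCount-dependent, often odd, offset. The two ECHO PDUs defined above are exercised by CIFSSMBEcho() later in this diff; condensed, the request is built like this:

	ECHO_REQ *smb;	/* obtained from small_smb_init(SMB_COM_ECHO, ...) */

	smb->hdr.WordCount = 1;
	put_unaligned_le16(1, &smb->EchoCount);	/* echo the payload once */
	put_bcc_le(1, &smb->hdr);		/* ByteCount: one data byte */
	smb->Data[0] = 'a';
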
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e6d1481b16c1..8096f27ad9a8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -61,6 +61,12 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata,
61 const char *fullpath, const struct dfs_info3_param *ref, 61 const char *fullpath, const struct dfs_info3_param *ref,
62 char **devname); 62 char **devname);
63/* extern void renew_parental_timestamps(struct dentry *direntry);*/ 63/* extern void renew_parental_timestamps(struct dentry *direntry);*/
64extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
65 struct TCP_Server_Info *server);
66extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
67extern int cifs_call_async(struct TCP_Server_Info *server,
68 struct smb_hdr *in_buf, mid_callback_t *callback,
69 void *cbdata);
64extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, 70extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
65 struct smb_hdr * /* input */ , 71 struct smb_hdr * /* input */ ,
66 struct smb_hdr * /* out */ , 72 struct smb_hdr * /* out */ ,
@@ -79,6 +85,8 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
79extern bool is_valid_oplock_break(struct smb_hdr *smb, 85extern bool is_valid_oplock_break(struct smb_hdr *smb,
80 struct TCP_Server_Info *); 86 struct TCP_Server_Info *);
81extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 87extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
88extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
89 unsigned int bytes_written);
82extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); 90extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); 91extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
84extern unsigned int smbCalcSize(struct smb_hdr *ptr); 92extern unsigned int smbCalcSize(struct smb_hdr *ptr);
@@ -347,12 +355,13 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
347 const __u16 netfid, const __u64 len, 355 const __u16 netfid, const __u64 len,
348 const __u64 offset, const __u32 numUnlock, 356 const __u64 offset, const __u32 numUnlock,
349 const __u32 numLock, const __u8 lockType, 357 const __u32 numLock, const __u8 lockType,
350 const bool waitFlag); 358 const bool waitFlag, const __u8 oplock_level);
351extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, 359extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
352 const __u16 smb_file_id, const int get_flag, 360 const __u16 smb_file_id, const int get_flag,
353 const __u64 len, struct file_lock *, 361 const __u64 len, struct file_lock *,
354 const __u16 lock_type, const bool waitFlag); 362 const __u16 lock_type, const bool waitFlag);
355extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); 363extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
364extern int CIFSSMBEcho(struct TCP_Server_Info *server);
356extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); 365extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
357 366
358extern struct cifsSesInfo *sesInfoAlloc(void); 367extern struct cifsSesInfo *sesInfoAlloc(void);
@@ -366,7 +375,7 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
366extern int cifs_verify_signature(struct smb_hdr *, 375extern int cifs_verify_signature(struct smb_hdr *,
367 struct TCP_Server_Info *server, 376 struct TCP_Server_Info *server,
368 __u32 expected_sequence_number); 377 __u32 expected_sequence_number);
369extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); 378extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
370extern int setup_ntlm_response(struct cifsSesInfo *); 379extern int setup_ntlm_response(struct cifsSesInfo *);
371extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *); 380extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
372extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); 381extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
@@ -416,4 +425,11 @@ extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
416extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, 425extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
417 const unsigned char *path, 426 const unsigned char *path,
418 struct cifs_sb_info *cifs_sb, int xid); 427 struct cifs_sb_info *cifs_sb, int xid);
428extern int mdfour(unsigned char *, unsigned char *, int);
429extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
430extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
431 unsigned char *p24);
432extern void E_P16(unsigned char *p14, unsigned char *p16);
433extern void E_P24(unsigned char *p21, const unsigned char *c8,
434 unsigned char *p24);
419#endif /* _CIFSPROTO_H */ 435#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2f6795e524d3..904aa47e3515 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -136,9 +136,6 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
136 } 136 }
137 } 137 }
138 138
139 if (ses->status == CifsExiting)
140 return -EIO;
141
142 /* 139 /*
143 * Give demultiplex thread up to 10 seconds to reconnect, should be 140 * Give demultiplex thread up to 10 seconds to reconnect, should be
144 * greater than cifs socket timeout which is 7 seconds 141 * greater than cifs socket timeout which is 7 seconds
@@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * retrying until process is killed or server comes 153 * retrying until process is killed or server comes
157 * back on-line 154 * back on-line
158 */ 155 */
159 if (!tcon->retry || ses->status == CifsExiting) { 156 if (!tcon->retry) {
160 cFYI(1, "gave up waiting on reconnect in smb_init"); 157 cFYI(1, "gave up waiting on reconnect in smb_init");
161 return -EHOSTDOWN; 158 return -EHOSTDOWN;
162 } 159 }
@@ -331,37 +328,35 @@ smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
331 328
332static int validate_t2(struct smb_t2_rsp *pSMB) 329static int validate_t2(struct smb_t2_rsp *pSMB)
333{ 330{
334 int rc = -EINVAL; 331 unsigned int total_size;
335 int total_size; 332
336 char *pBCC; 333 /* check for plausible wct */
334 if (pSMB->hdr.WordCount < 10)
335 goto vt2_err;
337 336
338 /* check for plausible wct, bcc and t2 data and parm sizes */
339 /* check for parm and data offset going beyond end of smb */ 337 /* check for parm and data offset going beyond end of smb */
340 if (pSMB->hdr.WordCount >= 10) { 338 if (get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 ||
341 if ((le16_to_cpu(pSMB->t2_rsp.ParameterOffset) <= 1024) && 339 get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
342 (le16_to_cpu(pSMB->t2_rsp.DataOffset) <= 1024)) { 340 goto vt2_err;
343 /* check that bcc is at least as big as parms + data */ 341
344 /* check that bcc is less than negotiated smb buffer */ 342 /* check that bcc is at least as big as parms + data */
345 total_size = le16_to_cpu(pSMB->t2_rsp.ParameterCount); 343 /* check that bcc is less than negotiated smb buffer */
346 if (total_size < 512) { 344 total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
347 total_size += 345 if (total_size >= 512)
348 le16_to_cpu(pSMB->t2_rsp.DataCount); 346 goto vt2_err;
349 /* BCC le converted in SendReceive */ 347
350 pBCC = (pSMB->hdr.WordCount * 2) + 348 total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
351 sizeof(struct smb_hdr) + 349 if (total_size > get_bcc(&pSMB->hdr) ||
352 (char *)pSMB; 350 total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
353 if ((total_size <= (*(u16 *)pBCC)) && 351 goto vt2_err;
354 (total_size < 352
355 CIFSMaxBufSize+MAX_CIFS_HDR_SIZE)) { 353 return 0;
356 return 0; 354vt2_err:
357 }
358 }
359 }
360 }
361 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB, 355 cifs_dump_mem("Invalid transact2 SMB: ", (char *)pSMB,
362 sizeof(struct smb_t2_rsp) + 16); 356 sizeof(struct smb_t2_rsp) + 16);
363 return rc; 357 return -EINVAL;
364} 358}
359
365int 360int
366CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) 361CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
367{ 362{
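
The validate_t2() rewrite replaces four nested ifs with straight-line checks that bail to a single vt2_err label, and reads every wire field with get_unaligned_le16()/get_bcc() instead of casting into the packet (the old *(u16 *)pBCC load was unaligned and relied on SendReceive having byte-swapped BCC in place). Condensed, the sanity rules it now enforces:

	/* restatement, using the field names above */
	if (pSMB->hdr.WordCount < 10 ||				/* too short for a T2 rsp */
	    get_unaligned_le16(&pSMB->t2_rsp.ParameterOffset) > 1024 ||
	    get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
		return -EINVAL;				/* offsets past any sane SMB */
	/* then: ParameterCount < 512, and parms + data must fit inside
	 * both the byte count and the negotiated buffer size */
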
@@ -452,7 +447,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
452 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), 447 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
453 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 448 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
454 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); 449 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
455 GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
456 /* even though we do not use raw we might as well set this 450 /* even though we do not use raw we might as well set this
457 accurately, in case we ever find a need for it */ 451 accurately, in case we ever find a need for it */
458 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { 452 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
@@ -566,7 +560,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
566 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 560 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
567 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); 561 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
568 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); 562 cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
569 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
570 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 563 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
571 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 564 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
572 server->timeAdj *= 60; 565 server->timeAdj *= 60;
@@ -706,6 +699,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
706 return rc; 699 return rc;
707} 700}
708 701
702/*
703 * This is a no-op for now. We're not really interested in the reply, but
704 * rather in the fact that the server sent one and that server->lstrp
705 * gets updated.
706 *
707 * FIXME: maybe we should consider checking that the reply matches the request?
708 */
709static void
710cifs_echo_callback(struct mid_q_entry *mid)
711{
712 struct TCP_Server_Info *server = mid->callback_data;
713
714 DeleteMidQEntry(mid);
715 atomic_dec(&server->inFlight);
716 wake_up(&server->request_q);
717}
718
719int
720CIFSSMBEcho(struct TCP_Server_Info *server)
721{
722 ECHO_REQ *smb;
723 int rc = 0;
724
725 cFYI(1, "In echo request");
726
727 rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
728 if (rc)
729 return rc;
730
731 /* set up echo request */
732 smb->hdr.Tid = cpu_to_le16(0xffff);
733 smb->hdr.WordCount = 1;
734 put_unaligned_le16(1, &smb->EchoCount);
735 put_bcc_le(1, &smb->hdr);
736 smb->Data[0] = 'a';
737 smb->hdr.smb_buf_length += 3;
738
739 rc = cifs_call_async(server, (struct smb_hdr *)smb,
740 cifs_echo_callback, server);
741 if (rc)
742 cFYI(1, "Echo request failed: %d", rc);
743
744 cifs_small_buf_release(smb);
745
746 return rc;
747}
748
709int 749int
710CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses) 750CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
711{ 751{
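
CIFSSMBEcho() is fire-and-forget: cifs_call_async() queues the mid and returns, and cifs_echo_callback() only has to free the mid and release the inFlight slot. The periodic ping is driven by the per-server delayed work added to TCP_Server_Info above (struct delayed_work echo) at SMB_ECHO_INTERVAL; a sketch of that work function, with the name and the requeue call assumed here (the real one lives in connect.c):

	static void cifs_echo_request(struct work_struct *work)
	{
		struct TCP_Server_Info *server = container_of(work,
					struct TCP_Server_Info, echo.work);
		int rc;

		rc = CIFSSMBEcho(server);
		if (rc)
			cFYI(1, "Unable to send echo request to server: %d", rc);

		/* rearm for the next interval */
		schedule_delayed_work(&server->echo, SMB_ECHO_INTERVAL);
	}
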
@@ -1193,7 +1233,7 @@ OldOpenRetry:
1193 pSMB->ByteCount = cpu_to_le16(count); 1233 pSMB->ByteCount = cpu_to_le16(count);
1194 /* long_op set to 1 to allow for oplock break timeouts */ 1234 /* long_op set to 1 to allow for oplock break timeouts */
1195 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1235 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1196 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1236 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1197 cifs_stats_inc(&tcon->num_opens); 1237 cifs_stats_inc(&tcon->num_opens);
1198 if (rc) { 1238 if (rc) {
1199 cFYI(1, "Error in Open = %d", rc); 1239 cFYI(1, "Error in Open = %d", rc);
@@ -1306,7 +1346,7 @@ openRetry:
1306 pSMB->ByteCount = cpu_to_le16(count); 1346 pSMB->ByteCount = cpu_to_le16(count);
1307 /* long_op set to 1 to allow for oplock break timeouts */ 1347 /* long_op set to 1 to allow for oplock break timeouts */
1308 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 1348 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
1309 (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP); 1349 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
1310 cifs_stats_inc(&tcon->num_opens); 1350 cifs_stats_inc(&tcon->num_opens);
1311 if (rc) { 1351 if (rc) {
1312 cFYI(1, "Error in Open = %d", rc); 1352 cFYI(1, "Error in Open = %d", rc);
@@ -1388,7 +1428,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1388 iov[0].iov_base = (char *)pSMB; 1428 iov[0].iov_base = (char *)pSMB;
1389 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; 1429 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
1390 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, 1430 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
1391 &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR); 1431 &resp_buf_type, CIFS_LOG_ERROR);
1392 cifs_stats_inc(&tcon->num_reads); 1432 cifs_stats_inc(&tcon->num_reads);
1393 pSMBr = (READ_RSP *)iov[0].iov_base; 1433 pSMBr = (READ_RSP *)iov[0].iov_base;
1394 if (rc) { 1434 if (rc) {
@@ -1663,7 +1703,8 @@ int
1663CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, 1703CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1664 const __u16 smb_file_id, const __u64 len, 1704 const __u16 smb_file_id, const __u64 len,
1665 const __u64 offset, const __u32 numUnlock, 1705 const __u64 offset, const __u32 numUnlock,
1666 const __u32 numLock, const __u8 lockType, const bool waitFlag) 1706 const __u32 numLock, const __u8 lockType,
1707 const bool waitFlag, const __u8 oplock_level)
1667{ 1708{
1668 int rc = 0; 1709 int rc = 0;
1669 LOCK_REQ *pSMB = NULL; 1710 LOCK_REQ *pSMB = NULL;
@@ -1691,6 +1732,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
1691 pSMB->NumberOfLocks = cpu_to_le16(numLock); 1732 pSMB->NumberOfLocks = cpu_to_le16(numLock);
1692 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); 1733 pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock);
1693 pSMB->LockType = lockType; 1734 pSMB->LockType = lockType;
1735 pSMB->OplockLevel = oplock_level;
1694 pSMB->AndXCommand = 0xFF; /* none */ 1736 pSMB->AndXCommand = 0xFF; /* none */
1695 pSMB->Fid = smb_file_id; /* netfid stays le */ 1737 pSMB->Fid = smb_file_id; /* netfid stays le */
1696 1738
@@ -3087,7 +3129,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3087 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; 3129 iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
3088 3130
3089 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, 3131 rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
3090 CIFS_STD_OP); 3132 0);
3091 cifs_stats_inc(&tcon->num_acl_get); 3133 cifs_stats_inc(&tcon->num_acl_get);
3092 if (rc) { 3134 if (rc) {
3093 cFYI(1, "Send error in QuerySecDesc = %d", rc); 3135 cFYI(1, "Send error in QuerySecDesc = %d", rc);
@@ -4869,7 +4911,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4869 __u16 fid, __u32 pid_of_opener, bool SetAllocation) 4911 __u16 fid, __u32 pid_of_opener, bool SetAllocation)
4870{ 4912{
4871 struct smb_com_transaction2_sfi_req *pSMB = NULL; 4913 struct smb_com_transaction2_sfi_req *pSMB = NULL;
4872 char *data_offset;
4873 struct file_end_of_file_info *parm_data; 4914 struct file_end_of_file_info *parm_data;
4874 int rc = 0; 4915 int rc = 0;
4875 __u16 params, param_offset, offset, byte_count, count; 4916 __u16 params, param_offset, offset, byte_count, count;
@@ -4893,8 +4934,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4893 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; 4934 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4894 offset = param_offset + params; 4935 offset = param_offset + params;
4895 4936
4896 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4897
4898 count = sizeof(struct file_end_of_file_info); 4937 count = sizeof(struct file_end_of_file_info);
4899 pSMB->MaxParameterCount = cpu_to_le16(2); 4938 pSMB->MaxParameterCount = cpu_to_le16(2);
4900 /* BB find exact max SMB PDU from sess structure BB */ 4939 /* BB find exact max SMB PDU from sess structure BB */
@@ -5562,7 +5601,7 @@ QAllEAsRetry:
5562 } 5601 }
5563 5602
5564 /* make sure list_len doesn't go past end of SMB */ 5603 /* make sure list_len doesn't go past end of SMB */
5565 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); 5604 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
5566 if ((char *)ea_response_data + list_len > end_of_smb) { 5605 if ((char *)ea_response_data + list_len > end_of_smb) {
5567 cFYI(1, "EA list appears to go beyond SMB"); 5606 cFYI(1, "EA list appears to go beyond SMB");
5568 rc = -EIO; 5607 rc = -EIO;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a65d311d163a..8d6c17ab593d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -52,8 +52,8 @@
52#define CIFS_PORT 445 52#define CIFS_PORT 445
53#define RFC1001_PORT 139 53#define RFC1001_PORT 139
54 54
55extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 55/* SMB echo "timeout" -- FIXME: tunable? */
56 unsigned char *p24); 56#define SMB_ECHO_INTERVAL (60 * HZ)
57 57
58extern mempool_t *cifs_req_poolp; 58extern mempool_t *cifs_req_poolp;
59 59
@@ -84,6 +84,7 @@ struct smb_vol {
84 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ 84 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
85 bool server_ino:1; /* use inode numbers from server ie UniqueId */ 85 bool server_ino:1; /* use inode numbers from server ie UniqueId */
86 bool direct_io:1; 86 bool direct_io:1;
87 bool strict_io:1; /* strict cache behavior */
87 bool remap:1; /* set to remap seven reserved chars in filenames */ 88 bool remap:1; /* set to remap seven reserved chars in filenames */
88 bool posix_paths:1; /* unset to not ask for posix pathnames. */ 89 bool posix_paths:1; /* unset to not ask for posix pathnames. */
89 bool no_linux_ext:1; 90 bool no_linux_ext:1;
@@ -152,6 +153,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
152 153
153 /* before reconnecting the tcp session, mark the smb session (uid) 154 /* before reconnecting the tcp session, mark the smb session (uid)
154 and the tid bad so they are not used until reconnected */ 155 and the tid bad so they are not used until reconnected */
156 cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
155 spin_lock(&cifs_tcp_ses_lock); 157 spin_lock(&cifs_tcp_ses_lock);
156 list_for_each(tmp, &server->smb_ses_list) { 158 list_for_each(tmp, &server->smb_ses_list) {
157 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); 159 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
@@ -163,7 +165,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
163 } 165 }
164 } 166 }
165 spin_unlock(&cifs_tcp_ses_lock); 167 spin_unlock(&cifs_tcp_ses_lock);
168
166 /* do not want to be sending data on a socket we are freeing */ 169 /* do not want to be sending data on a socket we are freeing */
170 cFYI(1, "%s: tearing down socket", __func__);
167 mutex_lock(&server->srv_mutex); 171 mutex_lock(&server->srv_mutex);
168 if (server->ssocket) { 172 if (server->ssocket) {
169 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state, 173 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
@@ -180,22 +184,20 @@ cifs_reconnect(struct TCP_Server_Info *server)
180 kfree(server->session_key.response); 184 kfree(server->session_key.response);
181 server->session_key.response = NULL; 185 server->session_key.response = NULL;
182 server->session_key.len = 0; 186 server->session_key.len = 0;
187 server->lstrp = jiffies;
188 mutex_unlock(&server->srv_mutex);
183 189
190 /* mark submitted MIDs for retry and issue callback */
191 cFYI(1, "%s: issuing mid callbacks", __func__);
184 spin_lock(&GlobalMid_Lock); 192 spin_lock(&GlobalMid_Lock);
185 list_for_each(tmp, &server->pending_mid_q) { 193 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
186 mid_entry = list_entry(tmp, struct 194 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
187 mid_q_entry, 195 if (mid_entry->midState == MID_REQUEST_SUBMITTED)
188 qhead);
189 if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
190 /* Mark other intransit requests as needing
191 retry so we do not immediately mark the
192 session bad again (ie after we reconnect
193 below) as they timeout too */
194 mid_entry->midState = MID_RETRY_NEEDED; 196 mid_entry->midState = MID_RETRY_NEEDED;
195 } 197 list_del_init(&mid_entry->qhead);
198 mid_entry->callback(mid_entry);
196 } 199 }
197 spin_unlock(&GlobalMid_Lock); 200 spin_unlock(&GlobalMid_Lock);
198 mutex_unlock(&server->srv_mutex);
199 201
200 while ((server->tcpStatus != CifsExiting) && 202 while ((server->tcpStatus != CifsExiting) &&
201 (server->tcpStatus != CifsGood)) { 203 (server->tcpStatus != CifsGood)) {
@@ -212,10 +214,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
212 if (server->tcpStatus != CifsExiting) 214 if (server->tcpStatus != CifsExiting)
213 server->tcpStatus = CifsGood; 215 server->tcpStatus = CifsGood;
214 spin_unlock(&GlobalMid_Lock); 216 spin_unlock(&GlobalMid_Lock);
215 /* atomic_set(&server->inFlight,0);*/
216 wake_up(&server->response_q);
217 } 217 }
218 } 218 }
219
219 return rc; 220 return rc;
220} 221}
221 222
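cifs_reconnect() no longer merely flags in-flight MIDs as MID_RETRY_NEEDED and leaves them on the queue for their sleeping owners; it unlinks each entry and invokes its callback (hence list_for_each_safe). A sketch of the contract this assumes, with illustrative names, since the callback field and its synchronous default are introduced elsewhere in this series:

	/* Illustrative: for a synchronous request the callback just
	 * wakes the task that queued the MID, which then inspects
	 * mid->midState (MID_RETRY_NEEDED after a reconnect). */
	static void wake_up_task(struct mid_q_entry *mid)
	{
		wake_up_process(mid->callback_data);
	}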
@@ -229,9 +230,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
229static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) 230static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
230{ 231{
231 struct smb_t2_rsp *pSMBt; 232 struct smb_t2_rsp *pSMBt;
232 int total_data_size;
233 int data_in_this_rsp;
234 int remaining; 233 int remaining;
234 __u16 total_data_size, data_in_this_rsp;
235 235
236 if (pSMB->Command != SMB_COM_TRANSACTION2) 236 if (pSMB->Command != SMB_COM_TRANSACTION2)
237 return 0; 237 return 0;
@@ -245,8 +245,8 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
245 245
246 pSMBt = (struct smb_t2_rsp *)pSMB; 246 pSMBt = (struct smb_t2_rsp *)pSMB;
247 247
248 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 248 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
249 data_in_this_rsp = le16_to_cpu(pSMBt->t2_rsp.DataCount); 249 data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
250 250
251 remaining = total_data_size - data_in_this_rsp; 251 remaining = total_data_size - data_in_this_rsp;
252 252
@@ -272,21 +272,18 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
272{ 272{
273 struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond; 273 struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
274 struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB; 274 struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
275 int total_data_size;
276 int total_in_buf;
277 int remaining;
278 int total_in_buf2;
279 char *data_area_of_target; 275 char *data_area_of_target;
280 char *data_area_of_buf2; 276 char *data_area_of_buf2;
281 __u16 byte_count; 277 int remaining;
278 __u16 byte_count, total_data_size, total_in_buf, total_in_buf2;
282 279
283 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 280 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
284 281
285 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) { 282 if (total_data_size !=
283 get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
286 cFYI(1, "total data size of primary and secondary t2 differ"); 284 cFYI(1, "total data size of primary and secondary t2 differ");
287 }
288 285
289 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount); 286 total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
290 287
291 remaining = total_data_size - total_in_buf; 288 remaining = total_data_size - total_in_buf;
292 289
@@ -296,28 +293,28 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
296 if (remaining == 0) /* nothing to do, ignore */ 293 if (remaining == 0) /* nothing to do, ignore */
297 return 0; 294 return 0;
298 295
299 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount); 296 total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
300 if (remaining < total_in_buf2) { 297 if (remaining < total_in_buf2) {
301 cFYI(1, "transact2 2nd response contains too much data"); 298 cFYI(1, "transact2 2nd response contains too much data");
302 } 299 }
303 300
304 /* find end of first SMB data area */ 301 /* find end of first SMB data area */
305 data_area_of_target = (char *)&pSMBt->hdr.Protocol + 302 data_area_of_target = (char *)&pSMBt->hdr.Protocol +
306 le16_to_cpu(pSMBt->t2_rsp.DataOffset); 303 get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
307 /* validate target area */ 304 /* validate target area */
308 305
309 data_area_of_buf2 = (char *) &pSMB2->hdr.Protocol + 306 data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
310 le16_to_cpu(pSMB2->t2_rsp.DataOffset); 307 get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
311 308
312 data_area_of_target += total_in_buf; 309 data_area_of_target += total_in_buf;
313 310
314 /* copy second buffer into end of first buffer */ 311 /* copy second buffer into end of first buffer */
315 memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); 312 memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
316 total_in_buf += total_in_buf2; 313 total_in_buf += total_in_buf2;
317 pSMBt->t2_rsp.DataCount = cpu_to_le16(total_in_buf); 314 put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
318 byte_count = le16_to_cpu(BCC_LE(pTargetSMB)); 315 byte_count = get_bcc_le(pTargetSMB);
319 byte_count += total_in_buf2; 316 byte_count += total_in_buf2;
320 BCC_LE(pTargetSMB) = cpu_to_le16(byte_count); 317 put_bcc_le(byte_count, pTargetSMB);
321 318
322 byte_count = pTargetSMB->smb_buf_length; 319 byte_count = pTargetSMB->smb_buf_length;
323 byte_count += total_in_buf2; 320 byte_count += total_in_buf2;
@@ -331,7 +328,31 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
331 return 0; /* we are done */ 328 return 0; /* we are done */
332 } else /* more responses to go */ 329 } else /* more responses to go */
333 return 1; 330 return 1;
331}
332
333static void
334cifs_echo_request(struct work_struct *work)
335{
336 int rc;
337 struct TCP_Server_Info *server = container_of(work,
338 struct TCP_Server_Info, echo.work);
339
340 /*
341 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is
342 * done, which is indicated by maxBuf != 0. Also, no need to ping if
343 * we got a response recently
344 */
345 if (server->maxBuf == 0 ||
346 time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
347 goto requeue_echo;
348
349 rc = CIFSSMBEcho(server);
350 if (rc)
351 cFYI(1, "Unable to send echo request to server: %s",
352 server->hostname);
334 353
354requeue_echo:
355 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
335} 356}
336 357
337static int 358static int
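cifs_echo_request() is a self-requeueing delayed work item: every pass, whether it pings or not, ends by queueing itself again, so it fires roughly once per SMB_ECHO_INTERVAL. The "- HZ" in the time_before() check leaves one second of slack so a reply received just under an interval ago still suppresses the ping. Schematically:

	/*
	 * queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL)
	 *       |                      (fires after ~60s)
	 *       v
	 * cifs_echo_request(): skip unless negotiated (maxBuf != 0)
	 *       |               and no response within the last ~59s
	 *       v
	 * CIFSSMBEcho(server), then requeue for the next interval
	 */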
@@ -345,8 +366,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
345 struct msghdr smb_msg; 366 struct msghdr smb_msg;
346 struct kvec iov; 367 struct kvec iov;
347 struct socket *csocket = server->ssocket; 368 struct socket *csocket = server->ssocket;
348 struct list_head *tmp; 369 struct list_head *tmp, *tmp2;
349 struct cifsSesInfo *ses;
350 struct task_struct *task_to_wake = NULL; 370 struct task_struct *task_to_wake = NULL;
351 struct mid_q_entry *mid_entry; 371 struct mid_q_entry *mid_entry;
352 char temp; 372 char temp;
@@ -399,7 +419,20 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
399 smb_msg.msg_control = NULL; 419 smb_msg.msg_control = NULL;
400 smb_msg.msg_controllen = 0; 420 smb_msg.msg_controllen = 0;
401 pdu_length = 4; /* enough to get RFC1001 header */ 421 pdu_length = 4; /* enough to get RFC1001 header */
422
402incomplete_rcv: 423incomplete_rcv:
424 if (echo_retries > 0 &&
425 time_after(jiffies, server->lstrp +
426 (echo_retries * SMB_ECHO_INTERVAL))) {
427 cERROR(1, "Server %s has not responded in %d seconds. "
428 "Reconnecting...", server->hostname,
429 (echo_retries * SMB_ECHO_INTERVAL / HZ));
430 cifs_reconnect(server);
431 csocket = server->ssocket;
432 wake_up(&server->response_q);
433 continue;
434 }
435
403 length = 436 length =
404 kernel_recvmsg(csocket, &smb_msg, 437 kernel_recvmsg(csocket, &smb_msg,
405 &iov, 1, pdu_length, 0 /* BB other flags? */); 438 &iov, 1, pdu_length, 0 /* BB other flags? */);
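The check above gives up on a server once nothing has been received for echo_retries consecutive echo intervals. echo_retries is a module parameter introduced elsewhere in this series; assuming its default is 5 (an assumption, not shown in this hunk), the numbers work out as:

	/* lstrp + 5 * (60 * HZ) jiffies = 300 seconds of silence, so the
	 * cERROR above would report "has not responded in 300 seconds"
	 * before cifs_reconnect() tears the session down. */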
@@ -550,25 +583,36 @@ incomplete_rcv:
550 else if (reconnect == 1) 583 else if (reconnect == 1)
551 continue; 584 continue;
552 585
553 length += 4; /* account for rfc1002 hdr */ 586 total_read += 4; /* account for rfc1002 hdr */
554 587
588 dump_smb(smb_buffer, total_read);
555 589
556 dump_smb(smb_buffer, length); 590 /*
557 if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) { 591 * We know that we received enough to get to the MID as we
558 cifs_dump_mem("Bad SMB: ", smb_buffer, 48); 592 * checked the pdu_length earlier. Now check to see
559 continue; 593 * if the rest of the header is OK. We borrow the length
560 } 594 * var for the rest of the loop to avoid a new stack var.
595 *
596 * 48 bytes is enough to display the header and a little bit
597 * into the payload for debugging purposes.
598 */
599 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
600 if (length != 0)
601 cifs_dump_mem("Bad SMB: ", smb_buffer,
602 min_t(unsigned int, total_read, 48));
561 603
604 mid_entry = NULL;
605 server->lstrp = jiffies;
562 606
563 task_to_wake = NULL;
564 spin_lock(&GlobalMid_Lock); 607 spin_lock(&GlobalMid_Lock);
565 list_for_each(tmp, &server->pending_mid_q) { 608 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
566 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 609 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
567 610
568 if ((mid_entry->mid == smb_buffer->Mid) && 611 if ((mid_entry->mid == smb_buffer->Mid) &&
569 (mid_entry->midState == MID_REQUEST_SUBMITTED) && 612 (mid_entry->midState == MID_REQUEST_SUBMITTED) &&
570 (mid_entry->command == smb_buffer->Command)) { 613 (mid_entry->command == smb_buffer->Command)) {
571 if (check2ndT2(smb_buffer,server->maxBuf) > 0) { 614 if (length == 0 &&
615 check2ndT2(smb_buffer, server->maxBuf) > 0) {
572 /* We have a multipart transact2 resp */ 616 /* We have a multipart transact2 resp */
573 isMultiRsp = true; 617 isMultiRsp = true;
574 if (mid_entry->resp_buf) { 618 if (mid_entry->resp_buf) {
@@ -603,20 +647,24 @@ incomplete_rcv:
603 mid_entry->resp_buf = smb_buffer; 647 mid_entry->resp_buf = smb_buffer;
604 mid_entry->largeBuf = isLargeBuf; 648 mid_entry->largeBuf = isLargeBuf;
605multi_t2_fnd: 649multi_t2_fnd:
606 task_to_wake = mid_entry->tsk; 650 if (length == 0)
607 mid_entry->midState = MID_RESPONSE_RECEIVED; 651 mid_entry->midState =
652 MID_RESPONSE_RECEIVED;
653 else
654 mid_entry->midState =
655 MID_RESPONSE_MALFORMED;
608#ifdef CONFIG_CIFS_STATS2 656#ifdef CONFIG_CIFS_STATS2
609 mid_entry->when_received = jiffies; 657 mid_entry->when_received = jiffies;
610#endif 658#endif
611 /* so we do not time out requests to server 659 list_del_init(&mid_entry->qhead);
612 which is still responding (since server could 660 mid_entry->callback(mid_entry);
613 be busy but not dead) */
614 server->lstrp = jiffies;
615 break; 661 break;
616 } 662 }
663 mid_entry = NULL;
617 } 664 }
618 spin_unlock(&GlobalMid_Lock); 665 spin_unlock(&GlobalMid_Lock);
619 if (task_to_wake) { 666
667 if (mid_entry != NULL) {
620 /* Was previous buf put in mpx struct for multi-rsp? */ 668 /* Was previous buf put in mpx struct for multi-rsp? */
621 if (!isMultiRsp) { 669 if (!isMultiRsp) {
622 /* smb buffer will be freed by user thread */ 670 /* smb buffer will be freed by user thread */
@@ -625,11 +673,13 @@ multi_t2_fnd:
625 else 673 else
626 smallbuf = NULL; 674 smallbuf = NULL;
627 } 675 }
628 wake_up_process(task_to_wake); 676 } else if (length != 0) {
677 /* response sanity checks failed */
678 continue;
629 } else if (!is_valid_oplock_break(smb_buffer, server) && 679 } else if (!is_valid_oplock_break(smb_buffer, server) &&
630 !isMultiRsp) { 680 !isMultiRsp) {
631 cERROR(1, "No task to wake, unknown frame received! " 681 cERROR(1, "No task to wake, unknown frame received! "
632 "NumMids %d", midCount.counter); 682 "NumMids %d", atomic_read(&midCount));
633 cifs_dump_mem("Received Data is: ", (char *)smb_buffer, 683 cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
634 sizeof(struct smb_hdr)); 684 sizeof(struct smb_hdr));
635#ifdef CONFIG_CIFS_DEBUG2 685#ifdef CONFIG_CIFS_DEBUG2
@@ -677,44 +727,16 @@ multi_t2_fnd:
677 if (smallbuf) /* no sense logging a debug message if NULL */ 727 if (smallbuf) /* no sense logging a debug message if NULL */
678 cifs_small_buf_release(smallbuf); 728 cifs_small_buf_release(smallbuf);
679 729
680 /* 730 if (!list_empty(&server->pending_mid_q)) {
681 * BB: we shouldn't have to do any of this. It shouldn't be
682 * possible to exit from the thread with active SMB sessions
683 */
684 spin_lock(&cifs_tcp_ses_lock);
685 if (list_empty(&server->pending_mid_q)) {
686 /* loop through server session structures attached to this and
687 mark them dead */
688 list_for_each(tmp, &server->smb_ses_list) {
689 ses = list_entry(tmp, struct cifsSesInfo,
690 smb_ses_list);
691 ses->status = CifsExiting;
692 ses->server = NULL;
693 }
694 spin_unlock(&cifs_tcp_ses_lock);
695 } else {
 696 /* although we cannot zero the server struct pointer yet,
 697 since there are active requests which may depend on them,
698 mark the corresponding SMB sessions as exiting too */
699 list_for_each(tmp, &server->smb_ses_list) {
700 ses = list_entry(tmp, struct cifsSesInfo,
701 smb_ses_list);
702 ses->status = CifsExiting;
703 }
704
705 spin_lock(&GlobalMid_Lock); 731 spin_lock(&GlobalMid_Lock);
706 list_for_each(tmp, &server->pending_mid_q) { 732 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
707 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 733 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
708 if (mid_entry->midState == MID_REQUEST_SUBMITTED) { 734 cFYI(1, "Clearing Mid 0x%x - issuing callback",
709 cFYI(1, "Clearing Mid 0x%x - waking up ",
710 mid_entry->mid); 735 mid_entry->mid);
711 task_to_wake = mid_entry->tsk; 736 list_del_init(&mid_entry->qhead);
712 if (task_to_wake) 737 mid_entry->callback(mid_entry);
713 wake_up_process(task_to_wake);
714 }
715 } 738 }
716 spin_unlock(&GlobalMid_Lock); 739 spin_unlock(&GlobalMid_Lock);
717 spin_unlock(&cifs_tcp_ses_lock);
718 /* 1/8th of sec is more than enough time for them to exit */ 740 /* 1/8th of sec is more than enough time for them to exit */
719 msleep(125); 741 msleep(125);
720 } 742 }
@@ -732,18 +754,6 @@ multi_t2_fnd:
732 coming home not much else we can do but free the memory */ 754 coming home not much else we can do but free the memory */
733 } 755 }
734 756
735 /* last chance to mark ses pointers invalid
736 if there are any pointing to this (e.g
737 if a crazy root user tried to kill cifsd
738 kernel thread explicitly this might happen) */
739 /* BB: This shouldn't be necessary, see above */
740 spin_lock(&cifs_tcp_ses_lock);
741 list_for_each(tmp, &server->smb_ses_list) {
742 ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
743 ses->server = NULL;
744 }
745 spin_unlock(&cifs_tcp_ses_lock);
746
747 kfree(server->hostname); 757 kfree(server->hostname);
748 task_to_wake = xchg(&server->tsk, NULL); 758 task_to_wake = xchg(&server->tsk, NULL);
749 kfree(server); 759 kfree(server);
@@ -1113,6 +1123,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1113 } else if (!strnicmp(data, "uid", 3) && value && *value) { 1123 } else if (!strnicmp(data, "uid", 3) && value && *value) {
1114 vol->linux_uid = simple_strtoul(value, &value, 0); 1124 vol->linux_uid = simple_strtoul(value, &value, 0);
1115 uid_specified = true; 1125 uid_specified = true;
1126 } else if (!strnicmp(data, "cruid", 5) && value && *value) {
1127 vol->cred_uid = simple_strtoul(value, &value, 0);
1116 } else if (!strnicmp(data, "forceuid", 8)) { 1128 } else if (!strnicmp(data, "forceuid", 8)) {
1117 override_uid = 1; 1129 override_uid = 1;
1118 } else if (!strnicmp(data, "noforceuid", 10)) { 1130 } else if (!strnicmp(data, "noforceuid", 10)) {
@@ -1353,6 +1365,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1353 vol->direct_io = 1; 1365 vol->direct_io = 1;
1354 } else if (strnicmp(data, "forcedirectio", 13) == 0) { 1366 } else if (strnicmp(data, "forcedirectio", 13) == 0) {
1355 vol->direct_io = 1; 1367 vol->direct_io = 1;
1368 } else if (strnicmp(data, "strictcache", 11) == 0) {
1369 vol->strict_io = 1;
1356 } else if (strnicmp(data, "noac", 4) == 0) { 1370 } else if (strnicmp(data, "noac", 4) == 0) {
1357 printk(KERN_WARNING "CIFS: Mount option noac not " 1371 printk(KERN_WARNING "CIFS: Mount option noac not "
1358 "supported. Instead set " 1372 "supported. Instead set "
@@ -1577,6 +1591,9 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1577 1591
1578 spin_lock(&cifs_tcp_ses_lock); 1592 spin_lock(&cifs_tcp_ses_lock);
1579 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { 1593 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
1594 if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
1595 continue;
1596
1580 if (!match_address(server, addr, 1597 if (!match_address(server, addr,
1581 (struct sockaddr *)&vol->srcaddr)) 1598 (struct sockaddr *)&vol->srcaddr))
1582 continue; 1599 continue;
@@ -1607,9 +1624,13 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1607 return; 1624 return;
1608 } 1625 }
1609 1626
1627 put_net(cifs_net_ns(server));
1628
1610 list_del_init(&server->tcp_ses_list); 1629 list_del_init(&server->tcp_ses_list);
1611 spin_unlock(&cifs_tcp_ses_lock); 1630 spin_unlock(&cifs_tcp_ses_lock);
1612 1631
1632 cancel_delayed_work_sync(&server->echo);
1633
1613 spin_lock(&GlobalMid_Lock); 1634 spin_lock(&GlobalMid_Lock);
1614 server->tcpStatus = CifsExiting; 1635 server->tcpStatus = CifsExiting;
1615 spin_unlock(&GlobalMid_Lock); 1636 spin_unlock(&GlobalMid_Lock);
@@ -1679,6 +1700,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1679 goto out_err; 1700 goto out_err;
1680 } 1701 }
1681 1702
1703 cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
1682 tcp_ses->hostname = extract_hostname(volume_info->UNC); 1704 tcp_ses->hostname = extract_hostname(volume_info->UNC);
1683 if (IS_ERR(tcp_ses->hostname)) { 1705 if (IS_ERR(tcp_ses->hostname)) {
1684 rc = PTR_ERR(tcp_ses->hostname); 1706 rc = PTR_ERR(tcp_ses->hostname);
@@ -1699,8 +1721,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1699 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); 1721 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1700 tcp_ses->session_estab = false; 1722 tcp_ses->session_estab = false;
1701 tcp_ses->sequence_number = 0; 1723 tcp_ses->sequence_number = 0;
1724 tcp_ses->lstrp = jiffies;
1702 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); 1725 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
1703 INIT_LIST_HEAD(&tcp_ses->smb_ses_list); 1726 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
1727 INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
1704 1728
1705 /* 1729 /*
1706 * at this point we are the only ones with the pointer 1730 * at this point we are the only ones with the pointer
@@ -1749,11 +1773,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1749 1773
1750 cifs_fscache_get_client_cookie(tcp_ses); 1774 cifs_fscache_get_client_cookie(tcp_ses);
1751 1775
1776 /* queue echo request delayed work */
1777 queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
1778
1752 return tcp_ses; 1779 return tcp_ses;
1753 1780
1754out_err_crypto_release: 1781out_err_crypto_release:
1755 cifs_crypto_shash_release(tcp_ses); 1782 cifs_crypto_shash_release(tcp_ses);
1756 1783
1784 put_net(cifs_net_ns(tcp_ses));
1785
1757out_err: 1786out_err:
1758 if (tcp_ses) { 1787 if (tcp_ses) {
1759 if (!IS_ERR(tcp_ses->hostname)) 1788 if (!IS_ERR(tcp_ses->hostname))
@@ -2265,8 +2294,8 @@ generic_ip_connect(struct TCP_Server_Info *server)
2265 } 2294 }
2266 2295
2267 if (socket == NULL) { 2296 if (socket == NULL) {
2268 rc = sock_create_kern(sfamily, SOCK_STREAM, 2297 rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
2269 IPPROTO_TCP, &socket); 2298 IPPROTO_TCP, &socket, 1);
2270 if (rc < 0) { 2299 if (rc < 0) {
2271 cERROR(1, "Error %d creating socket", rc); 2300 cERROR(1, "Error %d creating socket", rc);
2272 server->ssocket = NULL; 2301 server->ssocket = NULL;
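With sockets now created via __sock_create() in the server's own network namespace rather than the kernel default, that namespace has to stay pinned for the lifetime of the TCP_Server_Info. The reference lifecycle, assembled from the hunks above:

	/* setup (cifs_get_tcp_session):
	 *	cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
	 * teardown (cifs_put_tcp_session) and the setup error path:
	 *	put_net(cifs_net_ns(server));
	 */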
@@ -2578,6 +2607,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2578 if (pvolume_info->multiuser) 2607 if (pvolume_info->multiuser)
2579 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | 2608 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
2580 CIFS_MOUNT_NO_PERM); 2609 CIFS_MOUNT_NO_PERM);
2610 if (pvolume_info->strict_io)
2611 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
2581 if (pvolume_info->direct_io) { 2612 if (pvolume_info->direct_io) {
2582 cFYI(1, "mounting share using direct i/o"); 2613 cFYI(1, "mounting share using direct i/o");
2583 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2614 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
@@ -2934,8 +2965,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2934 TCONX_RSP *pSMBr; 2965 TCONX_RSP *pSMBr;
2935 unsigned char *bcc_ptr; 2966 unsigned char *bcc_ptr;
2936 int rc = 0; 2967 int rc = 0;
2937 int length, bytes_left; 2968 int length;
2938 __u16 count; 2969 __u16 bytes_left, count;
2939 2970
2940 if (ses == NULL) 2971 if (ses == NULL)
2941 return -EIO; 2972 return -EIO;
@@ -2963,7 +2994,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2963 bcc_ptr++; /* skip password */ 2994 bcc_ptr++; /* skip password */
2964 /* already aligned so no need to do it below */ 2995 /* already aligned so no need to do it below */
2965 } else { 2996 } else {
2966 pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 2997 pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
2967 /* BB FIXME add code to fail this if NTLMv2 or Kerberos 2998 /* BB FIXME add code to fail this if NTLMv2 or Kerberos
2968 specified as required (when that support is added to 2999 specified as required (when that support is added to
2969 the vfs in the future) as only NTLM or the much 3000 the vfs in the future) as only NTLM or the much
@@ -2979,9 +3010,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2979 bcc_ptr); 3010 bcc_ptr);
2980 else 3011 else
2981#endif /* CIFS_WEAK_PW_HASH */ 3012#endif /* CIFS_WEAK_PW_HASH */
2982 SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); 3013 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
3014 bcc_ptr);
2983 3015
2984 bcc_ptr += CIFS_SESS_KEY_SIZE; 3016 bcc_ptr += CIFS_AUTH_RESP_SIZE;
2985 if (ses->capabilities & CAP_UNICODE) { 3017 if (ses->capabilities & CAP_UNICODE) {
2986 /* must align unicode strings */ 3018 /* must align unicode strings */
2987 *bcc_ptr = 0; /* null byte password */ 3019 *bcc_ptr = 0; /* null byte password */
@@ -3019,7 +3051,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3019 pSMB->ByteCount = cpu_to_le16(count); 3051 pSMB->ByteCount = cpu_to_le16(count);
3020 3052
3021 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, 3053 rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
3022 CIFS_STD_OP); 3054 0);
3023 3055
3024 /* above now done in SendReceive */ 3056 /* above now done in SendReceive */
3025 if ((rc == 0) && (tcon != NULL)) { 3057 if ((rc == 0) && (tcon != NULL)) {
@@ -3029,7 +3061,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3029 tcon->need_reconnect = false; 3061 tcon->need_reconnect = false;
3030 tcon->tid = smb_buffer_response->Tid; 3062 tcon->tid = smb_buffer_response->Tid;
3031 bcc_ptr = pByteArea(smb_buffer_response); 3063 bcc_ptr = pByteArea(smb_buffer_response);
3032 bytes_left = BCC(smb_buffer_response); 3064 bytes_left = get_bcc(smb_buffer_response);
3033 length = strnlen(bcc_ptr, bytes_left - 2); 3065 length = strnlen(bcc_ptr, bytes_left - 2);
3034 if (smb_buffer->Flags2 & SMBFLG2_UNICODE) 3066 if (smb_buffer->Flags2 & SMBFLG2_UNICODE)
3035 is_unicode = true; 3067 is_unicode = true;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 1e95dd635632..dd5f22918c33 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -675,6 +675,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
675 675
676const struct dentry_operations cifs_dentry_ops = { 676const struct dentry_operations cifs_dentry_ops = {
677 .d_revalidate = cifs_d_revalidate, 677 .d_revalidate = cifs_d_revalidate,
678 .d_automount = cifs_dfs_d_automount,
678/* d_delete: cifs_d_delete, */ /* not needed except for debugging */ 679/* d_delete: cifs_d_delete, */ /* not needed except for debugging */
679}; 680};
680 681
@@ -711,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = {
711 .d_revalidate = cifs_d_revalidate, 712 .d_revalidate = cifs_d_revalidate,
712 .d_hash = cifs_ci_hash, 713 .d_hash = cifs_ci_hash,
713 .d_compare = cifs_ci_compare, 714 .d_compare = cifs_ci_compare,
715 .d_automount = cifs_dfs_d_automount,
714}; 716};
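Both dentry-operations tables gain the new VFS d_automount hook, which supersedes the old follow-link style DFS traversal. A sketch of the hook as the 2.6.38-era VFS defines it; the CIFS implementation lives in cifs_dfs_ref.c per the diffstat:

	/* Called by the path walker when it crosses a dentry whose
	 * inode carries S_AUTOMOUNT; returns the vfsmount to splice
	 * in, here the resolved DFS referral. */
	struct vfsmount *cifs_dfs_d_automount(struct path *path);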
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d843631c028d..e964b1cd5dd0 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -287,6 +287,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
287 struct inode *inode = cifs_file->dentry->d_inode; 287 struct inode *inode = cifs_file->dentry->d_inode;
288 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); 288 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
289 struct cifsInodeInfo *cifsi = CIFS_I(inode); 289 struct cifsInodeInfo *cifsi = CIFS_I(inode);
290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
290 struct cifsLockInfo *li, *tmp; 291 struct cifsLockInfo *li, *tmp;
291 292
292 spin_lock(&cifs_file_list_lock); 293 spin_lock(&cifs_file_list_lock);
@@ -302,6 +303,13 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
302 if (list_empty(&cifsi->openFileList)) { 303 if (list_empty(&cifsi->openFileList)) {
303 cFYI(1, "closing last open instance for inode %p", 304 cFYI(1, "closing last open instance for inode %p",
304 cifs_file->dentry->d_inode); 305 cifs_file->dentry->d_inode);
306
 307 /* in strict cache mode we need to invalidate the mapping on the
 308 last close because it may cause an error when we open this file
 309 again and get at least a level II oplock */
310 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
311 CIFS_I(inode)->invalid_mapping = true;
312
305 cifs_set_oplock_level(cifsi, 0); 313 cifs_set_oplock_level(cifsi, 0);
306 } 314 }
307 spin_unlock(&cifs_file_list_lock); 315 spin_unlock(&cifs_file_list_lock);
@@ -338,7 +346,6 @@ int cifs_open(struct inode *inode, struct file *file)
338 struct cifsTconInfo *tcon; 346 struct cifsTconInfo *tcon;
339 struct tcon_link *tlink; 347 struct tcon_link *tlink;
340 struct cifsFileInfo *pCifsFile = NULL; 348 struct cifsFileInfo *pCifsFile = NULL;
341 struct cifsInodeInfo *pCifsInode;
342 char *full_path = NULL; 349 char *full_path = NULL;
343 bool posix_open_ok = false; 350 bool posix_open_ok = false;
344 __u16 netfid; 351 __u16 netfid;
@@ -353,8 +360,6 @@ int cifs_open(struct inode *inode, struct file *file)
353 } 360 }
354 tcon = tlink_tcon(tlink); 361 tcon = tlink_tcon(tlink);
355 362
356 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
357
358 full_path = build_path_from_dentry(file->f_path.dentry); 363 full_path = build_path_from_dentry(file->f_path.dentry);
359 if (full_path == NULL) { 364 if (full_path == NULL) {
360 rc = -ENOMEM; 365 rc = -ENOMEM;
@@ -726,12 +731,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
726 731
727 /* BB we could chain these into one lock request BB */ 732 /* BB we could chain these into one lock request BB */
728 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 733 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
729 0, 1, lockType, 0 /* wait flag */ ); 734 0, 1, lockType, 0 /* wait flag */, 0);
730 if (rc == 0) { 735 if (rc == 0) {
731 rc = CIFSSMBLock(xid, tcon, netfid, length, 736 rc = CIFSSMBLock(xid, tcon, netfid, length,
732 pfLock->fl_start, 1 /* numUnlock */ , 737 pfLock->fl_start, 1 /* numUnlock */ ,
733 0 /* numLock */ , lockType, 738 0 /* numLock */ , lockType,
734 0 /* wait flag */ ); 739 0 /* wait flag */, 0);
735 pfLock->fl_type = F_UNLCK; 740 pfLock->fl_type = F_UNLCK;
736 if (rc != 0) 741 if (rc != 0)
737 cERROR(1, "Error unlocking previously locked " 742 cERROR(1, "Error unlocking previously locked "
@@ -748,13 +753,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
748 rc = CIFSSMBLock(xid, tcon, netfid, length, 753 rc = CIFSSMBLock(xid, tcon, netfid, length,
749 pfLock->fl_start, 0, 1, 754 pfLock->fl_start, 0, 1,
750 lockType | LOCKING_ANDX_SHARED_LOCK, 755 lockType | LOCKING_ANDX_SHARED_LOCK,
751 0 /* wait flag */); 756 0 /* wait flag */, 0);
752 if (rc == 0) { 757 if (rc == 0) {
753 rc = CIFSSMBLock(xid, tcon, netfid, 758 rc = CIFSSMBLock(xid, tcon, netfid,
754 length, pfLock->fl_start, 1, 0, 759 length, pfLock->fl_start, 1, 0,
755 lockType | 760 lockType |
756 LOCKING_ANDX_SHARED_LOCK, 761 LOCKING_ANDX_SHARED_LOCK,
757 0 /* wait flag */); 762 0 /* wait flag */, 0);
758 pfLock->fl_type = F_RDLCK; 763 pfLock->fl_type = F_RDLCK;
759 if (rc != 0) 764 if (rc != 0)
760 cERROR(1, "Error unlocking " 765 cERROR(1, "Error unlocking "
@@ -797,8 +802,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
797 802
798 if (numLock) { 803 if (numLock) {
799 rc = CIFSSMBLock(xid, tcon, netfid, length, 804 rc = CIFSSMBLock(xid, tcon, netfid, length,
800 pfLock->fl_start, 805 pfLock->fl_start, 0, numLock, lockType,
801 0, numLock, lockType, wait_flag); 806 wait_flag, 0);
802 807
803 if (rc == 0) { 808 if (rc == 0) {
804 /* For Windows locks we must store them. */ 809 /* For Windows locks we must store them. */
@@ -818,9 +823,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
818 (pfLock->fl_start + length) >= 823 (pfLock->fl_start + length) >=
819 (li->offset + li->length)) { 824 (li->offset + li->length)) {
820 stored_rc = CIFSSMBLock(xid, tcon, 825 stored_rc = CIFSSMBLock(xid, tcon,
821 netfid, 826 netfid, li->length,
822 li->length, li->offset, 827 li->offset, 1, 0,
823 1, 0, li->type, false); 828 li->type, false, 0);
824 if (stored_rc) 829 if (stored_rc)
825 rc = stored_rc; 830 rc = stored_rc;
826 else { 831 else {
@@ -839,31 +844,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
839 return rc; 844 return rc;
840} 845}
841 846
842/*
843 * Set the timeout on write requests past EOF. For some servers (Windows)
844 * these calls can be very long.
845 *
846 * If we're writing >10M past the EOF we give a 180s timeout. Anything less
847 * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
848 * The 10M cutoff is totally arbitrary. A better scheme for this would be
849 * welcome if someone wants to suggest one.
850 *
851 * We may be able to do a better job with this if there were some way to
852 * declare that a file should be sparse.
853 */
854static int
855cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
856{
857 if (offset <= cifsi->server_eof)
858 return CIFS_STD_OP;
859 else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
860 return CIFS_VLONG_OP;
861 else
862 return CIFS_LONG_OP;
863}
864
865/* update the file size (if needed) after a write */ 847/* update the file size (if needed) after a write */
866static void 848void
867cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, 849cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
868 unsigned int bytes_written) 850 unsigned int bytes_written)
869{ 851{
@@ -882,7 +864,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
882 unsigned int total_written; 864 unsigned int total_written;
883 struct cifs_sb_info *cifs_sb; 865 struct cifs_sb_info *cifs_sb;
884 struct cifsTconInfo *pTcon; 866 struct cifsTconInfo *pTcon;
885 int xid, long_op; 867 int xid;
886 struct cifsFileInfo *open_file; 868 struct cifsFileInfo *open_file;
887 struct cifsInodeInfo *cifsi = CIFS_I(inode); 869 struct cifsInodeInfo *cifsi = CIFS_I(inode);
888 870
@@ -903,7 +885,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
903 885
904 xid = GetXid(); 886 xid = GetXid();
905 887
906 long_op = cifs_write_timeout(cifsi, *poffset);
907 for (total_written = 0; write_size > total_written; 888 for (total_written = 0; write_size > total_written;
908 total_written += bytes_written) { 889 total_written += bytes_written) {
909 rc = -EAGAIN; 890 rc = -EAGAIN;
@@ -931,7 +912,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
931 min_t(const int, cifs_sb->wsize, 912 min_t(const int, cifs_sb->wsize,
932 write_size - total_written), 913 write_size - total_written),
933 *poffset, &bytes_written, 914 *poffset, &bytes_written,
934 NULL, write_data + total_written, long_op); 915 NULL, write_data + total_written, 0);
935 } 916 }
936 if (rc || (bytes_written == 0)) { 917 if (rc || (bytes_written == 0)) {
937 if (total_written) 918 if (total_written)
@@ -944,8 +925,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
944 cifs_update_eof(cifsi, *poffset, bytes_written); 925 cifs_update_eof(cifsi, *poffset, bytes_written);
945 *poffset += bytes_written; 926 *poffset += bytes_written;
946 } 927 }
947 long_op = CIFS_STD_OP; /* subsequent writes fast -
948 15 seconds is plenty */
949 } 928 }
950 929
951 cifs_stats_bytes_written(pTcon, total_written); 930 cifs_stats_bytes_written(pTcon, total_written);
@@ -974,7 +953,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
974 unsigned int total_written; 953 unsigned int total_written;
975 struct cifs_sb_info *cifs_sb; 954 struct cifs_sb_info *cifs_sb;
976 struct cifsTconInfo *pTcon; 955 struct cifsTconInfo *pTcon;
977 int xid, long_op; 956 int xid;
978 struct dentry *dentry = open_file->dentry; 957 struct dentry *dentry = open_file->dentry;
979 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); 958 struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
980 959
@@ -987,7 +966,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
987 966
988 xid = GetXid(); 967 xid = GetXid();
989 968
990 long_op = cifs_write_timeout(cifsi, *poffset);
991 for (total_written = 0; write_size > total_written; 969 for (total_written = 0; write_size > total_written;
992 total_written += bytes_written) { 970 total_written += bytes_written) {
993 rc = -EAGAIN; 971 rc = -EAGAIN;
@@ -1017,7 +995,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1017 rc = CIFSSMBWrite2(xid, pTcon, 995 rc = CIFSSMBWrite2(xid, pTcon,
1018 open_file->netfid, len, 996 open_file->netfid, len,
1019 *poffset, &bytes_written, 997 *poffset, &bytes_written,
1020 iov, 1, long_op); 998 iov, 1, 0);
1021 } else 999 } else
1022 rc = CIFSSMBWrite(xid, pTcon, 1000 rc = CIFSSMBWrite(xid, pTcon,
1023 open_file->netfid, 1001 open_file->netfid,
@@ -1025,7 +1003,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1025 write_size - total_written), 1003 write_size - total_written),
1026 *poffset, &bytes_written, 1004 *poffset, &bytes_written,
1027 write_data + total_written, 1005 write_data + total_written,
1028 NULL, long_op); 1006 NULL, 0);
1029 } 1007 }
1030 if (rc || (bytes_written == 0)) { 1008 if (rc || (bytes_written == 0)) {
1031 if (total_written) 1009 if (total_written)
@@ -1038,8 +1016,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1038 cifs_update_eof(cifsi, *poffset, bytes_written); 1016 cifs_update_eof(cifsi, *poffset, bytes_written);
1039 *poffset += bytes_written; 1017 *poffset += bytes_written;
1040 } 1018 }
1041 long_op = CIFS_STD_OP; /* subsequent writes fast -
1042 15 seconds is plenty */
1043 } 1019 }
1044 1020
1045 cifs_stats_bytes_written(pTcon, total_written); 1021 cifs_stats_bytes_written(pTcon, total_written);
@@ -1167,7 +1143,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1167 char *write_data; 1143 char *write_data;
1168 int rc = -EFAULT; 1144 int rc = -EFAULT;
1169 int bytes_written = 0; 1145 int bytes_written = 0;
1170 struct cifs_sb_info *cifs_sb;
1171 struct inode *inode; 1146 struct inode *inode;
1172 struct cifsFileInfo *open_file; 1147 struct cifsFileInfo *open_file;
1173 1148
@@ -1175,7 +1150,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1175 return -EFAULT; 1150 return -EFAULT;
1176 1151
1177 inode = page->mapping->host; 1152 inode = page->mapping->host;
1178 cifs_sb = CIFS_SB(inode->i_sb);
1179 1153
1180 offset += (loff_t)from; 1154 offset += (loff_t)from;
1181 write_data = kmap(page); 1155 write_data = kmap(page);
@@ -1239,7 +1213,7 @@ static int cifs_writepages(struct address_space *mapping,
1239 struct pagevec pvec; 1213 struct pagevec pvec;
1240 int rc = 0; 1214 int rc = 0;
1241 int scanned = 0; 1215 int scanned = 0;
1242 int xid, long_op; 1216 int xid;
1243 1217
1244 cifs_sb = CIFS_SB(mapping->host->i_sb); 1218 cifs_sb = CIFS_SB(mapping->host->i_sb);
1245 1219
@@ -1377,43 +1351,67 @@ retry:
1377 break; 1351 break;
1378 } 1352 }
1379 if (n_iov) { 1353 if (n_iov) {
1354retry_write:
1380 open_file = find_writable_file(CIFS_I(mapping->host), 1355 open_file = find_writable_file(CIFS_I(mapping->host),
1381 false); 1356 false);
1382 if (!open_file) { 1357 if (!open_file) {
1383 cERROR(1, "No writable handles for inode"); 1358 cERROR(1, "No writable handles for inode");
1384 rc = -EBADF; 1359 rc = -EBADF;
1385 } else { 1360 } else {
1386 long_op = cifs_write_timeout(cifsi, offset);
1387 rc = CIFSSMBWrite2(xid, tcon, open_file->netfid, 1361 rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
1388 bytes_to_write, offset, 1362 bytes_to_write, offset,
1389 &bytes_written, iov, n_iov, 1363 &bytes_written, iov, n_iov,
1390 long_op); 1364 0);
1391 cifsFileInfo_put(open_file); 1365 cifsFileInfo_put(open_file);
1392 cifs_update_eof(cifsi, offset, bytes_written);
1393 } 1366 }
1394 1367
1395 if (rc || bytes_written < bytes_to_write) { 1368 cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
1396 cERROR(1, "Write2 ret %d, wrote %d", 1369
1397 rc, bytes_written); 1370 /*
1398 mapping_set_error(mapping, rc); 1371 * For now, treat a short write as if nothing got
 1399 } else { 1372 * written. A zero-length write, however, indicates
1373 * ENOSPC or EFBIG. We have no way to know which
1374 * though, so call it ENOSPC for now. EFBIG would
1375 * get translated to AS_EIO anyway.
1376 *
1377 * FIXME: make it take into account the data that did
1378 * get written
1379 */
1380 if (rc == 0) {
1381 if (bytes_written == 0)
1382 rc = -ENOSPC;
1383 else if (bytes_written < bytes_to_write)
1384 rc = -EAGAIN;
1385 }
1386
1387 /* retry on data-integrity flush */
1388 if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
1389 goto retry_write;
1390
1391 /* fix the stats and EOF */
1392 if (bytes_written > 0) {
1400 cifs_stats_bytes_written(tcon, bytes_written); 1393 cifs_stats_bytes_written(tcon, bytes_written);
1394 cifs_update_eof(cifsi, offset, bytes_written);
1401 } 1395 }
1402 1396
1403 for (i = 0; i < n_iov; i++) { 1397 for (i = 0; i < n_iov; i++) {
1404 page = pvec.pages[first + i]; 1398 page = pvec.pages[first + i];
1405 /* Should we also set page error on 1399 /* on retryable write error, redirty page */
1406 success rc but too little data written? */ 1400 if (rc == -EAGAIN)
1407 /* BB investigate retry logic on temporary 1401 redirty_page_for_writepage(wbc, page);
1408 server crash cases and how recovery works 1402 else if (rc != 0)
1409 when page marked as error */
1410 if (rc)
1411 SetPageError(page); 1403 SetPageError(page);
1412 kunmap(page); 1404 kunmap(page);
1413 unlock_page(page); 1405 unlock_page(page);
1414 end_page_writeback(page); 1406 end_page_writeback(page);
1415 page_cache_release(page); 1407 page_cache_release(page);
1416 } 1408 }
1409
1410 if (rc != -EAGAIN)
1411 mapping_set_error(mapping, rc);
1412 else
1413 rc = 0;
1414
1417 if ((wbc->nr_to_write -= n_iov) <= 0) 1415 if ((wbc->nr_to_write -= n_iov) <= 0)
1418 done = 1; 1416 done = 1;
1419 index = next; 1417 index = next;
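The rewritten error handling above reduces to a small policy, summarized here directly from the hunk:

	/* rc == 0, full write          -> update stats/EOF, pages complete
	 * rc == 0, bytes_written == 0  -> rc = -ENOSPC (could be EFBIG;
	 *                                 no way to tell, and EFBIG maps
	 *                                 to AS_EIO anyway)
	 * rc == 0, short write         -> rc = -EAGAIN
	 * rc == -EAGAIN + WB_SYNC_ALL  -> goto retry_write (data integrity)
	 * rc == -EAGAIN otherwise      -> redirty pages, rc reset to 0
	 * any other nonzero rc         -> SetPageError + mapping_set_error */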
@@ -1525,27 +1523,47 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1525 return rc; 1523 return rc;
1526} 1524}
1527 1525
1528int cifs_fsync(struct file *file, int datasync) 1526int cifs_strict_fsync(struct file *file, int datasync)
1529{ 1527{
1530 int xid; 1528 int xid;
1531 int rc = 0; 1529 int rc = 0;
1532 struct cifsTconInfo *tcon; 1530 struct cifsTconInfo *tcon;
1533 struct cifsFileInfo *smbfile = file->private_data; 1531 struct cifsFileInfo *smbfile = file->private_data;
1534 struct inode *inode = file->f_path.dentry->d_inode; 1532 struct inode *inode = file->f_path.dentry->d_inode;
1533 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1535 1534
1536 xid = GetXid(); 1535 xid = GetXid();
1537 1536
1538 cFYI(1, "Sync file - name: %s datasync: 0x%x", 1537 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1539 file->f_path.dentry->d_name.name, datasync); 1538 file->f_path.dentry->d_name.name, datasync);
1540 1539
1541 rc = filemap_write_and_wait(inode->i_mapping); 1540 if (!CIFS_I(inode)->clientCanCacheRead)
1542 if (rc == 0) { 1541 cifs_invalidate_mapping(inode);
1543 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1544 1542
1545 tcon = tlink_tcon(smbfile->tlink); 1543 tcon = tlink_tcon(smbfile->tlink);
1546 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) 1544 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
1547 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); 1545 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
1548 } 1546
1547 FreeXid(xid);
1548 return rc;
1549}
1550
1551int cifs_fsync(struct file *file, int datasync)
1552{
1553 int xid;
1554 int rc = 0;
1555 struct cifsTconInfo *tcon;
1556 struct cifsFileInfo *smbfile = file->private_data;
1557 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1558
1559 xid = GetXid();
1560
1561 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1562 file->f_path.dentry->d_name.name, datasync);
1563
1564 tcon = tlink_tcon(smbfile->tlink);
1565 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
1566 rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
1549 1567
1550 FreeXid(xid); 1568 FreeXid(xid);
1551 return rc; 1569 return rc;
@@ -1596,42 +1614,244 @@ int cifs_flush(struct file *file, fl_owner_t id)
1596 return rc; 1614 return rc;
1597} 1615}
1598 1616
1599ssize_t cifs_user_read(struct file *file, char __user *read_data, 1617static int
1600 size_t read_size, loff_t *poffset) 1618cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
1601{ 1619{
1602 int rc = -EACCES; 1620 int rc = 0;
1621 unsigned long i;
1622
1623 for (i = 0; i < num_pages; i++) {
1624 pages[i] = alloc_page(__GFP_HIGHMEM);
1625 if (!pages[i]) {
1626 /*
 1627 * save the number of pages we have already allocated and
 1628 * return with an ENOMEM error
1629 */
1630 num_pages = i;
1631 rc = -ENOMEM;
1632 goto error;
1633 }
1634 }
1635
1636 return rc;
1637
1638error:
1639 for (i = 0; i < num_pages; i++)
1640 put_page(pages[i]);
1641 return rc;
1642}
1643
1644static inline
1645size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
1646{
1647 size_t num_pages;
1648 size_t clen;
1649
1650 clen = min_t(const size_t, len, wsize);
1651 num_pages = clen / PAGE_CACHE_SIZE;
1652 if (clen % PAGE_CACHE_SIZE)
1653 num_pages++;
1654
1655 if (cur_len)
1656 *cur_len = clen;
1657
1658 return num_pages;
1659}
1660
1661static ssize_t
1662cifs_iovec_write(struct file *file, const struct iovec *iov,
1663 unsigned long nr_segs, loff_t *poffset)
1664{
1665 unsigned int written;
1666 unsigned long num_pages, npages, i;
1667 size_t copied, len, cur_len;
1668 ssize_t total_written = 0;
1669 struct kvec *to_send;
1670 struct page **pages;
1671 struct iov_iter it;
1672 struct inode *inode;
1673 struct cifsFileInfo *open_file;
1674 struct cifsTconInfo *pTcon;
1675 struct cifs_sb_info *cifs_sb;
1676 int xid, rc;
1677
1678 len = iov_length(iov, nr_segs);
1679 if (!len)
1680 return 0;
1681
1682 rc = generic_write_checks(file, poffset, &len, 0);
1683 if (rc)
1684 return rc;
1685
1686 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1687 num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
1688
 1689 pages = kmalloc(sizeof(struct page *)*num_pages, GFP_KERNEL);
1690 if (!pages)
1691 return -ENOMEM;
1692
1693 to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
1694 if (!to_send) {
1695 kfree(pages);
1696 return -ENOMEM;
1697 }
1698
1699 rc = cifs_write_allocate_pages(pages, num_pages);
1700 if (rc) {
1701 kfree(pages);
1702 kfree(to_send);
1703 return rc;
1704 }
1705
1706 xid = GetXid();
1707 open_file = file->private_data;
1708 pTcon = tlink_tcon(open_file->tlink);
1709 inode = file->f_path.dentry->d_inode;
1710
1711 iov_iter_init(&it, iov, nr_segs, len, 0);
1712 npages = num_pages;
1713
1714 do {
1715 size_t save_len = cur_len;
1716 for (i = 0; i < npages; i++) {
1717 copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
1718 copied = iov_iter_copy_from_user(pages[i], &it, 0,
1719 copied);
1720 cur_len -= copied;
1721 iov_iter_advance(&it, copied);
1722 to_send[i+1].iov_base = kmap(pages[i]);
1723 to_send[i+1].iov_len = copied;
1724 }
1725
1726 cur_len = save_len - cur_len;
1727
1728 do {
1729 if (open_file->invalidHandle) {
1730 rc = cifs_reopen_file(open_file, false);
1731 if (rc != 0)
1732 break;
1733 }
1734 rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
1735 cur_len, *poffset, &written,
1736 to_send, npages, 0);
1737 } while (rc == -EAGAIN);
1738
1739 for (i = 0; i < npages; i++)
1740 kunmap(pages[i]);
1741
1742 if (written) {
1743 len -= written;
1744 total_written += written;
1745 cifs_update_eof(CIFS_I(inode), *poffset, written);
1746 *poffset += written;
1747 } else if (rc < 0) {
1748 if (!total_written)
1749 total_written = rc;
1750 break;
1751 }
1752
1753 /* get length and number of kvecs of the next write */
1754 npages = get_numpages(cifs_sb->wsize, len, &cur_len);
1755 } while (len > 0);
1756
1757 if (total_written > 0) {
1758 spin_lock(&inode->i_lock);
1759 if (*poffset > inode->i_size)
1760 i_size_write(inode, *poffset);
1761 spin_unlock(&inode->i_lock);
1762 }
1763
1764 cifs_stats_bytes_written(pTcon, total_written);
1765 mark_inode_dirty_sync(inode);
1766
1767 for (i = 0; i < num_pages; i++)
1768 put_page(pages[i]);
1769 kfree(to_send);
1770 kfree(pages);
1771 FreeXid(xid);
1772 return total_written;
1773}
1774
1775static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
1776 unsigned long nr_segs, loff_t pos)
1777{
1778 ssize_t written;
1779 struct inode *inode;
1780
1781 inode = iocb->ki_filp->f_path.dentry->d_inode;
1782
1783 /*
 1784 * BB - optimize the path when signing is disabled: we could drop this
 1785 * extra memory-to-memory copy and build the write request directly
 1786 * from the iovec buffers.
1787 */
1788
1789 written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
1790 if (written > 0) {
1791 CIFS_I(inode)->invalid_mapping = true;
1792 iocb->ki_pos = pos;
1793 }
1794
1795 return written;
1796}
1797
1798ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
1799 unsigned long nr_segs, loff_t pos)
1800{
1801 struct inode *inode;
1802
1803 inode = iocb->ki_filp->f_path.dentry->d_inode;
1804
1805 if (CIFS_I(inode)->clientCanCacheAll)
1806 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1807
1808 /*
 1809 * In strict cache mode we need to write the data to the server exactly
 1810 * from pos to pos+len-1 rather than flush all affected pages, because
 1811 * a flush may fail due to mandatory locks held on those pages even
 1812 * when the region from pos to pos+len-1 itself is not locked.
1813 */
1814
1815 return cifs_user_writev(iocb, iov, nr_segs, pos);
1816}
1817
1818static ssize_t
1819cifs_iovec_read(struct file *file, const struct iovec *iov,
1820 unsigned long nr_segs, loff_t *poffset)
1821{
1822 int rc;
1823 int xid;
1824 ssize_t total_read;
1603 unsigned int bytes_read = 0; 1825 unsigned int bytes_read = 0;
1604 unsigned int total_read = 0; 1826 size_t len, cur_len;
1605 unsigned int current_read_size; 1827 int iov_offset = 0;
1606 struct cifs_sb_info *cifs_sb; 1828 struct cifs_sb_info *cifs_sb;
1607 struct cifsTconInfo *pTcon; 1829 struct cifsTconInfo *pTcon;
1608 int xid;
1609 struct cifsFileInfo *open_file; 1830 struct cifsFileInfo *open_file;
1610 char *smb_read_data;
1611 char __user *current_offset;
1612 struct smb_com_read_rsp *pSMBr; 1831 struct smb_com_read_rsp *pSMBr;
1832 char *read_data;
1833
1834 if (!nr_segs)
1835 return 0;
1836
1837 len = iov_length(iov, nr_segs);
1838 if (!len)
1839 return 0;
1613 1840
1614 xid = GetXid(); 1841 xid = GetXid();
1615 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1842 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1616 1843
1617 if (file->private_data == NULL) {
1618 rc = -EBADF;
1619 FreeXid(xid);
1620 return rc;
1621 }
1622 open_file = file->private_data; 1844 open_file = file->private_data;
1623 pTcon = tlink_tcon(open_file->tlink); 1845 pTcon = tlink_tcon(open_file->tlink);
1624 1846
1625 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1847 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1626 cFYI(1, "attempting read on write only file instance"); 1848 cFYI(1, "attempting read on write only file instance");
1627 1849
1628 for (total_read = 0, current_offset = read_data; 1850 for (total_read = 0; total_read < len; total_read += bytes_read) {
1629 read_size > total_read; 1851 cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
1630 total_read += bytes_read, current_offset += bytes_read) {
1631 current_read_size = min_t(const int, read_size - total_read,
1632 cifs_sb->rsize);
1633 rc = -EAGAIN; 1852 rc = -EAGAIN;
1634 smb_read_data = NULL; 1853 read_data = NULL;
1854
1635 while (rc == -EAGAIN) { 1855 while (rc == -EAGAIN) {
1636 int buf_type = CIFS_NO_BUFFER; 1856 int buf_type = CIFS_NO_BUFFER;
1637 if (open_file->invalidHandle) { 1857 if (open_file->invalidHandle) {
@@ -1639,27 +1859,25 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1639 if (rc != 0) 1859 if (rc != 0)
1640 break; 1860 break;
1641 } 1861 }
1642 rc = CIFSSMBRead(xid, pTcon, 1862 rc = CIFSSMBRead(xid, pTcon, open_file->netfid,
1643 open_file->netfid, 1863 cur_len, *poffset, &bytes_read,
1644 current_read_size, *poffset, 1864 &read_data, &buf_type);
1645 &bytes_read, &smb_read_data, 1865 pSMBr = (struct smb_com_read_rsp *)read_data;
1646 &buf_type); 1866 if (read_data) {
1647 pSMBr = (struct smb_com_read_rsp *)smb_read_data; 1867 char *data_offset = read_data + 4 +
1648 if (smb_read_data) { 1868 le16_to_cpu(pSMBr->DataOffset);
1649 if (copy_to_user(current_offset, 1869 if (memcpy_toiovecend(iov, data_offset,
1650 smb_read_data + 1870 iov_offset, bytes_read))
1651 4 /* RFC1001 length field */ +
1652 le16_to_cpu(pSMBr->DataOffset),
1653 bytes_read))
1654 rc = -EFAULT; 1871 rc = -EFAULT;
1655
1656 if (buf_type == CIFS_SMALL_BUFFER) 1872 if (buf_type == CIFS_SMALL_BUFFER)
1657 cifs_small_buf_release(smb_read_data); 1873 cifs_small_buf_release(read_data);
1658 else if (buf_type == CIFS_LARGE_BUFFER) 1874 else if (buf_type == CIFS_LARGE_BUFFER)
1659 cifs_buf_release(smb_read_data); 1875 cifs_buf_release(read_data);
1660 smb_read_data = NULL; 1876 read_data = NULL;
1877 iov_offset += bytes_read;
1661 } 1878 }
1662 } 1879 }
1880
1663 if (rc || (bytes_read == 0)) { 1881 if (rc || (bytes_read == 0)) {
1664 if (total_read) { 1882 if (total_read) {
1665 break; 1883 break;
@@ -1672,13 +1890,57 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1672 *poffset += bytes_read; 1890 *poffset += bytes_read;
1673 } 1891 }
1674 } 1892 }
1893
1675 FreeXid(xid); 1894 FreeXid(xid);
1676 return total_read; 1895 return total_read;
1677} 1896}
1678 1897
1898ssize_t cifs_user_read(struct file *file, char __user *read_data,
1899 size_t read_size, loff_t *poffset)
1900{
1901 struct iovec iov;
1902 iov.iov_base = read_data;
1903 iov.iov_len = read_size;
1904
1905 return cifs_iovec_read(file, &iov, 1, poffset);
1906}
1907
1908static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
1909 unsigned long nr_segs, loff_t pos)
1910{
1911 ssize_t read;
1912
1913 read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
1914 if (read > 0)
1915 iocb->ki_pos = pos;
1916
1917 return read;
1918}
1919
1920ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
1921 unsigned long nr_segs, loff_t pos)
1922{
1923 struct inode *inode;
1924
1925 inode = iocb->ki_filp->f_path.dentry->d_inode;
1926
1927 if (CIFS_I(inode)->clientCanCacheRead)
1928 return generic_file_aio_read(iocb, iov, nr_segs, pos);
1929
1930 /*
 1931 * In strict cache mode we must read from the server every time unless
 1932 * we hold a level II oplock, because the server can delay the mtime
 1933 * change and so we cannot decide whether to invalidate the inode's
 1934 * pages. Page reads can also fail when mandatory locks cover pages
 1935 * touched by this read but not the region from pos to
 1936 * pos+len-1 itself.
1937 */
1938
1939 return cifs_user_readv(iocb, iov, nr_segs, pos);
1940}
1679 1941
1680static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, 1942static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1681 loff_t *poffset) 1943 loff_t *poffset)
1682{ 1944{
1683 int rc = -EACCES; 1945 int rc = -EACCES;
1684 unsigned int bytes_read = 0; 1946 unsigned int bytes_read = 0;
@@ -1746,6 +2008,21 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1746 return total_read; 2008 return total_read;
1747} 2009}
1748 2010
2011int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
2012{
2013 int rc, xid;
2014 struct inode *inode = file->f_path.dentry->d_inode;
2015
2016 xid = GetXid();
2017
2018 if (!CIFS_I(inode)->clientCanCacheRead)
2019 cifs_invalidate_mapping(inode);
2020
2021 rc = generic_file_mmap(file, vma);
2022 FreeXid(xid);
2023 return rc;
2024}
2025
1749int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) 2026int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1750{ 2027{
1751 int rc, xid; 2028 int rc, xid;
@@ -2192,7 +2469,8 @@ void cifs_oplock_break(struct work_struct *work)
2192 */ 2469 */
2193 if (!cfile->oplock_break_cancelled) { 2470 if (!cfile->oplock_break_cancelled) {
2194 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, 2471 rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
2195 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
2472 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
2473 cinode->clientCanCacheRead ? 1 : 0);
2196 cFYI(1, "Oplock release rc = %d", rc); 2474 cFYI(1, "Oplock release rc = %d", rc);
2197 } 2475 }
2198 2476
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b06b60620240..8852470b4fbb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -32,7 +32,7 @@
32#include "fscache.h" 32#include "fscache.h"
33 33
34 34
35static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
35static void cifs_set_ops(struct inode *inode)
36{ 36{
37 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 37 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
38 38
@@ -44,13 +44,17 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
44 inode->i_fop = &cifs_file_direct_nobrl_ops; 44 inode->i_fop = &cifs_file_direct_nobrl_ops;
45 else 45 else
46 inode->i_fop = &cifs_file_direct_ops; 46 inode->i_fop = &cifs_file_direct_ops;
47 } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
48 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
49 inode->i_fop = &cifs_file_strict_nobrl_ops;
50 else
51 inode->i_fop = &cifs_file_strict_ops;
47 } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) 52 } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
48 inode->i_fop = &cifs_file_nobrl_ops; 53 inode->i_fop = &cifs_file_nobrl_ops;
49 else { /* not direct, send byte range locks */ 54 else { /* not direct, send byte range locks */
50 inode->i_fop = &cifs_file_ops; 55 inode->i_fop = &cifs_file_ops;
51 } 56 }
52 57
53
54 /* check if server can support readpages */ 58 /* check if server can support readpages */
55 if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf < 59 if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
56 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) 60 PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
@@ -60,7 +64,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
60 break; 64 break;
61 case S_IFDIR: 65 case S_IFDIR:
62#ifdef CONFIG_CIFS_DFS_UPCALL 66#ifdef CONFIG_CIFS_DFS_UPCALL
63 if (is_dfs_referral) {
67 if (IS_AUTOMOUNT(inode)) {
64 inode->i_op = &cifs_dfs_referral_inode_operations; 68 inode->i_op = &cifs_dfs_referral_inode_operations;
65 } else { 69 } else {
66#else /* NO DFS support, treat as a directory */ 70#else /* NO DFS support, treat as a directory */
@@ -167,7 +171,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
167 } 171 }
168 spin_unlock(&inode->i_lock); 172 spin_unlock(&inode->i_lock);
169 173
170 cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
174 if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
175 inode->i_flags |= S_AUTOMOUNT;
176 cifs_set_ops(inode);
171} 177}
172 178
173void 179void
@@ -1677,7 +1683,7 @@ cifs_inode_needs_reval(struct inode *inode)
1677/* 1683/*
1678 * Zap the cache. Called when invalid_mapping flag is set. 1684 * Zap the cache. Called when invalid_mapping flag is set.
1679 */ 1685 */
1680static void
1686void
1681cifs_invalidate_mapping(struct inode *inode) 1687cifs_invalidate_mapping(struct inode *inode)
1682{ 1688{
1683 int rc; 1689 int rc;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 306769de2fb5..e8804d373404 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,7 +28,6 @@
28#include "cifsproto.h" 28#include "cifsproto.h"
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31#include "md5.h"
32 31
33#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) 32#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
34#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) 33#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
@@ -47,6 +46,45 @@
47 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15] 46 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
48 47
49static int 48static int
49symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
50{
51 int rc;
52 unsigned int size;
53 struct crypto_shash *md5;
54 struct sdesc *sdescmd5;
55
56 md5 = crypto_alloc_shash("md5", 0, 0);
57 if (IS_ERR(md5)) {
58 rc = PTR_ERR(md5);
59 cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
60 return rc;
61 }
62 size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
63 sdescmd5 = kmalloc(size, GFP_KERNEL);
64 if (!sdescmd5) {
65 rc = -ENOMEM;
66 cERROR(1, "%s: Memory allocation failure\n", __func__);
67 goto symlink_hash_err;
68 }
69 sdescmd5->shash.tfm = md5;
70 sdescmd5->shash.flags = 0x0;
71
72 rc = crypto_shash_init(&sdescmd5->shash);
73 if (rc) {
74 cERROR(1, "%s: Could not init md5 shash\n", __func__);
75 goto symlink_hash_err;
76 }
77 crypto_shash_update(&sdescmd5->shash, link_str, link_len);
78 rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
79
80symlink_hash_err:
81 crypto_free_shash(md5);
82 kfree(sdescmd5);
83
84 return rc;
85}
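
The alloc/init/update/final sequence in symlink_hash() above is the long-hand form of a one-shot digest. A compact equivalent, assuming the kernel's crypto_shash_digest() helper (which wraps init, update and final) and the same sdesc wrapper struct - the name symlink_hash_oneshot is illustrative, the patch itself keeps the explicit three-step form:

	static int
	symlink_hash_oneshot(unsigned int link_len, const char *link_str,
			     u8 *md5_hash)
	{
		int rc;
		unsigned int size;
		struct crypto_shash *md5;
		struct sdesc *sdescmd5;

		md5 = crypto_alloc_shash("md5", 0, 0);
		if (IS_ERR(md5))
			return PTR_ERR(md5);
		size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
		sdescmd5 = kmalloc(size, GFP_KERNEL);
		if (!sdescmd5) {
			crypto_free_shash(md5);
			return -ENOMEM;
		}
		sdescmd5->shash.tfm = md5;
		sdescmd5->shash.flags = 0x0;

		/* one call instead of crypto_shash_init/update/final */
		rc = crypto_shash_digest(&sdescmd5->shash, (const u8 *)link_str,
					 link_len, md5_hash);

		kfree(sdescmd5);
		crypto_free_shash(md5);
		return rc;
	}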
86
87static int
50CIFSParseMFSymlink(const u8 *buf, 88CIFSParseMFSymlink(const u8 *buf,
51 unsigned int buf_len, 89 unsigned int buf_len,
52 unsigned int *_link_len, 90 unsigned int *_link_len,
@@ -56,7 +94,6 @@ CIFSParseMFSymlink(const u8 *buf,
56 unsigned int link_len; 94 unsigned int link_len;
57 const char *md5_str1; 95 const char *md5_str1;
58 const char *link_str; 96 const char *link_str;
59 struct MD5Context md5_ctx;
60 u8 md5_hash[16]; 97 u8 md5_hash[16];
61 char md5_str2[34]; 98 char md5_str2[34];
62 99
@@ -70,9 +107,11 @@ CIFSParseMFSymlink(const u8 *buf,
70 if (rc != 1) 107 if (rc != 1)
71 return -EINVAL; 108 return -EINVAL;
72 109
73 cifs_MD5_init(&md5_ctx);
74 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
75 cifs_MD5_final(md5_hash, &md5_ctx);
110 rc = symlink_hash(link_len, link_str, md5_hash);
111 if (rc) {
112 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
113 return rc;
114 }
76 115
77 snprintf(md5_str2, sizeof(md5_str2), 116 snprintf(md5_str2, sizeof(md5_str2),
78 CIFS_MF_SYMLINK_MD5_FORMAT, 117 CIFS_MF_SYMLINK_MD5_FORMAT,
@@ -94,9 +133,9 @@ CIFSParseMFSymlink(const u8 *buf,
94static int 133static int
95CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) 134CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
96{ 135{
136 int rc;
97 unsigned int link_len; 137 unsigned int link_len;
98 unsigned int ofs; 138 unsigned int ofs;
99 struct MD5Context md5_ctx;
100 u8 md5_hash[16]; 139 u8 md5_hash[16];
101 140
102 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) 141 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
@@ -107,9 +146,11 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
107 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) 146 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
108 return -ENAMETOOLONG; 147 return -ENAMETOOLONG;
109 148
110 cifs_MD5_init(&md5_ctx);
111 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
112 cifs_MD5_final(md5_hash, &md5_ctx);
149 rc = symlink_hash(link_len, link_str, md5_hash);
150 if (rc) {
151 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
152 return rc;
153 }
113 154
114 snprintf(buf, buf_len, 155 snprintf(buf, buf_len,
115 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, 156 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c
deleted file mode 100644
index a725c2609d67..000000000000
--- a/fs/cifs/md4.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 Unix SMB/Netbios implementation.
3 Version 1.9.
4 an implementation of MD4 designed for use in the SMB authentication protocol
5 Copyright (C) Andrew Tridgell 1997-1998.
6 Modified by Steve French (sfrench@us.ibm.com) 2002-2003
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*/
22#include <linux/module.h>
23#include <linux/fs.h>
24#include "cifsencrypt.h"
25
26/* NOTE: This code makes no attempt to be fast! */
27
28static __u32
29F(__u32 X, __u32 Y, __u32 Z)
30{
31 return (X & Y) | ((~X) & Z);
32}
33
34static __u32
35G(__u32 X, __u32 Y, __u32 Z)
36{
37 return (X & Y) | (X & Z) | (Y & Z);
38}
39
40static __u32
41H(__u32 X, __u32 Y, __u32 Z)
42{
43 return X ^ Y ^ Z;
44}
45
46static __u32
47lshift(__u32 x, int s)
48{
49 x &= 0xFFFFFFFF;
50 return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
51}
52
53#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s)
54#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s)
55#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s)
56
57/* this applies md4 to 64 byte chunks */
58static void
59mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D)
60{
61 int j;
62 __u32 AA, BB, CC, DD;
63 __u32 X[16];
64
65
66 for (j = 0; j < 16; j++)
67 X[j] = M[j];
68
69 AA = *A;
70 BB = *B;
71 CC = *C;
72 DD = *D;
73
74 ROUND1(A, B, C, D, 0, 3);
75 ROUND1(D, A, B, C, 1, 7);
76 ROUND1(C, D, A, B, 2, 11);
77 ROUND1(B, C, D, A, 3, 19);
78 ROUND1(A, B, C, D, 4, 3);
79 ROUND1(D, A, B, C, 5, 7);
80 ROUND1(C, D, A, B, 6, 11);
81 ROUND1(B, C, D, A, 7, 19);
82 ROUND1(A, B, C, D, 8, 3);
83 ROUND1(D, A, B, C, 9, 7);
84 ROUND1(C, D, A, B, 10, 11);
85 ROUND1(B, C, D, A, 11, 19);
86 ROUND1(A, B, C, D, 12, 3);
87 ROUND1(D, A, B, C, 13, 7);
88 ROUND1(C, D, A, B, 14, 11);
89 ROUND1(B, C, D, A, 15, 19);
90
91 ROUND2(A, B, C, D, 0, 3);
92 ROUND2(D, A, B, C, 4, 5);
93 ROUND2(C, D, A, B, 8, 9);
94 ROUND2(B, C, D, A, 12, 13);
95 ROUND2(A, B, C, D, 1, 3);
96 ROUND2(D, A, B, C, 5, 5);
97 ROUND2(C, D, A, B, 9, 9);
98 ROUND2(B, C, D, A, 13, 13);
99 ROUND2(A, B, C, D, 2, 3);
100 ROUND2(D, A, B, C, 6, 5);
101 ROUND2(C, D, A, B, 10, 9);
102 ROUND2(B, C, D, A, 14, 13);
103 ROUND2(A, B, C, D, 3, 3);
104 ROUND2(D, A, B, C, 7, 5);
105 ROUND2(C, D, A, B, 11, 9);
106 ROUND2(B, C, D, A, 15, 13);
107
108 ROUND3(A, B, C, D, 0, 3);
109 ROUND3(D, A, B, C, 8, 9);
110 ROUND3(C, D, A, B, 4, 11);
111 ROUND3(B, C, D, A, 12, 15);
112 ROUND3(A, B, C, D, 2, 3);
113 ROUND3(D, A, B, C, 10, 9);
114 ROUND3(C, D, A, B, 6, 11);
115 ROUND3(B, C, D, A, 14, 15);
116 ROUND3(A, B, C, D, 1, 3);
117 ROUND3(D, A, B, C, 9, 9);
118 ROUND3(C, D, A, B, 5, 11);
119 ROUND3(B, C, D, A, 13, 15);
120 ROUND3(A, B, C, D, 3, 3);
121 ROUND3(D, A, B, C, 11, 9);
122 ROUND3(C, D, A, B, 7, 11);
123 ROUND3(B, C, D, A, 15, 15);
124
125 *A += AA;
126 *B += BB;
127 *C += CC;
128 *D += DD;
129
130 *A &= 0xFFFFFFFF;
131 *B &= 0xFFFFFFFF;
132 *C &= 0xFFFFFFFF;
133 *D &= 0xFFFFFFFF;
134
135 for (j = 0; j < 16; j++)
136 X[j] = 0;
137}
138
139static void
140copy64(__u32 *M, unsigned char *in)
141{
142 int i;
143
144 for (i = 0; i < 16; i++)
145 M[i] = (in[i * 4 + 3] << 24) | (in[i * 4 + 2] << 16) |
146 (in[i * 4 + 1] << 8) | (in[i * 4 + 0] << 0);
147}
148
149static void
150copy4(unsigned char *out, __u32 x)
151{
152 out[0] = x & 0xFF;
153 out[1] = (x >> 8) & 0xFF;
154 out[2] = (x >> 16) & 0xFF;
155 out[3] = (x >> 24) & 0xFF;
156}
157
158/* produce a md4 message digest from data of length n bytes */
159void
160mdfour(unsigned char *out, unsigned char *in, int n)
161{
162 unsigned char buf[128];
163 __u32 M[16];
164 __u32 b = n * 8;
165 int i;
166 __u32 A = 0x67452301;
167 __u32 B = 0xefcdab89;
168 __u32 C = 0x98badcfe;
169 __u32 D = 0x10325476;
170
171 while (n > 64) {
172 copy64(M, in);
173 mdfour64(M, &A, &B, &C, &D);
174 in += 64;
175 n -= 64;
176 }
177
178 for (i = 0; i < 128; i++)
179 buf[i] = 0;
180 memcpy(buf, in, n);
181 buf[n] = 0x80;
182
183 if (n <= 55) {
184 copy4(buf + 56, b);
185 copy64(M, buf);
186 mdfour64(M, &A, &B, &C, &D);
187 } else {
188 copy4(buf + 120, b);
189 copy64(M, buf);
190 mdfour64(M, &A, &B, &C, &D);
191 copy64(M, buf + 64);
192 mdfour64(M, &A, &B, &C, &D);
193 }
194
195 for (i = 0; i < 128; i++)
196 buf[i] = 0;
197 copy64(M, buf);
198
199 copy4(out, A);
200 copy4(out + 4, B);
201 copy4(out + 8, C);
202 copy4(out + 12, D);
203
204 A = B = C = D = 0;
205}
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
deleted file mode 100644
index 98b66a54c319..000000000000
--- a/fs/cifs/md5.c
+++ /dev/null
@@ -1,366 +0,0 @@
1/*
2 * This code implements the MD5 message-digest algorithm.
3 * The algorithm is due to Ron Rivest. This code was
4 * written by Colin Plumb in 1993, no copyright is claimed.
5 * This code is in the public domain; do with it what you wish.
6 *
7 * Equivalent code is available from RSA Data Security, Inc.
8 * This code has been tested against that, and is equivalent,
9 * except that you don't need to include two pages of legalese
10 * with every copy.
11 *
12 * To compute the message digest of a chunk of bytes, declare an
13 * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
14 * needed on buffers full of bytes, and then call cifs_MD5_final, which
15 * will fill a supplied 16-byte array with the digest.
16 */
17
18/* This code slightly modified to fit into Samba by
19 abartlet@samba.org Jun 2001
20 and to fit the cifs vfs by
21 Steve French sfrench@us.ibm.com */
22
23#include <linux/string.h>
24#include "md5.h"
25
26static void MD5Transform(__u32 buf[4], __u32 const in[16]);
27
28/*
29 * Note: this code is harmless on little-endian machines.
30 */
31static void
32byteReverse(unsigned char *buf, unsigned longs)
33{
34 __u32 t;
35 do {
36 t = (__u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
37 ((unsigned) buf[1] << 8 | buf[0]);
38 *(__u32 *) buf = t;
39 buf += 4;
40 } while (--longs);
41}
42
43/*
44 * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
45 * initialization constants.
46 */
47void
48cifs_MD5_init(struct MD5Context *ctx)
49{
50 ctx->buf[0] = 0x67452301;
51 ctx->buf[1] = 0xefcdab89;
52 ctx->buf[2] = 0x98badcfe;
53 ctx->buf[3] = 0x10325476;
54
55 ctx->bits[0] = 0;
56 ctx->bits[1] = 0;
57}
58
59/*
60 * Update context to reflect the concatenation of another buffer full
61 * of bytes.
62 */
63void
64cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
65{
66 register __u32 t;
67
68 /* Update bitcount */
69
70 t = ctx->bits[0];
71 if ((ctx->bits[0] = t + ((__u32) len << 3)) < t)
72 ctx->bits[1]++; /* Carry from low to high */
73 ctx->bits[1] += len >> 29;
74
75 t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
76
77 /* Handle any leading odd-sized chunks */
78
79 if (t) {
80 unsigned char *p = (unsigned char *) ctx->in + t;
81
82 t = 64 - t;
83 if (len < t) {
84 memmove(p, buf, len);
85 return;
86 }
87 memmove(p, buf, t);
88 byteReverse(ctx->in, 16);
89 MD5Transform(ctx->buf, (__u32 *) ctx->in);
90 buf += t;
91 len -= t;
92 }
93 /* Process data in 64-byte chunks */
94
95 while (len >= 64) {
96 memmove(ctx->in, buf, 64);
97 byteReverse(ctx->in, 16);
98 MD5Transform(ctx->buf, (__u32 *) ctx->in);
99 buf += 64;
100 len -= 64;
101 }
102
103 /* Handle any remaining bytes of data. */
104
105 memmove(ctx->in, buf, len);
106}
107
108/*
109 * Final wrapup - pad to 64-byte boundary with the bit pattern
110 * 1 0* (64-bit count of bits processed, MSB-first)
111 */
112void
113cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
114{
115 unsigned int count;
116 unsigned char *p;
117
118 /* Compute number of bytes mod 64 */
119 count = (ctx->bits[0] >> 3) & 0x3F;
120
121 /* Set the first char of padding to 0x80. This is safe since there is
122 always at least one byte free */
123 p = ctx->in + count;
124 *p++ = 0x80;
125
126 /* Bytes of padding needed to make 64 bytes */
127 count = 64 - 1 - count;
128
129 /* Pad out to 56 mod 64 */
130 if (count < 8) {
131 /* Two lots of padding: Pad the first block to 64 bytes */
132 memset(p, 0, count);
133 byteReverse(ctx->in, 16);
134 MD5Transform(ctx->buf, (__u32 *) ctx->in);
135
136 /* Now fill the next block with 56 bytes */
137 memset(ctx->in, 0, 56);
138 } else {
139 /* Pad block to 56 bytes */
140 memset(p, 0, count - 8);
141 }
142 byteReverse(ctx->in, 14);
143
144 /* Append length in bits and transform */
145 ((__u32 *) ctx->in)[14] = ctx->bits[0];
146 ((__u32 *) ctx->in)[15] = ctx->bits[1];
147
148 MD5Transform(ctx->buf, (__u32 *) ctx->in);
149 byteReverse((unsigned char *) ctx->buf, 4);
150 memmove(digest, ctx->buf, 16);
151 memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
152}
153
154/* The four core functions - F1 is optimized somewhat */
155
156/* #define F1(x, y, z) (x & y | ~x & z) */
157#define F1(x, y, z) (z ^ (x & (y ^ z)))
158#define F2(x, y, z) F1(z, x, y)
159#define F3(x, y, z) (x ^ y ^ z)
160#define F4(x, y, z) (y ^ (x | ~z))
161
162/* This is the central step in the MD5 algorithm. */
163#define MD5STEP(f, w, x, y, z, data, s) \
164 (w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x)
165
166/*
167 * The core of the MD5 algorithm, this alters an existing MD5 hash to
168 * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks
169 * the data and converts bytes into longwords for this routine.
170 */
171static void
172MD5Transform(__u32 buf[4], __u32 const in[16])
173{
174 register __u32 a, b, c, d;
175
176 a = buf[0];
177 b = buf[1];
178 c = buf[2];
179 d = buf[3];
180
181 MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
182 MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
183 MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
184 MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
185 MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
186 MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
187 MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
188 MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
189 MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
190 MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
191 MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
192 MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
193 MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
194 MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
195 MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
196 MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
197
198 MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
199 MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
200 MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
201 MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
202 MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
203 MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
204 MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
205 MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
206 MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
207 MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
208 MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
209 MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
210 MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
211 MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
212 MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
213 MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
214
215 MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
216 MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
217 MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
218 MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
219 MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
220 MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
221 MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
222 MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
223 MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
224 MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
225 MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
226 MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
227 MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
228 MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
229 MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
230 MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
231
232 MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
233 MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
234 MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
235 MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
236 MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
237 MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
238 MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
239 MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
240 MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
241 MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
242 MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
243 MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
244 MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
245 MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
246 MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
247 MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
248
249 buf[0] += a;
250 buf[1] += b;
251 buf[2] += c;
252 buf[3] += d;
253}
254
255#if 0 /* currently unused */
256/***********************************************************************
257 the rfc 2104 version of hmac_md5 initialisation.
258***********************************************************************/
259static void
260hmac_md5_init_rfc2104(unsigned char *key, int key_len,
261 struct HMACMD5Context *ctx)
262{
263 int i;
264
265 /* if key is longer than 64 bytes reset it to key=MD5(key) */
266 if (key_len > 64) {
267 unsigned char tk[16];
268 struct MD5Context tctx;
269
270 cifs_MD5_init(&tctx);
271 cifs_MD5_update(&tctx, key, key_len);
272 cifs_MD5_final(tk, &tctx);
273
274 key = tk;
275 key_len = 16;
276 }
277
278 /* start out by storing key in pads */
279 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
280 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
281 memcpy(ctx->k_ipad, key, key_len);
282 memcpy(ctx->k_opad, key, key_len);
283
284 /* XOR key with ipad and opad values */
285 for (i = 0; i < 64; i++) {
286 ctx->k_ipad[i] ^= 0x36;
287 ctx->k_opad[i] ^= 0x5c;
288 }
289
290 cifs_MD5_init(&ctx->ctx);
291 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
292}
293#endif
294
295/***********************************************************************
296 the microsoft version of hmac_md5 initialisation.
297***********************************************************************/
298void
299hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
300 struct HMACMD5Context *ctx)
301{
302 int i;
303
304 /* if key is longer than 64 bytes truncate it */
305 if (key_len > 64)
306 key_len = 64;
307
308 /* start out by storing key in pads */
309 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
310 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
311 memcpy(ctx->k_ipad, key, key_len);
312 memcpy(ctx->k_opad, key, key_len);
313
314 /* XOR key with ipad and opad values */
315 for (i = 0; i < 64; i++) {
316 ctx->k_ipad[i] ^= 0x36;
317 ctx->k_opad[i] ^= 0x5c;
318 }
319
320 cifs_MD5_init(&ctx->ctx);
321 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
322}
323
324/***********************************************************************
325 update hmac_md5 "inner" buffer
326***********************************************************************/
327void
328hmac_md5_update(const unsigned char *text, int text_len,
329 struct HMACMD5Context *ctx)
330{
331 cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */
332}
333
334/***********************************************************************
335 finish off hmac_md5 "inner" buffer and generate outer one.
336***********************************************************************/
337void
338hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
339{
340 struct MD5Context ctx_o;
341
342 cifs_MD5_final(digest, &ctx->ctx);
343
344 cifs_MD5_init(&ctx_o);
345 cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
346 cifs_MD5_update(&ctx_o, digest, 16);
347 cifs_MD5_final(digest, &ctx_o);
348}
349
350/***********************************************************
351 single function to calculate an HMAC MD5 digest from data.
352 use the microsoft hmacmd5 init method because the key is 16 bytes.
353************************************************************/
354#if 0 /* currently unused */
355static void
356hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
357 unsigned char *digest)
358{
359 struct HMACMD5Context ctx;
360 hmac_md5_init_limK_to_64(key, 16, &ctx);
361 if (data_len != 0)
362 hmac_md5_update(data, data_len, &ctx);
363
364 hmac_md5_final(digest, &ctx);
365}
366#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
deleted file mode 100644
index 6fba8cb402fd..000000000000
--- a/fs/cifs/md5.h
+++ /dev/null
@@ -1,38 +0,0 @@
1#ifndef MD5_H
2#define MD5_H
3#ifndef HEADER_MD5_H
4/* Try to avoid clashes with OpenSSL */
5#define HEADER_MD5_H
6#endif
7
8struct MD5Context {
9 __u32 buf[4];
10 __u32 bits[2];
11 unsigned char in[64];
12};
13#endif /* !MD5_H */
14
15#ifndef _HMAC_MD5_H
16struct HMACMD5Context {
17 struct MD5Context ctx;
18 unsigned char k_ipad[65];
19 unsigned char k_opad[65];
20};
21#endif /* _HMAC_MD5_H */
22
23void cifs_MD5_init(struct MD5Context *context);
24void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
25 unsigned len);
26void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
27
28/* The following definitions come from lib/hmacmd5.c */
29
30/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
31 struct HMACMD5Context *ctx);*/
32void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
33 struct HMACMD5Context *ctx);
34void hmac_md5_update(const unsigned char *text, int text_len,
35 struct HMACMD5Context *ctx);
36void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
37/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
38 unsigned char *digest);*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 43f10281bc19..2a930a752a78 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -236,10 +236,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
236{ 236{
237 __u16 mid = 0; 237 __u16 mid = 0;
238 __u16 last_mid; 238 __u16 last_mid;
239 int collision;
239 bool collision;
240
241 if (server == NULL)
242 return mid;
243 240
244 spin_lock(&GlobalMid_Lock); 241 spin_lock(&GlobalMid_Lock);
245 last_mid = server->CurrentMid; /* we do not want to loop forever */ 242 last_mid = server->CurrentMid; /* we do not want to loop forever */
@@ -252,24 +249,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
252 (and it would also have to have been a request that 249 (and it would also have to have been a request that
253 did not time out) */ 250 did not time out) */
254 while (server->CurrentMid != last_mid) { 251 while (server->CurrentMid != last_mid) {
255 struct list_head *tmp;
256 struct mid_q_entry *mid_entry; 252 struct mid_q_entry *mid_entry;
253 unsigned int num_mids;
257 254
258 collision = 0;
255 collision = false;
259 if (server->CurrentMid == 0) 256 if (server->CurrentMid == 0)
260 server->CurrentMid++; 257 server->CurrentMid++;
261 258
262 list_for_each(tmp, &server->pending_mid_q) {
263 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
264
265 if ((mid_entry->mid == server->CurrentMid) &&
266 (mid_entry->midState == MID_REQUEST_SUBMITTED)) {
267 /* This mid is in use, try a different one */
268 collision = 1;
269 break;
270 }
271 }
272 if (collision == 0) {
259 num_mids = 0;
260 list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
261 ++num_mids;
262 if (mid_entry->mid == server->CurrentMid &&
263 mid_entry->midState == MID_REQUEST_SUBMITTED) {
264 /* This mid is in use, try a different one */
265 collision = true;
266 break;
267 }
268 }
269
270 /*
271 * if we have more than 32k mids in the list, then something
272 * is very wrong. Possibly a local user is trying to DoS the
273 * box by issuing long-running calls and SIGKILL'ing them. If
274 * we get to 2^16 mids then we're in big trouble as this
275 * function could loop forever.
276 *
277 * Go ahead and assign out the mid in this situation, but force
278 * an eventual reconnect to clean out the pending_mid_q.
279 */
280 if (num_mids > 32768)
281 server->tcpStatus = CifsNeedReconnect;
282
283 if (!collision) {
273 mid = server->CurrentMid; 284 mid = server->CurrentMid;
274 break; 285 break;
275 } 286 }
@@ -381,29 +392,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
381} 392}
382 393
383static int 394static int
384checkSMBhdr(struct smb_hdr *smb, __u16 mid)
395check_smb_hdr(struct smb_hdr *smb, __u16 mid)
385{ 396{
386 /* Make sure that this really is an SMB, that it is a response,
387 and that the message ids match */
388 if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) &&
389 (mid == smb->Mid)) {
390 if (smb->Flags & SMBFLG_RESPONSE)
391 return 0;
392 else {
393 /* only one valid case where server sends us request */
394 if (smb->Command == SMB_COM_LOCKING_ANDX)
395 return 0;
396 else
397 cERROR(1, "Received Request not response");
398 }
399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, "Bad protocol string signature header %x",
402 *(unsigned int *) smb->Protocol);
403 if (mid != smb->Mid)
404 cERROR(1, "Mids do not match");
405 }
406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
397 /* does it have the right SMB "signature" ? */
398 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
399 cERROR(1, "Bad protocol string signature header 0x%x",
400 *(unsigned int *)smb->Protocol);
401 return 1;
402 }
403
404 /* Make sure that message ids match */
405 if (mid != smb->Mid) {
406 cERROR(1, "Mids do not match. received=%u expected=%u",
407 smb->Mid, mid);
408 return 1;
409 }
410
411 /* if it's a response then accept */
412 if (smb->Flags & SMBFLG_RESPONSE)
413 return 0;
414
415 /* only one valid case where server sends us request */
416 if (smb->Command == SMB_COM_LOCKING_ANDX)
417 return 0;
418
419 cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
407 return 1; 420 return 1;
408} 421}
409 422
@@ -448,7 +461,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
448 return 1; 461 return 1;
449 } 462 }
450 463
451 if (checkSMBhdr(smb, mid))
464 if (check_smb_hdr(smb, mid))
452 return 1; 465 return 1;
453 clc_len = smbCalcSize_LE(smb); 466 clc_len = smbCalcSize_LE(smb);
454 467
@@ -465,25 +478,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 478 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
466 return 0; /* bcc wrapped */ 479 return 0; /* bcc wrapped */
467 } 480 }
468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d", 481 cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
469 clc_len, 4 + len, smb->Mid); 482 clc_len, 4 + len, smb->Mid);
470 /* Windows XP can return a few bytes too much, presumably 483
471 an illegal pad, at the end of byte range lock responses 484 if (4 + len < clc_len) {
472 so we allow for that three byte pad, as long as actual 485 cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
473 received length is as long or longer than calculated length */
474 /* We have now had to extend this more, since there is a
475 case in which it needs to be bigger still to handle a
476 malformed response to transact2 findfirst from WinXP when
477 access denied is returned and thus bcc and wct are zero
478 but server says length is 0x21 bytes too long as if the server
479 forget to reset the smb rfc1001 length when it reset the
480 wct and bcc to minimum size and drop the t2 parms and data */
481 if ((4+len > clc_len) && (len <= clc_len + 512))
482 return 0;
483 else {
484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
485 len, smb->Mid); 486 len, smb->Mid);
486 return 1; 487 return 1;
488 } else if (len > clc_len + 512) {
489 /*
490 * Some servers (Windows XP in particular) send more
491 * data than the lengths in the SMB packet would
492 * indicate on certain calls (byte range locks and
493 * trans2 find first calls in particular). While the
494 * client can handle such a frame by ignoring the
495 * trailing data, we choose limit the amount of extra
496 * data to 512 bytes.
497 */
498 cERROR(1, "RFC1001 size %u more than 512 bytes larger "
499 "than SMB for mid=%u", len, smb->Mid);
500 return 1;
487 } 501 }
488 } 502 }
489 return 0; 503 return 0;
@@ -571,7 +585,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
571 pCifsInode = CIFS_I(netfile->dentry->d_inode); 585 pCifsInode = CIFS_I(netfile->dentry->d_inode);
572 586
573 cifs_set_oplock_level(pCifsInode, 587 cifs_set_oplock_level(pCifsInode,
574 pSMB->OplockLevel);
588 pSMB->OplockLevel ? OPLOCK_READ : 0);
575 /* 589 /*
576 * cifs_oplock_break_put() can't be called 590 * cifs_oplock_break_put() can't be called
577 * from here. Get reference after queueing 591 * from here. Get reference after queueing
@@ -637,77 +651,6 @@ dump_smb(struct smb_hdr *smb_buf, int smb_buf_length)
637 return; 651 return;
638} 652}
639 653
640/* Convert 16 bit Unicode pathname to wire format from string in current code
641 page. Conversion may involve remapping up the seven characters that are
642 only legal in POSIX-like OS (if they are present in the string). Path
643 names are little endian 16 bit Unicode on the wire */
644int
645cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
646 const struct nls_table *cp, int mapChars)
647{
648 int i, j, charlen;
649 int len_remaining = maxlen;
650 char src_char;
651 __u16 temp;
652
653 if (!mapChars)
654 return cifs_strtoUCS(target, source, PATH_MAX, cp);
655
656 for (i = 0, j = 0; i < maxlen; j++) {
657 src_char = source[i];
658 switch (src_char) {
659 case 0:
660 target[j] = 0;
661 goto ctoUCS_out;
662 case ':':
663 target[j] = cpu_to_le16(UNI_COLON);
664 break;
665 case '*':
666 target[j] = cpu_to_le16(UNI_ASTERIK);
667 break;
668 case '?':
669 target[j] = cpu_to_le16(UNI_QUESTION);
670 break;
671 case '<':
672 target[j] = cpu_to_le16(UNI_LESSTHAN);
673 break;
674 case '>':
675 target[j] = cpu_to_le16(UNI_GRTRTHAN);
676 break;
677 case '|':
678 target[j] = cpu_to_le16(UNI_PIPE);
679 break;
680 /* BB We can not handle remapping slash until
681 all the calls to build_path_from_dentry
682 are modified, as they use slash as separator BB */
683 /* case '\\':
684 target[j] = cpu_to_le16(UNI_SLASH);
685 break;*/
686 default:
687 charlen = cp->char2uni(source+i,
688 len_remaining, &temp);
689 /* if no match, use question mark, which
690 at least in some cases serves as a wild card */
691 if (charlen < 1) {
692 target[j] = cpu_to_le16(0x003f);
693 charlen = 1;
694 } else
695 target[j] = cpu_to_le16(temp);
696 len_remaining -= charlen;
697 /* character may take more than one byte in the
698 source string, but will take exactly two
699 bytes in the target string */
700 i += charlen;
701 continue;
702 }
703 i++; /* move to next char in source string */
704 len_remaining--;
705 }
706
707ctoUCS_out:
708 return i;
709}
710
711void 654void
712cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) 655cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
713{ 656{
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 9aad47a2d62f..8d9189f64477 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
899 } 899 }
900 /* else ERRHRD class errors or junk - return EIO */ 900 /* else ERRHRD class errors or junk - return EIO */
901 901
902 cFYI(1, "Mapping smb error code %d to POSIX err %d", 902 cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
903 smberrcode, rc); 903 le32_to_cpu(smb->Status.CifsError), rc);
904 904
905 /* generic corrective action e.g. reconnect SMB session on 905 /* generic corrective action e.g. reconnect SMB session on
906 * ERRbaduid could be added */ 906 * ERRbaduid could be added */
@@ -916,14 +916,14 @@ unsigned int
916smbCalcSize(struct smb_hdr *ptr) 916smbCalcSize(struct smb_hdr *ptr)
917{ 917{
918 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 918 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
919 2 /* size of the bcc field */ + BCC(ptr));
919 2 /* size of the bcc field */ + get_bcc(ptr));
920} 920}
921 921
922unsigned int 922unsigned int
923smbCalcSize_LE(struct smb_hdr *ptr) 923smbCalcSize_LE(struct smb_hdr *ptr)
924{ 924{
925 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 925 return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
926 2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr)));
926 2 /* size of the bcc field */ + get_bcc_le(ptr));
927} 927}
928 928
929/* The following are taken from fs/ntfs/util.c */ 929/* The following are taken from fs/ntfs/util.c */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 7f25cc3d2256..f8e4cd2a7912 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -764,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
764{ 764{
765 int rc = 0; 765 int rc = 0;
766 int xid, i; 766 int xid, i;
767 struct cifs_sb_info *cifs_sb;
768 struct cifsTconInfo *pTcon; 767 struct cifsTconInfo *pTcon;
769 struct cifsFileInfo *cifsFile = NULL; 768 struct cifsFileInfo *cifsFile = NULL;
770 char *current_entry; 769 char *current_entry;
@@ -775,8 +774,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
775 774
776 xid = GetXid(); 775 xid = GetXid();
777 776
778 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
779
780 /* 777 /*
781 * Ensure FindFirst doesn't fail before doing filldir() for '.' and 778 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
782 * '..'. Otherwise we won't be able to notify VFS in case of failure. 779 * '..'. Otherwise we won't be able to notify VFS in case of failure.
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index eb746486e49e..1adc9625a344 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -277,7 +277,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
277} 277}
278 278
279static void 279static void
280decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
280decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses,
281 const struct nls_table *nls_cp) 281 const struct nls_table *nls_cp)
282{ 282{
283 int len; 283 int len;
@@ -323,7 +323,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
323 return; 323 return;
324} 324}
325 325
326static int decode_ascii_ssetup(char **pbcc_area, int bleft,
326static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
327 struct cifsSesInfo *ses, 327 struct cifsSesInfo *ses,
328 const struct nls_table *nls_cp) 328 const struct nls_table *nls_cp)
329{ 329{
@@ -575,12 +575,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
575 char *str_area; 575 char *str_area;
576 SESSION_SETUP_ANDX *pSMB; 576 SESSION_SETUP_ANDX *pSMB;
577 __u32 capabilities; 577 __u32 capabilities;
578 int count;
578 __u16 count;
579 int resp_buf_type; 579 int resp_buf_type;
580 struct kvec iov[3]; 580 struct kvec iov[3];
581 enum securityEnum type; 581 enum securityEnum type;
582 __u16 action;
582 __u16 action, bytes_remaining;
583 int bytes_remaining;
584 struct key *spnego_key = NULL; 583 struct key *spnego_key = NULL;
585 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 584 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
586 u16 blob_len; 585 u16 blob_len;
@@ -876,10 +875,10 @@ ssetup_ntlmssp_authenticate:
876 count = iov[1].iov_len + iov[2].iov_len; 875 count = iov[1].iov_len + iov[2].iov_len;
877 smb_buf->smb_buf_length += count; 876 smb_buf->smb_buf_length += count;
878 877
879 BCC_LE(smb_buf) = cpu_to_le16(count);
878 put_bcc_le(count, smb_buf);
880 879
881 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, 880 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
882 CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
881 CIFS_LOG_ERROR);
883 /* SMB request buf freed in SendReceive2 */ 882 /* SMB request buf freed in SendReceive2 */
884 883
885 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 884 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
@@ -910,7 +909,7 @@ ssetup_ntlmssp_authenticate:
910 cFYI(1, "UID = %d ", ses->Suid); 909 cFYI(1, "UID = %d ", ses->Suid);
911 /* response can have either 3 or 4 word count - Samba sends 3 */ 910 /* response can have either 3 or 4 word count - Samba sends 3 */
912 /* and lanman response is 3 */ 911 /* and lanman response is 3 */
913 bytes_remaining = BCC(smb_buf);
912 bytes_remaining = get_bcc(smb_buf);
914 bcc_ptr = pByteArea(smb_buf); 913 bcc_ptr = pByteArea(smb_buf);
915 914
916 if (smb_buf->WordCount == 4) { 915 if (smb_buf->WordCount == 4) {
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index b6b6dcb500bf..04721485925d 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -45,7 +45,6 @@
45 up with a different answer to the one above) 45 up with a different answer to the one above)
46*/ 46*/
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include "cifsencrypt.h"
49#define uchar unsigned char 48#define uchar unsigned char
50 49
51static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9, 50static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 192ea51af20f..b5041c849981 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -32,9 +32,8 @@
32#include "cifs_unicode.h" 32#include "cifs_unicode.h"
33#include "cifspdu.h" 33#include "cifspdu.h"
34#include "cifsglob.h" 34#include "cifsglob.h"
35#include "md5.h"
36#include "cifs_debug.h" 35#include "cifs_debug.h"
37#include "cifsencrypt.h" 36#include "cifsproto.h"
38 37
39#ifndef false 38#ifndef false
40#define false 0 39#define false 0
@@ -48,14 +47,58 @@
48#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) 47#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
49#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) 48#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
50 49
51/*The following definitions come from libsmb/smbencrypt.c */
50/* produce a md4 message digest from data of length n bytes */
51int
52mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
53{
54 int rc;
55 unsigned int size;
56 struct crypto_shash *md4;
57 struct sdesc *sdescmd4;
58
59 md4 = crypto_alloc_shash("md4", 0, 0);
60 if (IS_ERR(md4)) {
61 rc = PTR_ERR(md4);
62 cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
63 return rc;
64 }
65 size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
66 sdescmd4 = kmalloc(size, GFP_KERNEL);
67 if (!sdescmd4) {
68 rc = -ENOMEM;
69 cERROR(1, "%s: Memory allocation failure\n", __func__);
70 goto mdfour_err;
71 }
72 sdescmd4->shash.tfm = md4;
73 sdescmd4->shash.flags = 0x0;
74
75 rc = crypto_shash_init(&sdescmd4->shash);
76 if (rc) {
77 cERROR(1, "%s: Could not init md4 shash\n", __func__);
78 goto mdfour_err;
79 }
80 crypto_shash_update(&sdescmd4->shash, link_str, link_len);
81 rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
52 82
53void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
54 unsigned char *p24);
55void E_md4hash(const unsigned char *passwd, unsigned char *p16);
56static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
57 unsigned char p24[24]);
58void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
83mdfour_err:
84 crypto_free_shash(md4);
85 kfree(sdescmd4);
86
87 return rc;
88}
89
90/* Does the des encryption from the NT or LM MD4 hash. */
91static void
92SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
93 unsigned char p24[24])
94{
95 unsigned char p21[21];
96
97 memset(p21, '\0', 21);
98
99 memcpy(p21, passwd, 16);
100 E_P24(p21, c8, p24);
101}
59 102
60/* 103/*
61 This implements the X/Open SMB password encryption 104 This implements the X/Open SMB password encryption
@@ -118,9 +161,10 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
118 * Creates the MD4 Hash of the users password in NT UNICODE. 161 * Creates the MD4 Hash of the users password in NT UNICODE.
119 */ 162 */
120 163
121void
164int
122E_md4hash(const unsigned char *passwd, unsigned char *p16) 165E_md4hash(const unsigned char *passwd, unsigned char *p16)
123{ 166{
167 int rc;
124 int len; 168 int len;
125 __u16 wpwd[129]; 169 __u16 wpwd[129];
126 170
@@ -139,8 +183,10 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
139 /* Calculate length in bytes */ 183 /* Calculate length in bytes */
140 len = _my_wcslen(wpwd) * sizeof(__u16); 184 len = _my_wcslen(wpwd) * sizeof(__u16);
141 185
142 mdfour(p16, (unsigned char *) wpwd, len);
186 rc = mdfour(p16, (unsigned char *) wpwd, len);
143 memset(wpwd, 0, 129 * 2); 187 memset(wpwd, 0, 129 * 2);
188
189 return rc;
144} 190}
145 191
146#if 0 /* currently unused */ 192#if 0 /* currently unused */
@@ -212,19 +258,6 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
212} 258}
213#endif 259#endif
214 260
215/* Does the des encryption from the NT or LM MD4 hash. */
216static void
217SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
218 unsigned char p24[24])
219{
220 unsigned char p21[21];
221
222 memset(p21, '\0', 21);
223
224 memcpy(p21, passwd, 16);
225 E_P24(p21, c8, p24);
226}
227
228/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ 261/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
229#if 0 /* currently unused */ 262#if 0 /* currently unused */
230static void 263static void
@@ -242,16 +275,21 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
242#endif 275#endif
243 276
244/* Does the NT MD4 hash then des encryption. */ 277/* Does the NT MD4 hash then des encryption. */
245 278int
246void
247SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 279SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
248{ 280{
281 int rc;
249 unsigned char p21[21]; 282 unsigned char p21[21];
250 283
251 memset(p21, '\0', 21); 284 memset(p21, '\0', 21);
252 285
253 E_md4hash(passwd, p21); 286 rc = E_md4hash(passwd, p21);
287 if (rc) {
288 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
289 return rc;
290 }
254 SMBOWFencrypt(p21, c8, p24); 291 SMBOWFencrypt(p21, c8, p24);
292 return rc;
255} 293}
256 294
257 295
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 59ca81b16919..46d8756f2b24 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -36,7 +36,13 @@
36 36
37extern mempool_t *cifs_mid_poolp; 37extern mempool_t *cifs_mid_poolp;
38 38
39static struct mid_q_entry *
39static void
40wake_up_task(struct mid_q_entry *mid)
41{
42 wake_up_process(mid->callback_data);
43}
44
45struct mid_q_entry *
40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) 46AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
41{ 47{
42 struct mid_q_entry *temp; 48 struct mid_q_entry *temp;
@@ -58,28 +64,28 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 64 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
59 /* when mid allocated can be before when sent */ 65 /* when mid allocated can be before when sent */
60 temp->when_alloc = jiffies; 66 temp->when_alloc = jiffies;
61 temp->tsk = current;
67
68 /*
69 * The default is for the mid to be synchronous, so the
70 * default callback just wakes up the current task.
71 */
72 temp->callback = wake_up_task;
73 temp->callback_data = current;
62 } 74 }
63 75
64 spin_lock(&GlobalMid_Lock);
65 list_add_tail(&temp->qhead, &server->pending_mid_q);
66 atomic_inc(&midCount); 76 atomic_inc(&midCount);
67 temp->midState = MID_REQUEST_ALLOCATED; 77 temp->midState = MID_REQUEST_ALLOCATED;
68 spin_unlock(&GlobalMid_Lock);
69 return temp; 78 return temp;
70} 79}
71 80
72static void
81void
73DeleteMidQEntry(struct mid_q_entry *midEntry) 82DeleteMidQEntry(struct mid_q_entry *midEntry)
74{ 83{
75#ifdef CONFIG_CIFS_STATS2 84#ifdef CONFIG_CIFS_STATS2
76 unsigned long now; 85 unsigned long now;
77#endif 86#endif
78 spin_lock(&GlobalMid_Lock);
79 midEntry->midState = MID_FREE; 87 midEntry->midState = MID_FREE;
80 list_del(&midEntry->qhead);
81 atomic_dec(&midCount); 88 atomic_dec(&midCount);
82 spin_unlock(&GlobalMid_Lock);
83 if (midEntry->largeBuf) 89 if (midEntry->largeBuf)
84 cifs_buf_release(midEntry->resp_buf); 90 cifs_buf_release(midEntry->resp_buf);
85 else 91 else
@@ -103,6 +109,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
103 mempool_free(midEntry, cifs_mid_poolp); 109 mempool_free(midEntry, cifs_mid_poolp);
104} 110}
105 111
112static void
113delete_mid(struct mid_q_entry *mid)
114{
115 spin_lock(&GlobalMid_Lock);
116 list_del(&mid->qhead);
117 spin_unlock(&GlobalMid_Lock);
118
119 DeleteMidQEntry(mid);
120}
121
106static int 122static int
107smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) 123smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
108{ 124{
@@ -220,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
220 server->tcpStatus = CifsNeedReconnect; 236 server->tcpStatus = CifsNeedReconnect;
221 } 237 }
222 238
223 if (rc < 0) {
239 if (rc < 0 && rc != -EINTR)
224 cERROR(1, "Error %d sending data on socket to server", rc); 240 cERROR(1, "Error %d sending data on socket to server", rc);
225 }
241 else
226 rc = 0; 242 rc = 0;
227 243
228 /* Don't want to modify the buffer as a 244 /* Don't want to modify the buffer as a
@@ -244,31 +260,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
244 return smb_sendv(server, &iov, 1); 260 return smb_sendv(server, &iov, 1);
245} 261}
246 262
247static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
263static int wait_for_free_request(struct TCP_Server_Info *server,
264 const int long_op)
248{ 265{
249 if (long_op == CIFS_ASYNC_OP) { 266 if (long_op == CIFS_ASYNC_OP) {
250 /* oplock breaks must not be held up */ 267 /* oplock breaks must not be held up */
251 atomic_inc(&ses->server->inFlight);
268 atomic_inc(&server->inFlight);
252 return 0; 269 return 0;
253 } 270 }
254 271
255 spin_lock(&GlobalMid_Lock); 272 spin_lock(&GlobalMid_Lock);
256 while (1) { 273 while (1) {
257 if (atomic_read(&ses->server->inFlight) >=
258 cifs_max_pending){
274 if (atomic_read(&server->inFlight) >= cifs_max_pending) {
259 spin_unlock(&GlobalMid_Lock); 275 spin_unlock(&GlobalMid_Lock);
260#ifdef CONFIG_CIFS_STATS2 276#ifdef CONFIG_CIFS_STATS2
261 atomic_inc(&ses->server->num_waiters);
277 atomic_inc(&server->num_waiters);
262#endif 278#endif
263 wait_event(ses->server->request_q,
264 atomic_read(&ses->server->inFlight)
265 < cifs_max_pending);
279 wait_event(server->request_q,
280 atomic_read(&server->inFlight)
281 < cifs_max_pending);
266#ifdef CONFIG_CIFS_STATS2 282#ifdef CONFIG_CIFS_STATS2
267 atomic_dec(&ses->server->num_waiters);
283 atomic_dec(&server->num_waiters);
268#endif 284#endif
269 spin_lock(&GlobalMid_Lock); 285 spin_lock(&GlobalMid_Lock);
270 } else { 286 } else {
271 if (ses->server->tcpStatus == CifsExiting) {
287 if (server->tcpStatus == CifsExiting) {
272 spin_unlock(&GlobalMid_Lock); 288 spin_unlock(&GlobalMid_Lock);
273 return -ENOENT; 289 return -ENOENT;
274 } 290 }
@@ -278,7 +294,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
278 294
279 /* update # of requests on the wire to server */ 295 /* update # of requests on the wire to server */
280 if (long_op != CIFS_BLOCKING_OP) 296 if (long_op != CIFS_BLOCKING_OP)
281 atomic_inc(&ses->server->inFlight);
297 atomic_inc(&server->inFlight);
282 spin_unlock(&GlobalMid_Lock); 298 spin_unlock(&GlobalMid_Lock);
283 break; 299 break;
284 } 300 }
@@ -308,53 +324,85 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
308 *ppmidQ = AllocMidQEntry(in_buf, ses->server); 324 *ppmidQ = AllocMidQEntry(in_buf, ses->server);
309 if (*ppmidQ == NULL) 325 if (*ppmidQ == NULL)
310 return -ENOMEM; 326 return -ENOMEM;
327 spin_lock(&GlobalMid_Lock);
328 list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
329 spin_unlock(&GlobalMid_Lock);
311 return 0; 330 return 0;
312} 331}
313 332
314static int wait_for_response(struct cifsSesInfo *ses,
315 struct mid_q_entry *midQ,
316 unsigned long timeout,
317 unsigned long time_to_wait)
318{
319 unsigned long curr_timeout;
320
321 for (;;) {
322 curr_timeout = timeout + jiffies;
323 wait_event_timeout(ses->server->response_q,
324 midQ->midState != MID_REQUEST_SUBMITTED, timeout);
325
326 if (time_after(jiffies, curr_timeout) &&
327 (midQ->midState == MID_REQUEST_SUBMITTED) &&
328 ((ses->server->tcpStatus == CifsGood) ||
329 (ses->server->tcpStatus == CifsNew))) {
330
331 unsigned long lrt;
332
333 /* We timed out. Is the server still
334 sending replies ? */
335 spin_lock(&GlobalMid_Lock);
336 lrt = ses->server->lstrp;
337 spin_unlock(&GlobalMid_Lock);
338
339 /* Calculate time_to_wait past last receive time.
340 Although we prefer not to time out if the
341 server is still responding - we will time
342 out if the server takes more than 15 (or 45
343 or 180) seconds to respond to this request
344 and has not responded to any request from
345 other threads on the client within 10 seconds */
346 lrt += time_to_wait;
347 if (time_after(jiffies, lrt)) {
348 /* No replies for time_to_wait. */
349 cERROR(1, "server not responding");
350 return -1;
351 }
352 } else {
353 return 0;
354 }
355 }
356}
357
333static int
334wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
335{
336 int error;
337
338 error = wait_event_killable(server->response_q,
339 midQ->midState != MID_REQUEST_SUBMITTED);
340 if (error < 0)
341 return -ERESTARTSYS;
342
343 return 0;
344}
345
346
347/*
348 * Send a SMB request and set the callback function in the mid to handle
349 * the result. Caller is responsible for dealing with timeouts.
350 */
351int
352cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
353 mid_callback_t *callback, void *cbdata)
354{
355 int rc;
356 struct mid_q_entry *mid;
357
358 rc = wait_for_free_request(server, CIFS_ASYNC_OP);
359 if (rc)
360 return rc;
361
362 /* enable signing if server requires it */
363 if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
364 in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
365
366 mutex_lock(&server->srv_mutex);
367 mid = AllocMidQEntry(in_buf, server);
368 if (mid == NULL) {
369 mutex_unlock(&server->srv_mutex);
370 return -ENOMEM;
371 }
372
356}
357 372
373 /* put it on the pending_mid_q */
374 spin_lock(&GlobalMid_Lock);
375 list_add_tail(&mid->qhead, &server->pending_mid_q);
376 spin_unlock(&GlobalMid_Lock);
377
378 rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
379 if (rc) {
380 mutex_unlock(&server->srv_mutex);
381 goto out_err;
382 }
383
384 mid->callback = callback;
385 mid->callback_data = cbdata;
386 mid->midState = MID_REQUEST_SUBMITTED;
387#ifdef CONFIG_CIFS_STATS2
388 atomic_inc(&server->inSend);
389#endif
390 rc = smb_send(server, in_buf, in_buf->smb_buf_length);
391#ifdef CONFIG_CIFS_STATS2
392 atomic_dec(&server->inSend);
393 mid->when_sent = jiffies;
394#endif
395 mutex_unlock(&server->srv_mutex);
396 if (rc)
397 goto out_err;
398
399 return rc;
400out_err:
401 delete_mid(mid);
402 atomic_dec(&server->inFlight);
403 wake_up(&server->request_q);
404 return rc;
405}
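
To make the ownership rules concrete, a hedged sketch of a caller of cifs_call_async() - none of the example_* names exist in this patch. The callback runs in the demultiplex thread once the reply arrives and is responsible for freeing the mid and releasing the in-flight slot, mirroring the error path above; the mid is assumed to be already unlinked from pending_mid_q by then, as sync_mid_result() arranges for the synchronous path.

	struct example_ctx {
		struct completion done;
		struct TCP_Server_Info *server;
		int result;
	};

	/* Runs in the cifsd demultiplex thread once the reply is matched. */
	static void example_callback(struct mid_q_entry *mid)
	{
		struct example_ctx *ctx = mid->callback_data;

		ctx->result = (mid->midState == MID_RESPONSE_RECEIVED) ? 0 : -EIO;

		DeleteMidQEntry(mid);			/* assumed already off pending_mid_q */
		atomic_dec(&ctx->server->inFlight);	/* release the in-flight slot */
		wake_up(&ctx->server->request_q);

		complete(&ctx->done);	/* last: ctx lives on the waiter's stack */
	}

	static int example_send(struct TCP_Server_Info *server, struct smb_hdr *buf)
	{
		struct example_ctx ctx;
		int rc;

		init_completion(&ctx.done);
		ctx.server = server;

		rc = cifs_call_async(server, buf, example_callback, &ctx);
		if (rc)
			return rc;

		wait_for_completion(&ctx.done);
		return ctx.result;
	}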
358 406
359/* 407/*
360 * 408 *
@@ -382,6 +430,84 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
 	return rc;
 }
 
+static int
+sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
+{
+	int rc = 0;
+
+	cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
+	     mid->mid, mid->midState);
+
+	spin_lock(&GlobalMid_Lock);
+	/* ensure that it's no longer on the pending_mid_q */
+	list_del_init(&mid->qhead);
+
+	switch (mid->midState) {
+	case MID_RESPONSE_RECEIVED:
+		spin_unlock(&GlobalMid_Lock);
+		return rc;
+	case MID_REQUEST_SUBMITTED:
+		/* socket is going down, reject all calls */
+		if (server->tcpStatus == CifsExiting) {
+			cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
+			       __func__, mid->mid, mid->command, mid->midState);
+			rc = -EHOSTDOWN;
+			break;
+		}
+	case MID_RETRY_NEEDED:
+		rc = -EAGAIN;
+		break;
+	case MID_RESPONSE_MALFORMED:
+		rc = -EIO;
+		break;
+	default:
+		cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
+		       mid->mid, mid->midState);
+		rc = -EIO;
+	}
+	spin_unlock(&GlobalMid_Lock);
+
+	DeleteMidQEntry(mid);
+	return rc;
+}
+
+/*
+ * An NT cancel request header looks just like the original request except:
+ *
+ * The Command is SMB_COM_NT_CANCEL
+ * The WordCount is zeroed out
+ * The ByteCount is zeroed out
+ *
+ * This function mangles an existing request buffer into a
+ * SMB_COM_NT_CANCEL request and then sends it.
+ */
+static int
+send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
+	       struct mid_q_entry *mid)
+{
+	int rc = 0;
+
+	/* -4 for RFC1001 length and +2 for BCC field */
+	in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2;
+	in_buf->Command = SMB_COM_NT_CANCEL;
+	in_buf->WordCount = 0;
+	put_bcc_le(0, in_buf);
+
+	mutex_lock(&server->srv_mutex);
+	rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+	if (rc) {
+		mutex_unlock(&server->srv_mutex);
+		return rc;
+	}
+	rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+	mutex_unlock(&server->srv_mutex);
+
+	cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
+	     in_buf->Mid, rc);
+
+	return rc;
+}
+
 int
 SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	     struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -390,7 +516,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	int rc = 0;
 	int long_op;
 	unsigned int receive_len;
-	unsigned long timeout;
 	struct mid_q_entry *midQ;
 	struct smb_hdr *in_buf = iov[0].iov_base;
 
@@ -413,7 +538,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	   to the same server. We may make this configurable later or
 	   use ses->maxReq */
 
-	rc = wait_for_free_request(ses, long_op);
+	rc = wait_for_free_request(ses->server, long_op);
 	if (rc) {
 		cifs_small_buf_release(in_buf);
 		return rc;
@@ -452,70 +577,41 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 #endif
 
 	mutex_unlock(&ses->server->srv_mutex);
-	cifs_small_buf_release(in_buf);
 
-	if (rc < 0)
-		goto out;
-
-	if (long_op == CIFS_STD_OP)
-		timeout = 15 * HZ;
-	else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
-		timeout = 180 * HZ;
-	else if (long_op == CIFS_LONG_OP)
-		timeout = 45 * HZ; /* should be greater than
-			servers oplock break timeout (about 43 seconds) */
-	else if (long_op == CIFS_ASYNC_OP)
-		goto out;
-	else if (long_op == CIFS_BLOCKING_OP)
-		timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
-	else {
-		cERROR(1, "unknown timeout flag %d", long_op);
-		rc = -EIO;
-		goto out;
-	}
-
-	/* wait for 15 seconds or until woken up due to response arriving or
-	   due to last connection to this server being unmounted */
-	if (signal_pending(current)) {
-		/* if signal pending do not hold up user for full smb timeout
-		   but we still give response a chance to complete */
-		timeout = 2 * HZ;
-	}
-
-	/* No user interrupts in wait - wreaks havoc with performance */
-	wait_for_response(ses, midQ, timeout, 10 * HZ);
-
-	spin_lock(&GlobalMid_Lock);
-
-	if (midQ->resp_buf == NULL) {
-		cERROR(1, "No response to cmd %d mid %d",
-			midQ->command, midQ->mid);
-		if (midQ->midState == MID_REQUEST_SUBMITTED) {
-			if (ses->server->tcpStatus == CifsExiting)
-				rc = -EHOSTDOWN;
-			else {
-				ses->server->tcpStatus = CifsNeedReconnect;
-				midQ->midState = MID_RETRY_NEEDED;
-			}
-		}
-
-		if (rc != -EHOSTDOWN) {
-			if (midQ->midState == MID_RETRY_NEEDED) {
-				rc = -EAGAIN;
-				cFYI(1, "marking request for retry");
-			} else {
-				rc = -EIO;
-			}
-		}
-		spin_unlock(&GlobalMid_Lock);
-		DeleteMidQEntry(midQ);
-		/* Update # of requests on wire to server */
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
-		return rc;
-	}
-
-	spin_unlock(&GlobalMid_Lock);
+	if (rc < 0) {
+		cifs_small_buf_release(in_buf);
+		goto out;
+	}
+
+	if (long_op == CIFS_ASYNC_OP) {
+		cifs_small_buf_release(in_buf);
+		goto out;
+	}
+
+	rc = wait_for_response(ses->server, midQ);
+	if (rc != 0) {
+		send_nt_cancel(ses->server, in_buf, midQ);
+		spin_lock(&GlobalMid_Lock);
+		if (midQ->midState == MID_REQUEST_SUBMITTED) {
+			midQ->callback = DeleteMidQEntry;
+			spin_unlock(&GlobalMid_Lock);
+			cifs_small_buf_release(in_buf);
+			atomic_dec(&ses->server->inFlight);
+			wake_up(&ses->server->request_q);
+			return rc;
+		}
+		spin_unlock(&GlobalMid_Lock);
+	}
+
+	cifs_small_buf_release(in_buf);
+
+	rc = sync_mid_result(midQ, ses->server);
+	if (rc != 0) {
+		atomic_dec(&ses->server->inFlight);
+		wake_up(&ses->server->request_q);
+		return rc;
+	}
+
 	receive_len = midQ->resp_buf->smb_buf_length;
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -559,19 +655,18 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		if (receive_len >= sizeof(struct smb_hdr) - 4
 		    /* do not count RFC1001 header */ +
 		    (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
-			BCC(midQ->resp_buf) =
-				le16_to_cpu(BCC_LE(midQ->resp_buf));
+			put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
 		if ((flags & CIFS_NO_RESP) == 0)
 			midQ->resp_buf = NULL;	/* mark it so buf will
 						   not be freed by
-						   DeleteMidQEntry */
+						   delete_mid */
 	} else {
 		rc = -EIO;
 		cFYI(1, "Bad MID state?");
 	}
 
 out:
-	DeleteMidQEntry(midQ);
+	delete_mid(midQ);
 	atomic_dec(&ses->server->inFlight);
 	wake_up(&ses->server->request_q);
 
@@ -585,7 +680,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 {
 	int rc = 0;
 	unsigned int receive_len;
-	unsigned long timeout;
 	struct mid_q_entry *midQ;
 
 	if (ses == NULL) {
@@ -610,7 +704,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		return -EIO;
 	}
 
-	rc = wait_for_free_request(ses, long_op);
+	rc = wait_for_free_request(ses->server, long_op);
 	if (rc)
 		return rc;
 
@@ -649,64 +743,31 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	if (rc < 0)
 		goto out;
 
-	if (long_op == CIFS_STD_OP)
-		timeout = 15 * HZ;
-	/* wait for 15 seconds or until woken up due to response arriving or
-	   due to last connection to this server being unmounted */
-	else if (long_op == CIFS_ASYNC_OP)
+	if (long_op == CIFS_ASYNC_OP)
 		goto out;
-	else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
-		timeout = 180 * HZ;
-	else if (long_op == CIFS_LONG_OP)
-		timeout = 45 * HZ; /* should be greater than
-			servers oplock break timeout (about 43 seconds) */
-	else if (long_op == CIFS_BLOCKING_OP)
-		timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
-	else {
-		cERROR(1, "unknown timeout flag %d", long_op);
-		rc = -EIO;
-		goto out;
-	}
-
-	if (signal_pending(current)) {
-		/* if signal pending do not hold up user for full smb timeout
-		   but we still give response a chance to complete */
-		timeout = 2 * HZ;
-	}
-
-	/* No user interrupts in wait - wreaks havoc with performance */
-	wait_for_response(ses, midQ, timeout, 10 * HZ);
 
-	spin_lock(&GlobalMid_Lock);
-	if (midQ->resp_buf == NULL) {
-		cERROR(1, "No response for cmd %d mid %d",
-			midQ->command, midQ->mid);
+	rc = wait_for_response(ses->server, midQ);
+	if (rc != 0) {
+		send_nt_cancel(ses->server, in_buf, midQ);
+		spin_lock(&GlobalMid_Lock);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
-			if (ses->server->tcpStatus == CifsExiting)
-				rc = -EHOSTDOWN;
-			else {
-				ses->server->tcpStatus = CifsNeedReconnect;
-				midQ->midState = MID_RETRY_NEEDED;
-			}
-		}
-
-		if (rc != -EHOSTDOWN) {
-			if (midQ->midState == MID_RETRY_NEEDED) {
-				rc = -EAGAIN;
-				cFYI(1, "marking request for retry");
-			} else {
-				rc = -EIO;
-			}
+			/* no longer considered to be "in-flight" */
+			midQ->callback = DeleteMidQEntry;
+			spin_unlock(&GlobalMid_Lock);
+			atomic_dec(&ses->server->inFlight);
+			wake_up(&ses->server->request_q);
+			return rc;
 		}
 		spin_unlock(&GlobalMid_Lock);
-		DeleteMidQEntry(midQ);
-		/* Update # of requests on wire to server */
+	}
+
+	rc = sync_mid_result(midQ, ses->server);
+	if (rc != 0) {
 		atomic_dec(&ses->server->inFlight);
 		wake_up(&ses->server->request_q);
 		return rc;
 	}
 
-	spin_unlock(&GlobalMid_Lock);
 	receive_len = midQ->resp_buf->smb_buf_length;
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -748,43 +809,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		if (receive_len >= sizeof(struct smb_hdr) - 4
 		    /* do not count RFC1001 header */ +
 		    (2 * out_buf->WordCount) + 2 /* bcc */ )
-			BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
+			put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
 	} else {
 		rc = -EIO;
 		cERROR(1, "Bad MID state?");
 	}
 
 out:
-	DeleteMidQEntry(midQ);
+	delete_mid(midQ);
 	atomic_dec(&ses->server->inFlight);
 	wake_up(&ses->server->request_q);
 
 	return rc;
 }
 
-/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */
-
-static int
-send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
-		struct mid_q_entry *midQ)
-{
-	int rc = 0;
-	struct cifsSesInfo *ses = tcon->ses;
-	__u16 mid = in_buf->Mid;
-
-	header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
-	in_buf->Mid = mid;
-	mutex_lock(&ses->server->srv_mutex);
-	rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
-	if (rc) {
-		mutex_unlock(&ses->server->srv_mutex);
-		return rc;
-	}
-	rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
-	mutex_unlock(&ses->server->srv_mutex);
-	return rc;
-}
-
 /* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
    blocking lock to return. */
 
@@ -807,7 +845,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
 	pSMB->hdr.Mid = GetNextMid(ses->server);
 
 	return SendReceive(xid, ses, in_buf, out_buf,
-			&bytes_returned, CIFS_STD_OP);
+			&bytes_returned, 0);
 }
 
 int
@@ -845,7 +883,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		return -EIO;
 	}
 
-	rc = wait_for_free_request(ses, CIFS_BLOCKING_OP);
+	rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP);
 	if (rc)
 		return rc;
 
@@ -863,7 +901,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 
 	rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
 	if (rc) {
-		DeleteMidQEntry(midQ);
+		delete_mid(midQ);
 		mutex_unlock(&ses->server->srv_mutex);
 		return rc;
 	}
@@ -880,7 +918,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	mutex_unlock(&ses->server->srv_mutex);
 
 	if (rc < 0) {
-		DeleteMidQEntry(midQ);
+		delete_mid(midQ);
 		return rc;
 	}
 
@@ -899,10 +937,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	if (in_buf->Command == SMB_COM_TRANSACTION2) {
 		/* POSIX lock. We send a NT_CANCEL SMB to cause the
 		   blocking lock to return. */
-
-		rc = send_nt_cancel(tcon, in_buf, midQ);
+		rc = send_nt_cancel(ses->server, in_buf, midQ);
 		if (rc) {
-			DeleteMidQEntry(midQ);
+			delete_mid(midQ);
 			return rc;
 		}
 	} else {
@@ -914,47 +951,33 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		/* If we get -ENOLCK back the lock may have
 		   already been removed. Don't exit in this case. */
 		if (rc && rc != -ENOLCK) {
-			DeleteMidQEntry(midQ);
+			delete_mid(midQ);
 			return rc;
 		}
 	}
 
-		/* Wait 5 seconds for the response. */
-		if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) {
-			/* We got the response - restart system call. */
-			rstart = 1;
-		}
-	}
-
-	spin_lock(&GlobalMid_Lock);
-	if (midQ->resp_buf) {
-		spin_unlock(&GlobalMid_Lock);
-		receive_len = midQ->resp_buf->smb_buf_length;
-	} else {
-		cERROR(1, "No response for cmd %d mid %d",
-			midQ->command, midQ->mid);
-		if (midQ->midState == MID_REQUEST_SUBMITTED) {
-			if (ses->server->tcpStatus == CifsExiting)
-				rc = -EHOSTDOWN;
-			else {
-				ses->server->tcpStatus = CifsNeedReconnect;
-				midQ->midState = MID_RETRY_NEEDED;
-			}
-		}
-
-		if (rc != -EHOSTDOWN) {
-			if (midQ->midState == MID_RETRY_NEEDED) {
-				rc = -EAGAIN;
-				cFYI(1, "marking request for retry");
-			} else {
-				rc = -EIO;
-			}
-		}
-		spin_unlock(&GlobalMid_Lock);
-		DeleteMidQEntry(midQ);
-		return rc;
+		rc = wait_for_response(ses->server, midQ);
+		if (rc) {
+			send_nt_cancel(ses->server, in_buf, midQ);
+			spin_lock(&GlobalMid_Lock);
+			if (midQ->midState == MID_REQUEST_SUBMITTED) {
+				/* no longer considered to be "in-flight" */
+				midQ->callback = DeleteMidQEntry;
+				spin_unlock(&GlobalMid_Lock);
+				return rc;
+			}
+			spin_unlock(&GlobalMid_Lock);
+		}
+
+		/* We got the response - restart system call. */
+		rstart = 1;
 	}
 
+	rc = sync_mid_result(midQ, ses->server);
+	if (rc != 0)
+		return rc;
+
+	receive_len = midQ->resp_buf->smb_buf_length;
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
 		cERROR(1, "Frame too large received. Length: %d Xid: %d",
 			receive_len, xid);
@@ -998,10 +1021,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		if (receive_len >= sizeof(struct smb_hdr) - 4
 		    /* do not count RFC1001 header */ +
 		    (2 * out_buf->WordCount) + 2 /* bcc */ )
-			BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
+			put_bcc(get_bcc_le(out_buf), out_buf);
 
 out:
-	DeleteMidQEntry(midQ);
+	delete_mid(midQ);
 	if (rstart && rc == -EACCES)
 		return -ERESTARTSYS;
 	return rc;
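
The transport.c changes above replace the old per-call timeout ladder with a callback-driven model: cifs_call_async() stores a completion function in the mid (multiplex ID) entry, the receive path invokes it when the reply arrives, and sync_mid_result() centralizes the mapping from mid state to an errno. A minimal userspace sketch of that pattern follows; the names pending_req and req_complete are illustrative only and are not part of the CIFS code.

	#include <stdio.h>

	/* A pending request carries its completion callback, much as a
	 * mid does after cifs_call_async() fills in mid->callback. */
	struct pending_req {
		unsigned int id;
		void (*callback)(struct pending_req *req, int status);
	};

	static void req_complete(struct pending_req *req, int status)
	{
		printf("request %u completed, status %d\n", req->id, status);
	}

	int main(void)
	{
		struct pending_req req = { .id = 42, .callback = req_complete };

		/* in the kernel it is the receive path, not the
		 * submitter, that fires the callback */
		req.callback(&req, 0);
		return 0;
	}

The payoff visible in the diff is that SendReceive() and SendReceive2() no longer carry their own timeout arithmetic; waiting lives in wait_for_response() and error mapping in sync_mid_result().
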
diff --git a/fs/compat.c b/fs/compat.c
index eb1740ac8c0a..f6fd0a00e6cc 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -257,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
 }
 
 /*
- * The following statfs calls are copies of code from fs/open.c and
+ * The following statfs calls are copies of code from fs/statfs.c and
  * should be checked against those from time to time
  */
 asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
@@ -320,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 	    __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
 	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
 	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
-	    __put_user(kbuf->f_frsize, &ubuf->f_frsize))
+	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
 		return -EFAULT;
 	return 0;
 }
@@ -597,10 +599,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
 	if (nr_segs > fast_segs) {
 		ret = -ENOMEM;
 		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
-		if (iov == NULL) {
-			*ret_pointer = fast_pointer;
+		if (iov == NULL)
 			goto out;
-		}
 	}
 	*ret_pointer = iov;
 
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 13587cc97a0b..9febcdefdfdc 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,8 +1,8 @@
 config CONFIGFS_FS
 	tristate "Userspace-driven configuration filesystem"
-	depends on SYSFS
+	select SYSFS
 	help
-	  configfs is a ram-based filesystem that provides the converse
+	  configfs is a RAM-based filesystem that provides the converse
 	  of sysfs's functionality. Where sysfs is a filesystem-based
 	  view of kernel objects, configfs is a filesystem-based manager
 	  of kernel objects, or config_items.
diff --git a/fs/dcache.c b/fs/dcache.c
index 0c6d5c549d84..2a6bd9a4ae97 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -176,6 +176,7 @@ static void d_free(struct dentry *dentry)
 
 /**
  * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * @dentry: the target dentry
  * After this call, in-progress rcu-walk path lookup will fail. This
  * should be called after unhashing, and after changing d_inode (if
  * the dentry has not already been unhashed).
@@ -281,6 +282,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
 /**
  * d_kill - kill dentry and return parent
  * @dentry: dentry to kill
+ * @parent: parent dentry
  *
  * The dentry must already be unhashed and removed from the LRU.
  *
@@ -1357,8 +1359,8 @@ EXPORT_SYMBOL(d_alloc_name);
 
 void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 {
-	BUG_ON(dentry->d_op);
-	BUG_ON(dentry->d_flags & (DCACHE_OP_HASH	|
+	WARN_ON_ONCE(dentry->d_op);
+	WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH	|
 				DCACHE_OP_COMPARE	|
 				DCACHE_OP_REVALIDATE	|
 				DCACHE_OP_DELETE ));
@@ -1380,8 +1382,11 @@ EXPORT_SYMBOL(d_set_d_op);
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
 	spin_lock(&dentry->d_lock);
-	if (inode)
+	if (inode) {
+		if (unlikely(IS_AUTOMOUNT(inode)))
+			dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
 		list_add(&dentry->d_alias, &inode->i_dentry);
+	}
 	dentry->d_inode = inode;
 	dentry_rcuwalk_barrier(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -1970,7 +1975,7 @@ out:
 /**
  * d_validate - verify dentry provided from insecure source (deprecated)
  * @dentry: The dentry alleged to be valid child of @dparent
- * @parent: The parent dentry (known to be valid)
+ * @dparent: The parent dentry (known to be valid)
  *
  * An insecure source has sent us a dentry, here we verify it and dget() it.
  * This is used by ncpfs in its readdir implementation.
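
Two of the dcache.c hunks above share a theme: d_set_d_op() now reports misuse with WARN_ON_ONCE() instead of halting the machine with BUG_ON(), and __d_instantiate() latches IS_AUTOMOUNT(inode) into DCACHE_NEED_AUTOMOUNT so later path walks can trigger automounts. A userspace approximation of the warn-once idiom, offered only as a sketch (the kernel's WARN_ON_ONCE() also emits a stack trace):

	#include <stdio.h>
	#include <stdbool.h>

	/* report a bad condition once, then keep running */
	#define warn_on_once(cond) do {				\
		static bool warned;				\
		if ((cond) && !warned) {			\
			warned = true;				\
			fprintf(stderr, "warning: %s\n", #cond);\
		}						\
	} while (0)

	int main(void)
	{
		int i;

		for (i = 0; i < 3; i++)
			warn_on_once(1);	/* prints a single warning */
		return 0;
	}
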
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8201c2558d85..dcb5577cde1d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error)
 }
 EXPORT_SYMBOL_GPL(dio_end_io);
 
-static int
+static void
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 	      sector_t first_sector, int nr_vecs)
 {
 	struct bio *bio;
 
+	/*
+	 * bio_alloc() is guaranteed to return a bio when called with
+	 * __GFP_WAIT and we request a valid number of vectors.
+	 */
 	bio = bio_alloc(GFP_KERNEL, nr_vecs);
 
 	bio->bi_bdev = bdev;
@@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 
 	dio->bio = bio;
 	dio->logical_offset_in_bio = dio->cur_page_fs_offset;
-	return 0;
 }
 
 /*
@@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector)
 		goto out;
 	sector = start_sector << (dio->blkbits - 9);
 	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+	nr_pages = min(nr_pages, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
-	ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
+	dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
 	dio->boundary = 0;
 out:
 	return ret;
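
The direct-io.c change works because bio_alloc() cannot fail for a __GFP_WAIT allocation once nr_vecs is capped at BIO_MAX_PAGES, so dio_bio_alloc() can become void. The essential move is the clamp before the allocation; a small sketch, with BIO_MAX_PAGES standing in for the kernel constant:

	#include <stdio.h>

	#define BIO_MAX_PAGES 256	/* placeholder for the kernel's limit */

	static int min_int(int a, int b)
	{
		return a < b ? a : b;
	}

	int main(void)
	{
		int pages_in_io = 1024;	/* hypothetical request size */
		int nr_pages = min_int(pages_in_io, BIO_MAX_PAGES);

		/* an allocation bounded this way is what lets the
		 * kernel treat the GFP_KERNEL case as infallible */
		printf("allocating %d vecs (capped at %d)\n",
		       nr_pages, BIO_MAX_PAGES);
		return 0;
	}
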
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 2dbb422e8116..1897eb1b4b6a 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,8 +1,7 @@
 menuconfig DLM
 	tristate "Distributed Lock Manager (DLM)"
 	depends on EXPERIMENTAL && INET
-	depends on SYSFS && (IPV6 || IPV6=n)
-	select CONFIGFS_FS
+	depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
 	select IP_SCTP
 	help
 	A general purpose distributed lock manager for kernel or userspace
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9c64ae9e4c1a..2d8c87b951c2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1468,15 +1468,13 @@ static void work_stop(void)
 
 static int work_start(void)
 {
-	recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
-					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+	recv_workqueue = create_singlethread_workqueue("dlm_recv");
 	if (!recv_workqueue) {
 		log_print("can't start dlm_recv");
 		return -ENOMEM;
 	}
 
-	send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
-					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+	send_workqueue = create_singlethread_workqueue("dlm_send");
 	if (!send_workqueue) {
 		log_print("can't start dlm_send");
 		destroy_workqueue(recv_workqueue);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cbadc1bee6e7..bfd8b680e648 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
 	BUG_ON(!crypt_stat || !crypt_stat->tfm
 	       || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n",
+		ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
 				crypt_stat->key_size);
 		ecryptfs_dump_hex(crypt_stat->key,
 				  crypt_stat->key_size);
@@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
 	rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
 				(extent_base + extent_offset));
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error attempting to "
-				"derive IV for extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+			"extent [0x%.16llx]; rc = [%d]\n",
+			(unsigned long long)(extent_base + extent_offset), rc);
 		goto out;
 	}
 	if (unlikely(ecryptfs_verbosity > 0)) {
@@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
 	}
 	rc = 0;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
+			"rc = [%d]\n",
+			(unsigned long long)(extent_base + extent_offset), rc);
 		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
 				"encryption:\n");
 		ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
@@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
 	rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
 				(extent_base + extent_offset));
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error attempting to "
-				"derive IV for extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+			"extent [0x%.16llx]; rc = [%d]\n",
+			(unsigned long long)(extent_base + extent_offset), rc);
 		goto out;
 	}
 	if (unlikely(ecryptfs_verbosity > 0)) {
@@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
 	}
 	rc = 0;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; "
-				"rc = [%d]\n", (extent_base + extent_offset),
-				rc);
+		ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
+			"rc = [%d]\n",
+			(unsigned long long)(extent_base + extent_offset), rc);
 		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
 				"decryption:\n");
 		ecryptfs_dump_hex((char *)(page_address(page)
@@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
 	}
 	ecryptfs_printk(KERN_DEBUG,
 			"Initializing cipher [%s]; strlen = [%d]; "
-			"key_size_bits = [%d]\n",
+			"key_size_bits = [%zd]\n",
 			crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
 			crypt_stat->key_size << 3);
 	if (crypt_stat->tfm) {
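
The crypto.c hunks above are all format-string fixes: size_t values were being printed with %d and 64-bit extent offsets with %x. The same rules hold for userspace printf(): %zd/%zu match size_t, and a cast to unsigned long long keeps %llx correct when the argument's width varies by architecture. A short, compilable demonstration:

	#include <stdio.h>
	#include <stddef.h>

	int main(void)
	{
		size_t key_size = 32;
		unsigned long extent = 0x1234;	/* loff_t-like value */

		printf("Key size [%zu]\n", key_size);
		printf("Extent [0x%.16llx]\n", (unsigned long long)extent);
		return 0;
	}
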
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 413a3c48f0bb..dbc84ed96336 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key)
 	 (((struct user_key_payload*)key->payload.data)->data);
 }
 
-#define ECRYPTFS_SUPER_MAGIC 0xf15f
 #define ECRYPTFS_MAX_KEYSET_SIZE 1024
 #define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64
@@ -584,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
 
 #define ecryptfs_printk(type, fmt, arg...) \
 		__ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
+__attribute__ ((format(printf, 1, 2)))
 void __ecryptfs_printk(const char *fmt, ...);
 
 extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 91da02987bff..81e10e6a9443 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -47,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
 				const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos)
 {
-	int rc;
+	ssize_t rc;
 	struct dentry *lower_dentry;
 	struct vfsmount *lower_vfsmount;
 	struct file *file = iocb->ki_filp;
@@ -191,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 				       | ECRYPTFS_ENCRYPTED);
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
-	if (!ecryptfs_inode_to_private(inode)->lower_file) {
-		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to initialize "
-			       "the persistent file for the dentry with name "
-			       "[%s]; rc = [%d]\n", __func__,
-			       ecryptfs_dentry->d_name.name, rc);
-			goto out_free;
-		}
+	rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+		       "the persistent file for the dentry with name "
+		       "[%s]; rc = [%d]\n", __func__,
+		       ecryptfs_dentry->d_name.name, rc);
+		goto out_free;
 	}
-	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
-	    && !(file->f_flags & O_RDONLY)) {
+	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
+	    == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
 		rc = -EPERM;
 		printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
 		       "file must hence be opened RO\n", __func__);
@@ -243,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 		}
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
-	ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] "
-			"size: [0x%.16x]\n", inode, inode->i_ino,
-			i_size_read(inode));
+	ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
+			"[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
+			(unsigned long long)i_size_read(inode));
 	goto out;
 out_free:
 	kmem_cache_free(ecryptfs_file_info_cache,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 64ff02330752..bd33f87a1907 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -185,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
 				"context; rc = [%d]\n", rc);
 		goto out;
 	}
-	if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
-		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to initialize "
-			       "the persistent file for the dentry with name "
-			       "[%s]; rc = [%d]\n", __func__,
-			       ecryptfs_dentry->d_name.name, rc);
-			goto out;
-		}
+	rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+		       "the persistent file for the dentry with name "
+		       "[%s]; rc = [%d]\n", __func__,
+		       ecryptfs_dentry->d_name.name, rc);
+		goto out;
 	}
 	rc = ecryptfs_write_metadata(ecryptfs_dentry);
 	if (rc) {
@@ -302,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 		rc = -ENOMEM;
 		goto out;
 	}
-	if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
-		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to initialize "
-			       "the persistent file for the dentry with name "
-			       "[%s]; rc = [%d]\n", __func__,
-			       ecryptfs_dentry->d_name.name, rc);
-			goto out_free_kmem;
-		}
+	rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to initialize "
+		       "the persistent file for the dentry with name "
+		       "[%s]; rc = [%d]\n", __func__,
+		       ecryptfs_dentry->d_name.name, rc);
+		goto out_free_kmem;
 	}
 	crypt_stat = &ecryptfs_inode_to_private(
 					ecryptfs_dentry->d_inode)->crypt_stat;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index b1f6858a5223..c1436cff6f2d 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -59,7 +59,7 @@ static int process_request_key_err(long err_code)
 		break;
 	default:
 		ecryptfs_printk(KERN_WARNING, "Unknown error code: "
-				"[0x%.16x]\n", err_code);
+				"[0x%.16lx]\n", err_code);
 		rc = -EINVAL;
 	}
 	return rc;
@@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
 	} else {
 		rc = -EINVAL;
 		ecryptfs_printk(KERN_WARNING,
-				"Unsupported packet size: [%d]\n", size);
+				"Unsupported packet size: [%zd]\n", size);
 	}
 	return rc;
 }
@@ -1672,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 					  auth_tok->session_key.decrypted_key_size);
 	crypt_stat->flags |= ECRYPTFS_KEY_VALID;
 	if (unlikely(ecryptfs_verbosity > 0)) {
-		ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n",
+		ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n",
 				crypt_stat->key_size);
 		ecryptfs_dump_hex(crypt_stat->key,
 				  crypt_stat->key_size);
@@ -1754,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 		if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
 			ecryptfs_printk(KERN_ERR, "Expected "
 					"signature of size [%d]; "
-					"read size [%d]\n",
+					"read size [%zd]\n",
 					ECRYPTFS_SIG_SIZE,
 					tag_11_contents_size);
 			rc = -EIO;
@@ -1787,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 				goto out_wipe_list;
 			break;
 		default:
-			ecryptfs_printk(KERN_DEBUG, "No packet at offset "
-					"[%d] of the file header; hex value of "
+			ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
+					"of the file header; hex value of "
 					"character is [0x%.2x]\n", i, src[i]);
 			next_packet_is_auth_tok_packet = 0;
 		}
@@ -1864,8 +1864,8 @@ found_matching_auth_tok:
 		       "session key for authentication token with sig "
 		       "[%.*s]; rc = [%d]. Removing auth tok "
 		       "candidate from the list and searching for "
-		       "the next match.\n", candidate_auth_tok_sig,
-		       ECRYPTFS_SIG_SIZE_HEX, rc);
+		       "the next match.\n", ECRYPTFS_SIG_SIZE_HEX,
+		       candidate_auth_tok_sig, rc);
 		list_for_each_entry_safe(auth_tok_list_item,
 					 auth_tok_list_item_tmp,
 					 &auth_tok_list, list) {
@@ -2168,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	if (encrypted_session_key_valid) {
 		ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; "
 				"using auth_tok->session_key.encrypted_key, "
-				"where key_rec->enc_key_size = [%d]\n",
+				"where key_rec->enc_key_size = [%zd]\n",
 				key_rec->enc_key_size);
 		memcpy(key_rec->enc_key,
 		       auth_tok->session_key.encrypted_key,
@@ -2198,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	if (rc < 1 || rc > 2) {
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat session key; expected rc = 1; "
-				"got rc = [%d]. key_rec->enc_key_size = [%d]\n",
+				"got rc = [%d]. key_rec->enc_key_size = [%zd]\n",
 				rc, key_rec->enc_key_size);
 		rc = -ENOMEM;
 		goto out;
@@ -2209,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat encrypted session key; "
 				"expected rc = 1; got rc = [%d]. "
-				"key_rec->enc_key_size = [%d]\n", rc,
+				"key_rec->enc_key_size = [%zd]\n", rc,
 				key_rec->enc_key_size);
 		rc = -ENOMEM;
 		goto out;
@@ -2224,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		goto out;
 	}
 	rc = 0;
-	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
+	ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
 			crypt_stat->key_size);
 	rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
 				      (*key_rec).enc_key_size);
@@ -2235,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	}
 	ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
 	if (ecryptfs_verbosity > 0) {
-		ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n",
+		ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n",
 				key_rec->enc_key_size);
 		ecryptfs_dump_hex(key_rec->enc_key,
 				  key_rec->enc_key_size);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9ed476906327..758323a0f09a 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,6 +36,7 @@
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
+#include <linux/magic.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -141,13 +142,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 	return rc;
 }
 
-static inode *ecryptfs_get_inode(struct inode *lower_inode,
+static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
 		       struct super_block *sb)
 {
 	struct inode *inode;
 	int rc = 0;
 
-	lower_inode = lower_dentry->d_inode;
 	if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
 		rc = -EXDEV;
 		goto out;
@@ -202,7 +202,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
 {
 	struct inode *lower_inode = lower_dentry->d_inode;
 	struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
-	if (IS_ERR(inode)
+	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
 		d_add(dentry, inode);
@@ -565,6 +565,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
 	s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
 	s->s_blocksize = path.dentry->d_sb->s_blocksize;
+	s->s_magic = ECRYPTFS_SUPER_MAGIC;
 
 	inode = ecryptfs_get_inode(path.dentry->d_inode, s);
 	rc = PTR_ERR(inode);
@@ -809,9 +810,10 @@ static int __init ecryptfs_init(void)
 		ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
 				"larger than the host's page size, and so "
 				"eCryptfs cannot run on this system. The "
-				"default eCryptfs extent size is [%d] bytes; "
-				"the page size is [%d] bytes.\n",
-				ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE);
+				"default eCryptfs extent size is [%u] bytes; "
+				"the page size is [%lu] bytes.\n",
+				ECRYPTFS_DEFAULT_EXTENT_SIZE,
+				(unsigned long)PAGE_CACHE_SIZE);
 		goto out;
 	}
 	rc = ecryptfs_init_kmem_caches();
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b1d82756544b..cc64fca89f8d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
 	rc = ecryptfs_encrypt_page(page);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error encrypting "
-				"page (upper index [0x%.16x])\n", page->index);
+				"page (upper index [0x%.16lx])\n", page->index);
 		ClearPageUptodate(page);
 		goto out;
 	}
@@ -237,7 +237,7 @@ out:
 		ClearPageUptodate(page);
 	else
 		SetPageUptodate(page);
-	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
+	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
 			page->index);
 	unlock_page(page);
 	return rc;
@@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file,
 		return -ENOMEM;
 	*pagep = page;
 
+	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
 	if (!PageUptodate(page)) {
 		struct ecryptfs_crypt_stat *crypt_stat =
 			&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
@@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file,
 				SetPageUptodate(page);
 			}
 		} else {
-			rc = ecryptfs_decrypt_page(page);
-			if (rc) {
-				printk(KERN_ERR "%s: Error decrypting page "
-				       "at index [%ld]; rc = [%d]\n",
-				       __func__, page->index, rc);
-				ClearPageUptodate(page);
-				goto out;
+			if (prev_page_end_size
+			    >= i_size_read(page->mapping->host)) {
+				zero_user(page, 0, PAGE_CACHE_SIZE);
+			} else {
+				rc = ecryptfs_decrypt_page(page);
+				if (rc) {
+					printk(KERN_ERR "%s: Error decrypting "
+					       "page at index [%ld]; "
+					       "rc = [%d]\n",
+					       __func__, page->index, rc);
+					ClearPageUptodate(page);
+					goto out;
+				}
 			}
 			SetPageUptodate(page);
 		}
 	}
-	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
 	/* If creating a page or more of holes, zero them out via truncate.
 	 * Note, this will increase i_size. */
 	if (index != 0) {
@@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file,
 	} else
 		ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
 	ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
-			"(page w/ index = [0x%.16x], to = [%d])\n", index, to);
+			"(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
 	if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
 		rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
 						       to);
@@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file,
 	rc = fill_zeros_to_end_of_page(page, to);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
-			"zeros in page with index = [0x%.16x]\n", index);
+			"zeros in page with index = [0x%.16lx]\n", index);
 		goto out;
 	}
 	rc = ecryptfs_encrypt_page(page);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
-				"index [0x%.16x])\n", index);
+				"index [0x%.16lx])\n", index);
 		goto out;
 	}
 	if (pos + copied > i_size_read(ecryptfs_inode)) {
 		i_size_write(ecryptfs_inode, pos + copied);
 		ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
-			"[0x%.16x]\n", i_size_read(ecryptfs_inode));
+			"[0x%.16llx]\n",
+			(unsigned long long)i_size_read(ecryptfs_inode));
 	}
 	rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
 	if (rc)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7d6064..267d0ada4541 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1114,6 +1114,17 @@ static int ep_send_events(struct eventpoll *ep,
 	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
 }
 
+static inline struct timespec ep_set_mstimeout(long ms)
+{
+	struct timespec now, ts = {
+		.tv_sec = ms / MSEC_PER_SEC,
+		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
+	};
+
+	ktime_get_ts(&now);
+	return timespec_add_safe(now, ts);
+}
+
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
@@ -1121,12 +1132,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	unsigned long flags;
 	long slack;
 	wait_queue_t wait;
-	struct timespec end_time;
 	ktime_t expires, *to = NULL;
 
 	if (timeout > 0) {
-		ktime_get_ts(&end_time);
-		timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
+		struct timespec end_time = ep_set_mstimeout(timeout);
+
 		slack = select_estimate_accuracy(&end_time);
 		to = &expires;
 		*to = timespec_to_ktime(end_time);
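
ep_set_mstimeout() exists because the old open-coded conversion multiplied the caller's millisecond timeout straight into nanoseconds, which can overflow for large values; splitting into seconds plus leftover nanoseconds and adding with timespec_add_safe() avoids that. A userspace sketch of the same conversion (the kernel's timespec_add_safe() also clamps on overflow; this sketch only normalizes tv_nsec):

	#include <stdio.h>
	#include <time.h>

	#define MSEC_PER_SEC	1000L
	#define NSEC_PER_MSEC	1000000L
	#define NSEC_PER_SEC	1000000000L

	static struct timespec set_mstimeout(long ms)
	{
		struct timespec now, ts = {
			.tv_sec = ms / MSEC_PER_SEC,
			.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
		};

		clock_gettime(CLOCK_MONOTONIC, &now);
		ts.tv_sec += now.tv_sec;
		ts.tv_nsec += now.tv_nsec;
		if (ts.tv_nsec >= NSEC_PER_SEC) {	/* carry into seconds */
			ts.tv_sec++;
			ts.tv_nsec -= NSEC_PER_SEC;
		}
		return ts;
	}

	int main(void)
	{
		struct timespec end = set_mstimeout(1500);

		printf("expires at %ld.%09ld\n", (long)end.tv_sec, end.tv_nsec);
		return 0;
	}
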
diff --git a/fs/exec.c b/fs/exec.c
index c62efcb959c7..52a447d9b6ab 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -120,7 +120,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 		goto out;
 
 	file = do_filp_open(AT_FDCWD, tmp,
-				O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
+				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
 				MAY_READ | MAY_EXEC | MAY_OPEN);
 	putname(tmp);
 	error = PTR_ERR(file);
@@ -723,7 +723,7 @@ struct file *open_exec(const char *name)
 	int err;
 
 	file = do_filp_open(AT_FDCWD, name,
-				O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
+				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
 				MAY_EXEC | MAY_OPEN);
 	if (IS_ERR(file))
 		goto out;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 42685424817b..a7555238c41a 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1030,7 +1030,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
 	}
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &exofs_file_inode_operations;
 		inode->i_fop = &exofs_file_operations;
@@ -1131,7 +1130,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	sbi = sb->s_fs_info;
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	sb->s_dirt = 1;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7aa767d4f06f..85c8cc8f2473 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -754,7 +754,7 @@ static int ext3_release_dquot(struct dquot *dquot);
 static int ext3_mark_dquot_dirty(struct dquot *dquot);
 static int ext3_write_info(struct super_block *sb, int type);
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-			 char *path);
+			 struct path *path);
 static int ext3_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off);
@@ -2877,27 +2877,20 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-			 char *name)
+			 struct path *path)
 {
 	int err;
-	struct path path;
 
 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
 
-	err = kern_path(name, LOOKUP_FOLLOW, &path);
-	if (err)
-		return err;
-
 	/* Quotafile not on the same filesystem? */
-	if (path.mnt->mnt_sb != sb) {
-		path_put(&path);
+	if (path->mnt->mnt_sb != sb)
 		return -EXDEV;
-	}
 	/* Journaling quota? */
 	if (EXT3_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
-		if (path.dentry->d_parent != sb->s_root)
+		if (path->dentry->d_parent != sb->s_root)
 			ext3_msg(sb, KERN_WARNING,
 				"warning: Quota file not on filesystem root. "
 				"Journaled quota will not work.");
@@ -2907,7 +2900,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2907 * When we journal data on quota file, we have to flush journal to see 2900 * When we journal data on quota file, we have to flush journal to see
2908 * all updates to the file when we bypass pagecache... 2901 * all updates to the file when we bypass pagecache...
2909 */ 2902 */
2910 if (ext3_should_journal_data(path.dentry->d_inode)) { 2903 if (ext3_should_journal_data(path->dentry->d_inode)) {
2911 /* 2904 /*
2912 * We don't need to lock updates but journal_flush() could 2905 * We don't need to lock updates but journal_flush() could
2913 * otherwise be livelocked... 2906 * otherwise be livelocked...
@@ -2915,15 +2908,11 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2915 journal_lock_updates(EXT3_SB(sb)->s_journal); 2908 journal_lock_updates(EXT3_SB(sb)->s_journal);
2916 err = journal_flush(EXT3_SB(sb)->s_journal); 2909 err = journal_flush(EXT3_SB(sb)->s_journal);
2917 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2910 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2918 if (err) { 2911 if (err)
2919 path_put(&path);
2920 return err; 2912 return err;
2921 }
2922 } 2913 }
2923 2914
2924 err = dquot_quota_on_path(sb, type, format_id, &path); 2915 return dquot_quota_on(sb, type, format_id, path);
2925 path_put(&path);
2926 return err;
2927} 2916}
2928 2917
2929/* Read data from quotafile - avoid pagecache and such because we cannot afford 2918/* Read data from quotafile - avoid pagecache and such because we cannot afford
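The ext3 hunk above changes the ->quota_on signature from a user-supplied name to a caller-resolved struct path, so the kern_path()/path_put() pair disappears from the filesystem and every early return loses its cleanup duty. A minimal userspace sketch of the same refactor — hoisting resource resolution out of the callee so its error paths need no cleanup — assuming nothing about the kernel API beyond what the diff shows:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Before: callee resolves (opens) the name itself and must close it
     * on every exit path -- the pattern being removed above. */
    static int check_file_old(const char *name)
    {
        int fd = open(name, O_RDONLY);
        if (fd < 0)
            return -1;
        /* ... validation that may fail ... */
        close(fd);                      /* needed on each return */
        return 0;
    }

    /* After: the caller resolves the name once and owns its lifetime;
     * the callee just validates and can return early freely. */
    static int check_file_new(int fd)
    {
        /* ... validation using fd, no cleanup duty ... */
        return fd >= 0 ? 0 : -1;
    }

    int main(void)
    {
        int fd = open("/etc/hostname", O_RDONLY);
        int err = check_file_new(fd);   /* caller-resolved handle */
        if (fd >= 0)
            close(fd);                  /* single cleanup site */
        printf("old=%d new=%d\n", check_file_old("/etc/hostname"), err);
        return 0;
    }

Centralizing the lookup in the VFS caller is what lets the hunk delete the duplicated path_put() calls on each error branch.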
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1de65f572033..3aa0b72b3b94 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
849 /* current io_end structure for async DIO write*/ 849 /* current io_end structure for async DIO write*/
850 ext4_io_end_t *cur_aio_dio; 850 ext4_io_end_t *cur_aio_dio;
851 atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
851 852
852 spinlock_t i_block_reservation_lock; 853 spinlock_t i_block_reservation_lock;
853 854
@@ -2065,7 +2066,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
2065extern void ext4_ext_truncate(struct inode *); 2066extern void ext4_ext_truncate(struct inode *);
2066extern void ext4_ext_init(struct super_block *); 2067extern void ext4_ext_init(struct super_block *);
2067extern void ext4_ext_release(struct super_block *); 2068extern void ext4_ext_release(struct super_block *);
2068extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 2069extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
2069 loff_t len); 2070 loff_t len);
2070extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 2071extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
2071 ssize_t len); 2072 ssize_t len);
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2119 2120
2120#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 2121#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2121 2122
2123/* For ioend & aio unwritten conversion wait queues */
2124#define EXT4_WQ_HASH_SZ 37
2125#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
2126 EXT4_WQ_HASH_SZ])
2127#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
2128 EXT4_WQ_HASH_SZ])
2129extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2130extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2131
2122#endif /* __KERNEL__ */ 2132#endif /* __KERNEL__ */
2123 2133
2124#endif /* _EXT4_H */ 2134#endif /* _EXT4_H */
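EXT4_WQ_HASH_SZ and the ext4_ioend_wq()/ext4_aio_mutex() macros above hash an inode pointer into a small fixed array, trading per-inode storage for occasional false sharing between inodes that collide. A runnable userspace sketch of the same idea using pthread mutexes (the 37-entry table size comes straight from the hunk; everything else is illustrative):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    #define WQ_HASH_SZ 37   /* small prime, matching the hunk above */

    static pthread_mutex_t lock_table[WQ_HASH_SZ];

    /* Map any object address to one of WQ_HASH_SZ shared locks, the way
     * ext4_ioend_wq()/ext4_aio_mutex() map an inode to a wait queue or
     * mutex: no per-object storage, at the cost of rare collisions. */
    static pthread_mutex_t *lock_for(const void *obj)
    {
        return &lock_table[(uintptr_t)obj % WQ_HASH_SZ];
    }

    int main(void)
    {
        int a, b;

        for (int i = 0; i < WQ_HASH_SZ; i++)
            pthread_mutex_init(&lock_table[i], NULL);

        pthread_mutex_lock(lock_for(&a));
        /* ... critical section for object 'a' ... */
        pthread_mutex_unlock(lock_for(&a));

        printf("a -> slot %zu, b -> slot %zu\n",
               (size_t)((uintptr_t)&a % WQ_HASH_SZ),
               (size_t)((uintptr_t)&b % WQ_HASH_SZ));
        return 0;
    }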
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 6b90b6825d32..686240e89df1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3174 * that this IO needs conversion to written when IO is 3174 * that this IO needs conversion to written when IO is
3175 * completed 3175 * completed
3176 */ 3176 */
3177 if (io) 3177 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3178 io->flag = EXT4_IO_END_UNWRITTEN; 3178 io->flag = EXT4_IO_END_UNWRITTEN;
3179 else 3179 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3180 } else
3180 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3181 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3181 if (ext4_should_dioread_nolock(inode)) 3182 if (ext4_should_dioread_nolock(inode))
3182 map->m_flags |= EXT4_MAP_UNINIT; 3183 map->m_flags |= EXT4_MAP_UNINIT;
@@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3463 * that we need to perform conversion when IO is done. 3464 * that we need to perform conversion when IO is done.
3464 */ 3465 */
3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3466 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3466 if (io) 3467 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3467 io->flag = EXT4_IO_END_UNWRITTEN; 3468 io->flag = EXT4_IO_END_UNWRITTEN;
3468 else 3469 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3470 } else
3469 ext4_set_inode_state(inode, 3471 ext4_set_inode_state(inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3472 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3473 }
@@ -3627,14 +3629,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
3627} 3629}
3628 3630
3629/* 3631/*
3630 * preallocate space for a file. This implements ext4's fallocate inode 3632 * preallocate space for a file. This implements ext4's fallocate file
3631 * operation, which gets called from the sys_fallocate system call. 3633 * operation, which gets called from the sys_fallocate system call.
3632 * For block-mapped files, posix_fallocate should fall back to the method 3634 * For block-mapped files, posix_fallocate should fall back to the method
3633 * of writing zeroes to the required new blocks (the same behavior which is 3635 * of writing zeroes to the required new blocks (the same behavior which is
3634 * expected for file systems which do not support fallocate() system call). 3636 * expected for file systems which do not support fallocate() system call).
3635 */ 3637 */
3636long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3638long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3637{ 3639{
3640 struct inode *inode = file->f_path.dentry->d_inode;
3638 handle_t *handle; 3641 handle_t *handle;
3639 loff_t new_size; 3642 loff_t new_size;
3640 unsigned int max_blocks; 3643 unsigned int max_blocks;
@@ -3645,7 +3648,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3645 unsigned int credits, blkbits = inode->i_blkbits; 3648 unsigned int credits, blkbits = inode->i_blkbits;
3646 3649
3647 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 3650 /* We only support the FALLOC_FL_KEEP_SIZE mode */
3648 if (mode && (mode != FALLOC_FL_KEEP_SIZE)) 3651 if (mode & ~FALLOC_FL_KEEP_SIZE)
3649 return -EOPNOTSUPP; 3652 return -EOPNOTSUPP;
3650 3653
3651 /* 3654 /*
@@ -3655,10 +3658,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3655 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3658 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3656 return -EOPNOTSUPP; 3659 return -EOPNOTSUPP;
3657 3660
3658 /* preallocation to directories is currently not supported */
3659 if (S_ISDIR(inode->i_mode))
3660 return -ENODEV;
3661
3662 map.m_lblk = offset >> blkbits; 3661 map.m_lblk = offset >> blkbits;
3663 /* 3662 /*
3664 * We can't just convert len to max_blocks because 3663 * We can't just convert len to max_blocks because
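Both extents.c hunks above replace an unconditional flag assignment with a test-before-set, so the new i_aiodio_unwritten counter is incremented at most once per io_end no matter how often the block-mapping path is re-entered. A hedged C11 sketch of that idempotent flag-plus-counter pairing (all names invented; the kernel relies on higher-level serialization for the flag itself, and this sketch is single-threaded):

    #include <stdatomic.h>
    #include <stdio.h>

    #define END_UNWRITTEN 0x1

    struct io_end { unsigned flag; };
    static atomic_int inflight_unwritten;  /* i_aiodio_unwritten analogue */

    /* Increment the counter only on the flag's 0 -> 1 transition, so
     * calling this twice for the same io_end cannot double-count. */
    static void mark_unwritten(struct io_end *io)
    {
        if (!(io->flag & END_UNWRITTEN)) {
            io->flag |= END_UNWRITTEN;
            atomic_fetch_add(&inflight_unwritten, 1);
        }
    }

    int main(void)
    {
        struct io_end io = { 0 };

        mark_unwritten(&io);
        mark_unwritten(&io);    /* re-entry: no second increment */
        printf("inflight = %d\n", atomic_load(&inflight_unwritten)); /* 1 */
        return 0;
    }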
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bb003dc9ffff..7b80d543b89e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
55 return 0; 55 return 0;
56} 56}
57 57
58static void ext4_aiodio_wait(struct inode *inode)
59{
60 wait_queue_head_t *wq = ext4_ioend_wq(inode);
61
62 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
63}
64
65/*
66 * This tests whether the IO in question is block-aligned or not.
67 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
68 * are converted to written only after the IO is complete. Until they are
69 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
70 * it needs to zero out portions of the start and/or end block. If 2 AIO
71 * threads are at work on the same unwritten block, they must be synchronized
72 * or one thread will zero the other's data, causing corruption.
73 */
74static int
75ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
76 unsigned long nr_segs, loff_t pos)
77{
78 struct super_block *sb = inode->i_sb;
79 int blockmask = sb->s_blocksize - 1;
80 size_t count = iov_length(iov, nr_segs);
81 loff_t final_size = pos + count;
82
83 if (pos >= inode->i_size)
84 return 0;
85
86 if ((pos & blockmask) || (final_size & blockmask))
87 return 1;
88
89 return 0;
90}
91
58static ssize_t 92static ssize_t
59ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 93ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
60 unsigned long nr_segs, loff_t pos) 94 unsigned long nr_segs, loff_t pos)
61{ 95{
62 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 96 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
97 int unaligned_aio = 0;
98 int ret;
63 99
64 /* 100 /*
65 * If we have encountered a bitmap-format file, the size limit 101 * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
78 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 114 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
79 sbi->s_bitmap_maxbytes - pos); 115 sbi->s_bitmap_maxbytes - pos);
80 } 116 }
117 } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
118 !is_sync_kiocb(iocb))) {
119 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
120 }
121
122 /* Unaligned direct AIO must be serialized; see comment above */
123 if (unaligned_aio) {
124 static unsigned long unaligned_warn_time;
125
126 /* Warn about this once per day */
127 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
128 ext4_msg(inode->i_sb, KERN_WARNING,
129 "Unaligned AIO/DIO on inode %ld by %s; "
130 "performance will be poor.",
131 inode->i_ino, current->comm);
132 mutex_lock(ext4_aio_mutex(inode));
133 ext4_aiodio_wait(inode);
81 } 134 }
82 135
83 return generic_file_aio_write(iocb, iov, nr_segs, pos); 136 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
137
138 if (unaligned_aio)
139 mutex_unlock(ext4_aio_mutex(inode));
140
141 return ret;
84} 142}
85 143
86static const struct vm_operations_struct ext4_file_vm_ops = { 144static const struct vm_operations_struct ext4_file_vm_ops = {
@@ -210,6 +268,7 @@ const struct file_operations ext4_file_operations = {
210 .fsync = ext4_sync_file, 268 .fsync = ext4_sync_file,
211 .splice_read = generic_file_splice_read, 269 .splice_read = generic_file_splice_read,
212 .splice_write = generic_file_splice_write, 270 .splice_write = generic_file_splice_write,
271 .fallocate = ext4_fallocate,
213}; 272};
214 273
215const struct inode_operations ext4_file_inode_operations = { 274const struct inode_operations ext4_file_inode_operations = {
@@ -223,7 +282,6 @@ const struct inode_operations ext4_file_inode_operations = {
223 .removexattr = generic_removexattr, 282 .removexattr = generic_removexattr,
224#endif 283#endif
225 .check_acl = ext4_check_acl, 284 .check_acl = ext4_check_acl,
226 .fallocate = ext4_fallocate,
227 .fiemap = ext4_fiemap, 285 .fiemap = ext4_fiemap,
228}; 286};
229 287
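ext4_unaligned_aio() above flags a write whose start or end does not fall on a block boundary, since two async direct writers inside the same unwritten block could each zero the other's half. A standalone C version of the same predicate (block size and offsets are made up for the demo):

    #include <stdio.h>

    /* Returns 1 if [pos, pos+count) does not start and end on a block
     * boundary -- the condition ext4 uses above to force serialization
     * of AIO that could share an unwritten block. */
    static int unaligned_io(long long pos, long long count,
                            unsigned blocksize)
    {
        unsigned long long mask = blocksize - 1; /* power-of-2 blocksize */

        return ((pos & mask) || ((pos + count) & mask)) ? 1 : 0;
    }

    int main(void)
    {
        printf("%d\n", unaligned_io(4096, 4096, 4096)); /* 0: aligned */
        printf("%d\n", unaligned_io(4096, 1000, 4096)); /* 1: ragged end */
        printf("%d\n", unaligned_io(512, 4096, 4096));  /* 1: ragged start */
        return 0;
    }

Note the pos >= i_size check in the hunk: appends entirely past EOF skip serialization because there is no pre-existing block for another writer to share.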
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 851f49b2f9d2..d1fe09aea73d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_cachep;
342/* We create slab caches for groupinfo data structures based on the 342/* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There will be one per mounted filesystem for 343 * superblock block size. There will be one per mounted filesystem for
344 * each unique s_blocksize_bits */ 344 * each unique s_blocksize_bits */
345#define NR_GRPINFO_CACHES \ 345#define NR_GRPINFO_CACHES 8
346 (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
347static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; 346static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
348 347
348static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
349 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
350 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
351 "ext4_groupinfo_64k", "ext4_groupinfo_128k"
352};
353
349static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 354static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
350 ext4_group_t group); 355 ext4_group_t group);
351static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 356static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -2414,6 +2419,55 @@ err_freesgi:
2414 return -ENOMEM; 2419 return -ENOMEM;
2415} 2420}
2416 2421
2422static void ext4_groupinfo_destroy_slabs(void)
2423{
2424 int i;
2425
2426 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2427 if (ext4_groupinfo_caches[i])
2428 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2429 ext4_groupinfo_caches[i] = NULL;
2430 }
2431}
2432
2433static int ext4_groupinfo_create_slab(size_t size)
2434{
2435 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2436 int slab_size;
2437 int blocksize_bits = order_base_2(size);
2438 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2439 struct kmem_cache *cachep;
2440
2441 if (cache_index >= NR_GRPINFO_CACHES)
2442 return -EINVAL;
2443
2444 if (unlikely(cache_index < 0))
2445 cache_index = 0;
2446
2447 mutex_lock(&ext4_grpinfo_slab_create_mutex);
2448 if (ext4_groupinfo_caches[cache_index]) {
2449 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2450 return 0; /* Already created */
2451 }
2452
2453 slab_size = offsetof(struct ext4_group_info,
2454 bb_counters[blocksize_bits + 2]);
2455
2456 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2457 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2458 NULL);
2459
2460 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2461 if (!cachep) {
2462 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
2463 return -ENOMEM;
2464 }
2465
2466 ext4_groupinfo_caches[cache_index] = cachep;
2467
2468 return 0;
2469}
2470
2417int ext4_mb_init(struct super_block *sb, int needs_recovery) 2471int ext4_mb_init(struct super_block *sb, int needs_recovery)
2418{ 2472{
2419 struct ext4_sb_info *sbi = EXT4_SB(sb); 2473 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2421,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2421 unsigned offset; 2475 unsigned offset;
2422 unsigned max; 2476 unsigned max;
2423 int ret; 2477 int ret;
2424 int cache_index;
2425 struct kmem_cache *cachep;
2426 char *namep = NULL;
2427 2478
2428 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2479 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2429 2480
@@ -2440,30 +2491,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2440 goto out; 2491 goto out;
2441 } 2492 }
2442 2493
2443 cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2494 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2444 cachep = ext4_groupinfo_caches[cache_index]; 2495 if (ret < 0)
2445 if (!cachep) { 2496 goto out;
2446 char name[32];
2447 int len = offsetof(struct ext4_group_info,
2448 bb_counters[sb->s_blocksize_bits + 2]);
2449
2450 sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
2451 namep = kstrdup(name, GFP_KERNEL);
2452 if (!namep) {
2453 ret = -ENOMEM;
2454 goto out;
2455 }
2456
2457 /* Need to free the kmem_cache_name() when we
2458 * destroy the slab */
2459 cachep = kmem_cache_create(namep, len, 0,
2460 SLAB_RECLAIM_ACCOUNT, NULL);
2461 if (!cachep) {
2462 ret = -ENOMEM;
2463 goto out;
2464 }
2465 ext4_groupinfo_caches[cache_index] = cachep;
2466 }
2467 2497
2468 /* order 0 is regular bitmap */ 2498 /* order 0 is regular bitmap */
2469 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2499 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
@@ -2520,7 +2550,6 @@ out:
2520 if (ret) { 2550 if (ret) {
2521 kfree(sbi->s_mb_offsets); 2551 kfree(sbi->s_mb_offsets);
2522 kfree(sbi->s_mb_maxs); 2552 kfree(sbi->s_mb_maxs);
2523 kfree(namep);
2524 } 2553 }
2525 return ret; 2554 return ret;
2526} 2555}
@@ -2734,7 +2763,6 @@ int __init ext4_init_mballoc(void)
2734 2763
2735void ext4_exit_mballoc(void) 2764void ext4_exit_mballoc(void)
2736{ 2765{
2737 int i;
2738 /* 2766 /*
2739 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2767 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2740 * before destroying the slab cache. 2768 * before destroying the slab cache.
@@ -2743,15 +2771,7 @@ void ext4_exit_mballoc(void)
2743 kmem_cache_destroy(ext4_pspace_cachep); 2771 kmem_cache_destroy(ext4_pspace_cachep);
2744 kmem_cache_destroy(ext4_ac_cachep); 2772 kmem_cache_destroy(ext4_ac_cachep);
2745 kmem_cache_destroy(ext4_free_ext_cachep); 2773 kmem_cache_destroy(ext4_free_ext_cachep);
2746 2774 ext4_groupinfo_destroy_slabs();
2747 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2748 struct kmem_cache *cachep = ext4_groupinfo_caches[i];
2749 if (cachep) {
2750 char *name = (char *)kmem_cache_name(cachep);
2751 kmem_cache_destroy(cachep);
2752 kfree(name);
2753 }
2754 }
2755 ext4_remove_debugfs_entry(); 2775 ext4_remove_debugfs_entry();
2756} 2776}
2757 2777
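ext4_groupinfo_create_slab() above sizes each cache with offsetof(struct ext4_group_info, bb_counters[blocksize_bits + 2]): the fixed header plus exactly the trailing counters a given block size needs. A runnable sketch of sizing a trailing-array allocation that way (the struct fields here are invented):

    #include <stdio.h>
    #include <stddef.h>
    #include <stdlib.h>

    struct group_info {
        unsigned long free_blocks;
        unsigned short counters[];   /* flexible array member */
    };

    int main(void)
    {
        for (int bits = 10; bits <= 16; bits++) {  /* 1k .. 64k blocks */
            /* Same trick as the hunk: offsetof() through the flexible
             * array yields header size + n trailing elements. A
             * non-constant index in offsetof is a GCC/Clang extension
             * that the kernel relies on here. */
            size_t sz = offsetof(struct group_info, counters[bits + 2]);
            struct group_info *gi = malloc(sz);

            printf("blocksize_bits=%d -> %zu bytes\n", bits, sz);
            free(gi);
        }
        return 0;
    }

Precomputing the eight slab names in a static table is what lets the hunk drop the kstrdup()/kmem_cache_name() bookkeeping that the old code had to free at exit.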
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7270dcfca92a..955cc309142f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
32 32
33static struct kmem_cache *io_page_cachep, *io_end_cachep; 33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34 34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
39int __init ext4_init_pageio(void) 35int __init ext4_init_pageio(void)
40{ 36{
41 int i;
42
43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
44 if (io_page_cachep == NULL) 38 if (io_page_cachep == NULL)
45 return -ENOMEM; 39 return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
48 kmem_cache_destroy(io_page_cachep); 42 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM; 43 return -ENOMEM;
50 } 44 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
53
54 return 0; 45 return 0;
55} 46}
56 47
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
62 53
63void ext4_ioend_wait(struct inode *inode) 54void ext4_ioend_wait(struct inode *inode)
64{ 55{
65 wait_queue_head_t *wq = to_ioend_wq(inode); 56 wait_queue_head_t *wq = ext4_ioend_wq(inode);
66 57
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 58 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68} 59}
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
87 for (i = 0; i < io->num_io_pages; i++) 78 for (i = 0; i < io->num_io_pages; i++)
88 put_io_page(io->pages[i]); 79 put_io_page(io->pages[i]);
89 io->num_io_pages = 0; 80 io->num_io_pages = 0;
90 wq = to_ioend_wq(io->inode); 81 wq = ext4_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && 82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq)) 83 waitqueue_active(wq))
93 wake_up_all(wq); 84 wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
102 struct inode *inode = io->inode; 93 struct inode *inode = io->inode;
103 loff_t offset = io->offset; 94 loff_t offset = io->offset;
104 ssize_t size = io->size; 95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
105 int ret = 0; 97 int ret = 0;
106 98
107 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
126 if (io->iocb) 118 if (io->iocb)
127 aio_complete(io->iocb, io->result, 0); 119 aio_complete(io->iocb, io->result, 0);
128 /* clear the DIO AIO unwritten flag */ 120 /* clear the DIO AIO unwritten flag */
129 io->flag &= ~EXT4_IO_END_UNWRITTEN; 121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130
130 return ret; 131 return ret;
131} 132}
132 133
@@ -190,6 +191,7 @@ static void ext4_end_bio(struct bio *bio, int error)
190 struct inode *inode; 191 struct inode *inode;
191 unsigned long flags; 192 unsigned long flags;
192 int i; 193 int i;
194 sector_t bi_sector = bio->bi_sector;
193 195
194 BUG_ON(!io_end); 196 BUG_ON(!io_end);
195 bio->bi_private = NULL; 197 bio->bi_private = NULL;
@@ -207,9 +209,7 @@ static void ext4_end_bio(struct bio *bio, int error)
207 if (error) 209 if (error)
208 SetPageError(page); 210 SetPageError(page);
209 BUG_ON(!head); 211 BUG_ON(!head);
210 if (head->b_size == PAGE_CACHE_SIZE) 212 if (head->b_size != PAGE_CACHE_SIZE) {
211 clear_buffer_dirty(head);
212 else {
213 loff_t offset; 213 loff_t offset;
214 loff_t io_end_offset = io_end->offset + io_end->size; 214 loff_t io_end_offset = io_end->offset + io_end->size;
215 215
@@ -221,7 +221,6 @@ static void ext4_end_bio(struct bio *bio, int error)
221 if (error) 221 if (error)
222 buffer_io_error(bh); 222 buffer_io_error(bh);
223 223
224 clear_buffer_dirty(bh);
225 } 224 }
226 if (buffer_delay(bh)) 225 if (buffer_delay(bh))
227 partial_write = 1; 226 partial_write = 1;
@@ -257,7 +256,7 @@ static void ext4_end_bio(struct bio *bio, int error)
257 (unsigned long long) io_end->offset, 256 (unsigned long long) io_end->offset,
258 (long) io_end->size, 257 (long) io_end->size,
259 (unsigned long long) 258 (unsigned long long)
260 bio->bi_sector >> (inode->i_blkbits - 9)); 259 bi_sector >> (inode->i_blkbits - 9));
261 } 260 }
262 261
263 /* Add the io_end to per-inode completed io list*/ 262 /* Add the io_end to per-inode completed io list*/
@@ -380,6 +379,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
380 379
381 blocksize = 1 << inode->i_blkbits; 380 blocksize = 1 << inode->i_blkbits;
382 381
382 BUG_ON(!PageLocked(page));
383 BUG_ON(PageWriteback(page)); 383 BUG_ON(PageWriteback(page));
384 set_page_writeback(page); 384 set_page_writeback(page);
385 ClearPageError(page); 385 ClearPageError(page);
@@ -397,12 +397,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
397 for (bh = head = page_buffers(page), block_start = 0; 397 for (bh = head = page_buffers(page), block_start = 0;
398 bh != head || !block_start; 398 bh != head || !block_start;
399 block_start = block_end, bh = bh->b_this_page) { 399 block_start = block_end, bh = bh->b_this_page) {
400
400 block_end = block_start + blocksize; 401 block_end = block_start + blocksize;
401 if (block_start >= len) { 402 if (block_start >= len) {
402 clear_buffer_dirty(bh); 403 clear_buffer_dirty(bh);
403 set_buffer_uptodate(bh); 404 set_buffer_uptodate(bh);
404 continue; 405 continue;
405 } 406 }
407 clear_buffer_dirty(bh);
406 ret = io_submit_add_bh(io, io_page, inode, wbc, bh); 408 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
407 if (ret) { 409 if (ret) {
408 /* 410 /*
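The page-io hunks pair each conversion-complete decrement with a wake-up on the hashed wait queue, so ext4_aiodio_wait() sleeps until i_aiodio_unwritten reaches zero. A userspace analogue with a mutex/condvar standing in for the wait queue (all names invented):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
    static int inflight = 2;            /* i_aiodio_unwritten analogue */

    /* Completion side: decrement, and wake waiters only when the count
     * hits zero -- mirroring atomic_dec_and_test() + wake_up_all(). */
    static void *complete_io(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lk);
        if (--inflight == 0)
            pthread_cond_broadcast(&cv);
        pthread_mutex_unlock(&lk);
        return NULL;
    }

    int main(void)
    {
        pthread_t t1, t2;

        pthread_create(&t1, NULL, complete_io, NULL);
        pthread_create(&t2, NULL, complete_io, NULL);

        /* Waiter side: the ext4_aiodio_wait() analogue. */
        pthread_mutex_lock(&lk);
        while (inflight != 0)
            pthread_cond_wait(&cv, &lk);
        pthread_mutex_unlock(&lk);

        printf("all conversions drained\n");
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
    }

The kernel version additionally gates the wake-up on waitqueue_active() to skip the call when nobody sleeps; the condvar sketch pays the broadcast unconditionally.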
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cb10a06775e4..f6a318f836b2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -77,6 +77,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 77 const char *dev_name, void *data);
78static void ext4_destroy_lazyinit_thread(void); 78static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb); 79static void ext4_unregister_li_request(struct super_block *sb);
80static void ext4_clear_request_list(void);
80 81
81#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 82#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
82static struct file_system_type ext3_fs_type = { 83static struct file_system_type ext3_fs_type = {
@@ -832,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
832 ei->i_sync_tid = 0; 833 ei->i_sync_tid = 0;
833 ei->i_datasync_tid = 0; 834 ei->i_datasync_tid = 0;
834 atomic_set(&ei->i_ioend_count, 0); 835 atomic_set(&ei->i_ioend_count, 0);
836 atomic_set(&ei->i_aiodio_unwritten, 0);
835 837
836 return &ei->vfs_inode; 838 return &ei->vfs_inode;
837} 839}
@@ -1161,7 +1163,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1161static int ext4_mark_dquot_dirty(struct dquot *dquot); 1163static int ext4_mark_dquot_dirty(struct dquot *dquot);
1162static int ext4_write_info(struct super_block *sb, int type); 1164static int ext4_write_info(struct super_block *sb, int type);
1163static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1165static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1164 char *path); 1166 struct path *path);
1165static int ext4_quota_off(struct super_block *sb, int type); 1167static int ext4_quota_off(struct super_block *sb, int type);
1166static int ext4_quota_on_mount(struct super_block *sb, int type); 1168static int ext4_quota_on_mount(struct super_block *sb, int type);
1167static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1169static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -2716,6 +2718,8 @@ static void ext4_unregister_li_request(struct super_block *sb)
2716 mutex_unlock(&ext4_li_info->li_list_mtx); 2718 mutex_unlock(&ext4_li_info->li_list_mtx);
2717} 2719}
2718 2720
2721static struct task_struct *ext4_lazyinit_task;
2722
2719/* 2723/*
2720 * This is the function where ext4lazyinit thread lives. It walks 2724 * This is the function where ext4lazyinit thread lives. It walks
2721 * through the request list searching for next scheduled filesystem. 2725 * through the request list searching for next scheduled filesystem.
@@ -2784,6 +2788,10 @@ cont_thread:
2784 if (time_before(jiffies, next_wakeup)) 2788 if (time_before(jiffies, next_wakeup))
2785 schedule(); 2789 schedule();
2786 finish_wait(&eli->li_wait_daemon, &wait); 2790 finish_wait(&eli->li_wait_daemon, &wait);
2791 if (kthread_should_stop()) {
2792 ext4_clear_request_list();
2793 goto exit_thread;
2794 }
2787 } 2795 }
2788 2796
2789exit_thread: 2797exit_thread:
@@ -2808,6 +2816,7 @@ exit_thread:
2808 wake_up(&eli->li_wait_task); 2816 wake_up(&eli->li_wait_task);
2809 2817
2810 kfree(ext4_li_info); 2818 kfree(ext4_li_info);
2819 ext4_lazyinit_task = NULL;
2811 ext4_li_info = NULL; 2820 ext4_li_info = NULL;
2812 mutex_unlock(&ext4_li_mtx); 2821 mutex_unlock(&ext4_li_mtx);
2813 2822
@@ -2830,11 +2839,10 @@ static void ext4_clear_request_list(void)
2830 2839
2831static int ext4_run_lazyinit_thread(void) 2840static int ext4_run_lazyinit_thread(void)
2832{ 2841{
2833 struct task_struct *t; 2842 ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
2834 2843 ext4_li_info, "ext4lazyinit");
2835 t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); 2844 if (IS_ERR(ext4_lazyinit_task)) {
2836 if (IS_ERR(t)) { 2845 int err = PTR_ERR(ext4_lazyinit_task);
2837 int err = PTR_ERR(t);
2838 ext4_clear_request_list(); 2846 ext4_clear_request_list();
2839 del_timer_sync(&ext4_li_info->li_timer); 2847 del_timer_sync(&ext4_li_info->li_timer);
2840 kfree(ext4_li_info); 2848 kfree(ext4_li_info);
@@ -2985,16 +2993,10 @@ static void ext4_destroy_lazyinit_thread(void)
2985 * If thread exited earlier 2993 * If thread exited earlier
2986 * there's nothing to be done. 2994 * there's nothing to be done.
2987 */ 2995 */
2988 if (!ext4_li_info) 2996 if (!ext4_li_info || !ext4_lazyinit_task)
2989 return; 2997 return;
2990 2998
2991 ext4_clear_request_list(); 2999 kthread_stop(ext4_lazyinit_task);
2992
2993 while (ext4_li_info->li_task) {
2994 wake_up(&ext4_li_info->li_wait_daemon);
2995 wait_event(ext4_li_info->li_wait_task,
2996 ext4_li_info->li_task == NULL);
2997 }
2998} 3000}
2999 3001
3000static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3002static int ext4_fill_super(struct super_block *sb, void *data, int silent)
@@ -4558,27 +4560,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
4558 * Standard function to be called on quota_on 4560 * Standard function to be called on quota_on
4559 */ 4561 */
4560static int ext4_quota_on(struct super_block *sb, int type, int format_id, 4562static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4561 char *name) 4563 struct path *path)
4562{ 4564{
4563 int err; 4565 int err;
4564 struct path path;
4565 4566
4566 if (!test_opt(sb, QUOTA)) 4567 if (!test_opt(sb, QUOTA))
4567 return -EINVAL; 4568 return -EINVAL;
4568 4569
4569 err = kern_path(name, LOOKUP_FOLLOW, &path);
4570 if (err)
4571 return err;
4572
4573 /* Quotafile not on the same filesystem? */ 4570 /* Quotafile not on the same filesystem? */
4574 if (path.mnt->mnt_sb != sb) { 4571 if (path->mnt->mnt_sb != sb)
4575 path_put(&path);
4576 return -EXDEV; 4572 return -EXDEV;
4577 }
4578 /* Journaling quota? */ 4573 /* Journaling quota? */
4579 if (EXT4_SB(sb)->s_qf_names[type]) { 4574 if (EXT4_SB(sb)->s_qf_names[type]) {
4580 /* Quotafile not in fs root? */ 4575 /* Quotafile not in fs root? */
4581 if (path.dentry->d_parent != sb->s_root) 4576 if (path->dentry->d_parent != sb->s_root)
4582 ext4_msg(sb, KERN_WARNING, 4577 ext4_msg(sb, KERN_WARNING,
4583 "Quota file not on filesystem root. " 4578 "Quota file not on filesystem root. "
4584 "Journaled quota will not work"); 4579 "Journaled quota will not work");
@@ -4589,7 +4584,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4589 * all updates to the file when we bypass pagecache... 4584 * all updates to the file when we bypass pagecache...
4590 */ 4585 */
4591 if (EXT4_SB(sb)->s_journal && 4586 if (EXT4_SB(sb)->s_journal &&
4592 ext4_should_journal_data(path.dentry->d_inode)) { 4587 ext4_should_journal_data(path->dentry->d_inode)) {
4593 /* 4588 /*
4594 * We don't need to lock updates but journal_flush() could 4589 * We don't need to lock updates but journal_flush() could
4595 * otherwise be livelocked... 4590 * otherwise be livelocked...
@@ -4597,15 +4592,11 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4597 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 4592 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
4598 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 4593 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
4599 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4594 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4600 if (err) { 4595 if (err)
4601 path_put(&path);
4602 return err; 4596 return err;
4603 }
4604 } 4597 }
4605 4598
4606 err = dquot_quota_on_path(sb, type, format_id, &path); 4599 return dquot_quota_on(sb, type, format_id, path);
4607 path_put(&path);
4608 return err;
4609} 4600}
4610 4601
4611static int ext4_quota_off(struct super_block *sb, int type) 4602static int ext4_quota_off(struct super_block *sb, int type)
@@ -4779,7 +4770,7 @@ static struct file_system_type ext4_fs_type = {
4779 .fs_flags = FS_REQUIRES_DEV, 4770 .fs_flags = FS_REQUIRES_DEV,
4780}; 4771};
4781 4772
4782int __init ext4_init_feat_adverts(void) 4773static int __init ext4_init_feat_adverts(void)
4783{ 4774{
4784 struct ext4_features *ef; 4775 struct ext4_features *ef;
4785 int ret = -ENOMEM; 4776 int ret = -ENOMEM;
@@ -4803,23 +4794,44 @@ out:
4803 return ret; 4794 return ret;
4804} 4795}
4805 4796
4797static void ext4_exit_feat_adverts(void)
4798{
4799 kobject_put(&ext4_feat->f_kobj);
4800 wait_for_completion(&ext4_feat->f_kobj_unregister);
4801 kfree(ext4_feat);
4802}
4803
4804/* Shared across all ext4 file systems */
4805wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
4806struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
4807
4806static int __init ext4_init_fs(void) 4808static int __init ext4_init_fs(void)
4807{ 4809{
4808 int err; 4810 int i, err;
4809 4811
4810 ext4_check_flag_values(); 4812 ext4_check_flag_values();
4813
4814 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
4815 mutex_init(&ext4__aio_mutex[i]);
4816 init_waitqueue_head(&ext4__ioend_wq[i]);
4817 }
4818
4811 err = ext4_init_pageio(); 4819 err = ext4_init_pageio();
4812 if (err) 4820 if (err)
4813 return err; 4821 return err;
4814 err = ext4_init_system_zone(); 4822 err = ext4_init_system_zone();
4815 if (err) 4823 if (err)
4816 goto out5; 4824 goto out7;
4817 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4825 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4818 if (!ext4_kset) 4826 if (!ext4_kset)
4819 goto out4; 4827 goto out6;
4820 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4828 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4829 if (!ext4_proc_root)
4830 goto out5;
4821 4831
4822 err = ext4_init_feat_adverts(); 4832 err = ext4_init_feat_adverts();
4833 if (err)
4834 goto out4;
4823 4835
4824 err = ext4_init_mballoc(); 4836 err = ext4_init_mballoc();
4825 if (err) 4837 if (err)
@@ -4849,12 +4861,14 @@ out1:
4849out2: 4861out2:
4850 ext4_exit_mballoc(); 4862 ext4_exit_mballoc();
4851out3: 4863out3:
4852 kfree(ext4_feat); 4864 ext4_exit_feat_adverts();
4865out4:
4853 remove_proc_entry("fs/ext4", NULL); 4866 remove_proc_entry("fs/ext4", NULL);
4867out5:
4854 kset_unregister(ext4_kset); 4868 kset_unregister(ext4_kset);
4855out4: 4869out6:
4856 ext4_exit_system_zone(); 4870 ext4_exit_system_zone();
4857out5: 4871out7:
4858 ext4_exit_pageio(); 4872 ext4_exit_pageio();
4859 return err; 4873 return err;
4860} 4874}
@@ -4868,6 +4882,7 @@ static void __exit ext4_exit_fs(void)
4868 destroy_inodecache(); 4882 destroy_inodecache();
4869 ext4_exit_xattr(); 4883 ext4_exit_xattr();
4870 ext4_exit_mballoc(); 4884 ext4_exit_mballoc();
4885 ext4_exit_feat_adverts();
4871 remove_proc_entry("fs/ext4", NULL); 4886 remove_proc_entry("fs/ext4", NULL);
4872 kset_unregister(ext4_kset); 4887 kset_unregister(ext4_kset);
4873 ext4_exit_system_zone(); 4888 ext4_exit_system_zone();
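The renumbered out4..out7 labels in ext4_init_fs() above are the standard kernel unwind ladder: each successful init step pushes one more teardown onto the error path, and inserting steps (here the proc entry and feature adverts) renumbers every label below them. A compact userspace rendition of the pattern (the resources are stand-ins):

    #include <stdio.h>
    #include <stdlib.h>

    static int init_subsystems(void)
    {
        char *a, *b, *c;

        a = malloc(16);                 /* step 1 */
        if (!a)
            goto out;
        b = malloc(16);                 /* step 2 */
        if (!b)
            goto out_a;
        c = malloc(16);                 /* step 3 */
        if (!c)
            goto out_b;

        printf("all subsystems up\n");
        free(c); free(b); free(a);      /* normal teardown elsewhere */
        return 0;

        /* Unwind in strict reverse order of initialization, exactly one
         * label per completed step -- the out4..out7 ladder in
         * miniature. */
    out_b:
        free(b);
    out_a:
        free(a);
    out:
        return -1;
    }

    int main(void) { return init_subsystems() ? 1 : 0; }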
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ecc8b3954ed6..cb1026181bdc 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -815,7 +815,7 @@ static int __init fcntl_init(void)
815 __O_SYNC | O_DSYNC | FASYNC | 815 __O_SYNC | O_DSYNC | FASYNC |
816 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 816 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
818 FMODE_EXEC 818 __FMODE_EXEC
819 )); 819 ));
820 820
821 fasync_cache = kmem_cache_create("fasync_cache", 821 fasync_cache = kmem_cache_create("fasync_cache",
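The fcntl hunk swaps FMODE_EXEC for __FMODE_EXEC inside what is, in mainline, a build-time assertion that every fcntl O_* bit stays unique (the enclosing check sits outside this excerpt). The same kind of compile-time disjointness check in portable C11, with made-up flag values:

    #include <stdio.h>

    /* Made-up flag values for the demo. */
    #define F_READ   0x1
    #define F_WRITE  0x2
    #define F_EXEC   0x4

    /* If any two flags shared a bit, one of these ANDs would be
     * nonzero and the build would fail -- the same guarantee the
     * kernel wants for its O_* and __FMODE_* bits. */
    _Static_assert((F_READ & F_WRITE) == 0 &&
                   (F_READ & F_EXEC) == 0 &&
                   (F_WRITE & F_EXEC) == 0,
                   "flag bits must not overlap");

    int main(void)
    {
        printf("flag mask = 0x%x\n", F_READ | F_WRITE | F_EXEC);
        return 0;
    }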
diff --git a/fs/file_table.c b/fs/file_table.c
index c3dee381f1b4..eb36b6b17e26 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -125,13 +125,13 @@ struct file *get_empty_filp(void)
125 goto fail; 125 goto fail;
126 126
127 percpu_counter_inc(&nr_files); 127 percpu_counter_inc(&nr_files);
128 f->f_cred = get_cred(cred);
128 if (security_file_alloc(f)) 129 if (security_file_alloc(f))
129 goto fail_sec; 130 goto fail_sec;
130 131
131 INIT_LIST_HEAD(&f->f_u.fu_list); 132 INIT_LIST_HEAD(&f->f_u.fu_list);
132 atomic_long_set(&f->f_count, 1); 133 atomic_long_set(&f->f_count, 1);
133 rwlock_init(&f->f_owner.lock); 134 rwlock_init(&f->f_owner.lock);
134 f->f_cred = get_cred(cred);
135 spin_lock_init(&f->f_lock); 135 spin_lock_init(&f->f_lock);
136 eventpoll_init_file(f); 136 eventpoll_init_file(f);
137 /* f->f_version: 0 */ 137 /* f->f_version: 0 */
@@ -311,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
311 struct files_struct *files = current->files; 311 struct files_struct *files = current->files;
312 312
313 *fput_needed = 0; 313 *fput_needed = 0;
314 if (likely((atomic_read(&files->count) == 1))) { 314 if (atomic_read(&files->count) == 1) {
315 file = fcheck_files(files, fd); 315 file = fcheck_files(files, fd);
316 } else { 316 } else {
317 rcu_read_lock(); 317 rcu_read_lock();
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d06ccc953aa..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
84 return list_entry(head, struct inode, i_wb_list); 84 return list_entry(head, struct inode, i_wb_list);
85} 85}
86 86
87static void bdi_queue_work(struct backing_dev_info *bdi, 87/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
88 struct wb_writeback_work *work) 88static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
89{ 89{
90 trace_writeback_queue(bdi, work);
91
92 spin_lock_bh(&bdi->wb_lock);
93 list_add_tail(&work->list, &bdi->work_list);
94 if (bdi->wb.task) { 90 if (bdi->wb.task) {
95 wake_up_process(bdi->wb.task); 91 wake_up_process(bdi->wb.task);
96 } else { 92 } else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
98 * The bdi thread isn't there, wake up the forker thread which 94 * The bdi thread isn't there, wake up the forker thread which
99 * will create and run it. 95 * will create and run it.
100 */ 96 */
101 trace_writeback_nothread(bdi, work);
102 wake_up_process(default_backing_dev_info.wb.task); 97 wake_up_process(default_backing_dev_info.wb.task);
103 } 98 }
99}
100
101static void bdi_queue_work(struct backing_dev_info *bdi,
102 struct wb_writeback_work *work)
103{
104 trace_writeback_queue(bdi, work);
105
106 spin_lock_bh(&bdi->wb_lock);
107 list_add_tail(&work->list, &bdi->work_list);
108 if (!bdi->wb.task)
109 trace_writeback_nothread(bdi, work);
110 bdi_wakeup_flusher(bdi);
104 spin_unlock_bh(&bdi->wb_lock); 111 spin_unlock_bh(&bdi->wb_lock);
105} 112}
106 113
107static void 114static void
108__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 115__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
109 bool range_cyclic, bool for_background) 116 bool range_cyclic)
110{ 117{
111 struct wb_writeback_work *work; 118 struct wb_writeback_work *work;
112 119
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
126 work->sync_mode = WB_SYNC_NONE; 133 work->sync_mode = WB_SYNC_NONE;
127 work->nr_pages = nr_pages; 134 work->nr_pages = nr_pages;
128 work->range_cyclic = range_cyclic; 135 work->range_cyclic = range_cyclic;
129 work->for_background = for_background;
130 136
131 bdi_queue_work(bdi, work); 137 bdi_queue_work(bdi, work);
132} 138}
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
144 */ 150 */
145void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 151void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
146{ 152{
147 __bdi_start_writeback(bdi, nr_pages, true, false); 153 __bdi_start_writeback(bdi, nr_pages, true);
148} 154}
149 155
150/** 156/**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
152 * @bdi: the backing device to write from 158 * @bdi: the backing device to write from
153 * 159 *
154 * Description: 160 * Description:
155 * This does WB_SYNC_NONE background writeback. The IO is only 161 * This makes sure WB_SYNC_NONE background writeback happens. When
156 * started when this function returns, we make no guarentees on 162 * this function returns, it is only guaranteed that for a given BDI
157 * completion. Caller need not hold sb s_umount semaphore. 163 * some IO is happening if we are over background dirty threshold.
164 * Caller need not hold sb s_umount semaphore.
158 */ 165 */
159void bdi_start_background_writeback(struct backing_dev_info *bdi) 166void bdi_start_background_writeback(struct backing_dev_info *bdi)
160{ 167{
161 __bdi_start_writeback(bdi, LONG_MAX, true, true); 168 /*
169 * We just wake up the flusher thread. It will perform background
170 * writeback as soon as there is no other work to do.
171 */
172 trace_writeback_wake_background(bdi);
173 spin_lock_bh(&bdi->wb_lock);
174 bdi_wakeup_flusher(bdi);
175 spin_unlock_bh(&bdi->wb_lock);
162} 176}
163 177
164/* 178/*
@@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
616 }; 630 };
617 unsigned long oldest_jif; 631 unsigned long oldest_jif;
618 long wrote = 0; 632 long wrote = 0;
633 long write_chunk;
619 struct inode *inode; 634 struct inode *inode;
620 635
621 if (wbc.for_kupdate) { 636 if (wbc.for_kupdate) {
@@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
628 wbc.range_end = LLONG_MAX; 643 wbc.range_end = LLONG_MAX;
629 } 644 }
630 645
646 /*
647 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
648 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
649 * here avoids calling into writeback_inodes_wb() more than once.
650 *
651 * The intended call sequence for WB_SYNC_ALL writeback is:
652 *
653 * wb_writeback()
654 * __writeback_inodes_sb() <== called only once
655 * write_cache_pages() <== called once for each inode
656 * (quickly) tag currently dirty pages
657 * (maybe slowly) sync all tagged pages
658 */
659 if (wbc.sync_mode == WB_SYNC_NONE)
660 write_chunk = MAX_WRITEBACK_PAGES;
661 else
662 write_chunk = LONG_MAX;
663
631 wbc.wb_start = jiffies; /* livelock avoidance */ 664 wbc.wb_start = jiffies; /* livelock avoidance */
632 for (;;) { 665 for (;;) {
633 /* 666 /*
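The comment block above explains why WB_SYNC_ALL writeback must not be chunked: pages are tagged once and then synced, and re-entering writeback_inodes_wb() per chunk could livelock on a busy file. The hunk encodes that choice as a per-iteration write_chunk. A toy loop showing the same shape (the MAX_WRITEBACK_PAGES value is illustrative):

    #include <limits.h>
    #include <stdio.h>

    #define MAX_WRITEBACK_PAGES 1024    /* illustrative value */

    enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

    /* Pretend flusher: writes at most 'budget' of the dirty pages. */
    static long write_some(long budget, long *dirty)
    {
        long n = budget < *dirty ? budget : *dirty;
        *dirty -= n;
        return n;
    }

    int main(void)
    {
        long dirty = 5000, wrote = 0;
        enum sync_mode mode = WB_SYNC_NONE;

        /* Integrity writeback gets one unbounded pass; background
         * writeback yields between fixed-size chunks. */
        long chunk = (mode == WB_SYNC_NONE) ? MAX_WRITEBACK_PAGES
                                            : LONG_MAX;

        while (dirty > 0)
            wrote += write_some(chunk, &dirty);

        printf("wrote %ld pages in mode %d\n", wrote, mode);
        return 0;
    }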
@@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
637 break; 670 break;
638 671
639 /* 672 /*
673 * Background writeout and kupdate-style writeback may
674 * run forever. Stop them if there is other work to do
675 * so that e.g. sync can proceed. They'll be restarted
676 * after the other works are all done.
677 */
678 if ((work->for_background || work->for_kupdate) &&
679 !list_empty(&wb->bdi->work_list))
680 break;
681
682 /*
640 * For background writeout, stop when we are below the 683 * For background writeout, stop when we are below the
641 * background dirty threshold 684 * background dirty threshold
642 */ 685 */
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
644 break; 687 break;
645 688
646 wbc.more_io = 0; 689 wbc.more_io = 0;
647 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 690 wbc.nr_to_write = write_chunk;
648 wbc.pages_skipped = 0; 691 wbc.pages_skipped = 0;
649 692
650 trace_wbc_writeback_start(&wbc, wb->bdi); 693 trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
654 writeback_inodes_wb(wb, &wbc); 697 writeback_inodes_wb(wb, &wbc);
655 trace_wbc_writeback_written(&wbc, wb->bdi); 698 trace_wbc_writeback_written(&wbc, wb->bdi);
656 699
657 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 700 work->nr_pages -= write_chunk - wbc.nr_to_write;
658 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 701 wrote += write_chunk - wbc.nr_to_write;
659 702
660 /* 703 /*
661 * If we consumed everything, see if we have more 704 * If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
670 /* 713 /*
671 * Did we write something? Try for more 714 * Did we write something? Try for more
672 */ 715 */
673 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES) 716 if (wbc.nr_to_write < write_chunk)
674 continue; 717 continue;
675 /* 718 /*
676 * Nothing written. Wait for some inode to 719 * Nothing written. Wait for some inode to
@@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void)
718 get_nr_dirty_inodes(); 761 get_nr_dirty_inodes();
719} 762}
720 763
764static long wb_check_background_flush(struct bdi_writeback *wb)
765{
766 if (over_bground_thresh()) {
767
768 struct wb_writeback_work work = {
769 .nr_pages = LONG_MAX,
770 .sync_mode = WB_SYNC_NONE,
771 .for_background = 1,
772 .range_cyclic = 1,
773 };
774
775 return wb_writeback(wb, &work);
776 }
777
778 return 0;
779}
780
721static long wb_check_old_data_flush(struct bdi_writeback *wb) 781static long wb_check_old_data_flush(struct bdi_writeback *wb)
722{ 782{
723 unsigned long expired; 783 unsigned long expired;
@@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
787 * Check for periodic writeback, kupdated() style 847 * Check for periodic writeback, kupdated() style
788 */ 848 */
789 wrote += wb_check_old_data_flush(wb); 849 wrote += wb_check_old_data_flush(wb);
850 wrote += wb_check_background_flush(wb);
790 clear_bit(BDI_writeback_running, &wb->bdi->state); 851 clear_bit(BDI_writeback_running, &wb->bdi->state);
791 852
792 return wrote; 853 return wrote;
@@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
873 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 934 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
874 if (!bdi_has_dirty_io(bdi)) 935 if (!bdi_has_dirty_io(bdi))
875 continue; 936 continue;
876 __bdi_start_writeback(bdi, nr_pages, false, false); 937 __bdi_start_writeback(bdi, nr_pages, false);
877 } 938 }
878 rcu_read_unlock(); 939 rcu_read_unlock();
879} 940}
@@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1164 * @sb: the superblock 1225 * @sb: the superblock
1165 * 1226 *
1166 * This function writes and waits on any dirty inode belonging to this 1227 * This function writes and waits on any dirty inode belonging to this
1167 * super_block. The number of pages synced is returned. 1228 * super_block.
1168 */ 1229 */
1169void sync_inodes_sb(struct super_block *sb) 1230void sync_inodes_sb(struct super_block *sb)
1170{ 1231{
@@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1242EXPORT_SYMBOL(sync_inode); 1303EXPORT_SYMBOL(sync_inode);
1243 1304
1244/** 1305/**
1245 * sync_inode - write an inode to disk 1306 * sync_inode_metadata - write an inode to disk
1246 * @inode: the inode to sync 1307 * @inode: the inode to sync
1247 * @wait: wait for I/O to complete. 1308 * @wait: wait for I/O to complete.
1248 * 1309 *
1249 * Write an inode to disk and adjust it's dirty state after completion. 1310 * Write an inode to disk and adjust its dirty state after completion.
1250 * 1311 *
1251 * Note: only writes the actual inode, no associated data or other metadata. 1312 * Note: only writes the actual inode, no associated data or other metadata.
1252 */ 1313 */
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 68ca487bedb1..78b519c13536 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -4,6 +4,19 @@
4#include <linux/path.h> 4#include <linux/path.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/fs_struct.h> 6#include <linux/fs_struct.h>
7#include "internal.h"
8
9static inline void path_get_longterm(struct path *path)
10{
11 path_get(path);
12 mnt_make_longterm(path->mnt);
13}
14
15static inline void path_put_longterm(struct path *path)
16{
17 mnt_make_shortterm(path->mnt);
18 path_put(path);
19}
7 20
8/* 21/*
9 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 22 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
@@ -17,11 +30,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
17 write_seqcount_begin(&fs->seq); 30 write_seqcount_begin(&fs->seq);
18 old_root = fs->root; 31 old_root = fs->root;
19 fs->root = *path; 32 fs->root = *path;
20 path_get_long(path); 33 path_get_longterm(path);
21 write_seqcount_end(&fs->seq); 34 write_seqcount_end(&fs->seq);
22 spin_unlock(&fs->lock); 35 spin_unlock(&fs->lock);
23 if (old_root.dentry) 36 if (old_root.dentry)
24 path_put_long(&old_root); 37 path_put_longterm(&old_root);
25} 38}
26 39
27/* 40/*
@@ -36,12 +49,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
36 write_seqcount_begin(&fs->seq); 49 write_seqcount_begin(&fs->seq);
37 old_pwd = fs->pwd; 50 old_pwd = fs->pwd;
38 fs->pwd = *path; 51 fs->pwd = *path;
39 path_get_long(path); 52 path_get_longterm(path);
40 write_seqcount_end(&fs->seq); 53 write_seqcount_end(&fs->seq);
41 spin_unlock(&fs->lock); 54 spin_unlock(&fs->lock);
42 55
43 if (old_pwd.dentry) 56 if (old_pwd.dentry)
44 path_put_long(&old_pwd); 57 path_put_longterm(&old_pwd);
45} 58}
46 59
47void chroot_fs_refs(struct path *old_root, struct path *new_root) 60void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -59,13 +72,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
59 write_seqcount_begin(&fs->seq); 72 write_seqcount_begin(&fs->seq);
60 if (fs->root.dentry == old_root->dentry 73 if (fs->root.dentry == old_root->dentry
61 && fs->root.mnt == old_root->mnt) { 74 && fs->root.mnt == old_root->mnt) {
62 path_get_long(new_root); 75 path_get_longterm(new_root);
63 fs->root = *new_root; 76 fs->root = *new_root;
64 count++; 77 count++;
65 } 78 }
66 if (fs->pwd.dentry == old_root->dentry 79 if (fs->pwd.dentry == old_root->dentry
67 && fs->pwd.mnt == old_root->mnt) { 80 && fs->pwd.mnt == old_root->mnt) {
68 path_get_long(new_root); 81 path_get_longterm(new_root);
69 fs->pwd = *new_root; 82 fs->pwd = *new_root;
70 count++; 83 count++;
71 } 84 }
@@ -76,13 +89,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
76 } while_each_thread(g, p); 89 } while_each_thread(g, p);
77 read_unlock(&tasklist_lock); 90 read_unlock(&tasklist_lock);
78 while (count--) 91 while (count--)
79 path_put_long(old_root); 92 path_put_longterm(old_root);
80} 93}
81 94
82void free_fs_struct(struct fs_struct *fs) 95void free_fs_struct(struct fs_struct *fs)
83{ 96{
84 path_put_long(&fs->root); 97 path_put_longterm(&fs->root);
85 path_put_long(&fs->pwd); 98 path_put_longterm(&fs->pwd);
86 kmem_cache_free(fs_cachep, fs); 99 kmem_cache_free(fs_cachep, fs);
87} 100}
88 101
@@ -118,9 +131,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
118 131
119 spin_lock(&old->lock); 132 spin_lock(&old->lock);
120 fs->root = old->root; 133 fs->root = old->root;
121 path_get_long(&fs->root); 134 path_get_longterm(&fs->root);
122 fs->pwd = old->pwd; 135 fs->pwd = old->pwd;
123 path_get_long(&fs->pwd); 136 path_get_longterm(&fs->pwd);
124 spin_unlock(&old->lock); 137 spin_unlock(&old->lock);
125 } 138 }
126 return fs; 139 return fs;
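path_get_longterm()/path_put_longterm() above bundle two reference operations and release them in reverse order of acquisition, so every fs_struct call site stays symmetric. A small sketch of wrapping paired acquire/release steps that way (plain counters stand in for the path and mount references):

    #include <stdio.h>

    struct ref { int path_count; int longterm_count; };

    /* Acquire in one order... */
    static void get_longterm(struct ref *r)
    {
        r->path_count++;        /* path_get() analogue */
        r->longterm_count++;    /* mnt_make_longterm() analogue */
    }

    /* ...release in exactly the reverse order, as the hunk does. */
    static void put_longterm(struct ref *r)
    {
        r->longterm_count--;    /* mnt_make_shortterm() analogue */
        r->path_count--;        /* path_put() analogue */
    }

    int main(void)
    {
        struct ref root = { 0, 0 };

        get_longterm(&root);
        printf("held: path=%d longterm=%d\n",
               root.path_count, root.longterm_count);
        put_longterm(&root);
        printf("released: path=%d longterm=%d\n",
               root.path_count, root.longterm_count);
        return 0;
    }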
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index b9f34eaede09..48a18f184d50 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
101 object->n_ops++; 101 object->n_ops++;
102 object->n_exclusive++; /* reads and writes must wait */ 102 object->n_exclusive++; /* reads and writes must wait */
103 103
104 if (object->n_ops > 0) { 104 if (object->n_ops > 1) {
105 atomic_inc(&op->usage); 105 atomic_inc(&op->usage);
106 list_add_tail(&op->pend_link, &object->pending_ops); 106 list_add_tail(&op->pend_link, &object->pending_ops);
107 fscache_stat(&fscache_n_op_pend); 107 fscache_stat(&fscache_n_op_pend);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fca6689e12e6..7cfdcb913363 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -19,6 +19,8 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/ext2_fs.h> 21#include <linux/ext2_fs.h>
22#include <linux/falloc.h>
23#include <linux/swap.h>
22#include <linux/crc32.h> 24#include <linux/crc32.h>
23#include <linux/writeback.h> 25#include <linux/writeback.h>
24#include <asm/uaccess.h> 26#include <asm/uaccess.h>
@@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
610 return generic_file_aio_write(iocb, iov, nr_segs, pos); 612 return generic_file_aio_write(iocb, iov, nr_segs, pos);
611} 613}
612 614
615static void empty_write_end(struct page *page, unsigned from,
616 unsigned to)
617{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619
620 page_zero_new_buffers(page, from, to);
621 flush_dcache_page(page);
622 mark_page_accessed(page);
623
624 if (!gfs2_is_writeback(ip))
625 gfs2_page_add_databufs(ip, page, from, to);
626
627 block_commit_write(page, from, to);
628}
629
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
631{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error;
635
636 if (!page_has_buffers(page)) {
637 error = __block_write_begin(page, from, to - from, gfs2_block_map);
638 if (unlikely(error))
639 return error;
640
641 empty_write_end(page, from, to);
642 return 0;
643 }
644
645 bh = head = page_buffers(page);
646 next = end = 0;
647 while (next < from) {
648 next += bh->b_size;
649 bh = bh->b_this_page;
650 }
651 start = next;
652 do {
653 next += bh->b_size;
654 if (buffer_mapped(bh)) {
655 if (end) {
656 error = __block_write_begin(page, start, end - start,
657 gfs2_block_map);
658 if (unlikely(error))
659 return error;
660 empty_write_end(page, start, end);
661 end = 0;
662 }
663 start = next;
664 }
665 else
666 end = next;
667 bh = bh->b_this_page;
668 } while (next < to);
669
670 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error))
673 return error;
674 empty_write_end(page, start, end);
675 }
676
677 return 0;
678}
679
680static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
681 int mode)
682{
683 struct gfs2_inode *ip = GFS2_I(inode);
684 struct buffer_head *dibh;
685 int error;
686 u64 start = offset >> PAGE_CACHE_SHIFT;
687 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
688 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
689 pgoff_t curr;
690 struct page *page;
691 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
692 unsigned int from, to;
693
694 if (!end_offset)
695 end_offset = PAGE_CACHE_SIZE;
696
697 error = gfs2_meta_inode_buffer(ip, &dibh);
698 if (unlikely(error))
699 goto out;
700
701 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
702
703 if (gfs2_is_stuffed(ip)) {
704 error = gfs2_unstuff_dinode(ip, NULL);
705 if (unlikely(error))
706 goto out;
707 }
708
709 curr = start;
710 offset = start << PAGE_CACHE_SHIFT;
711 from = start_offset;
712 to = PAGE_CACHE_SIZE;
713 while (curr <= end) {
714 page = grab_cache_page_write_begin(inode->i_mapping, curr,
715 AOP_FLAG_NOFS);
716 if (unlikely(!page)) {
717 error = -ENOMEM;
718 goto out;
719 }
720
721 if (curr == end)
722 to = end_offset;
723 error = write_empty_blocks(page, from, to);
724 if (!error && offset + to > inode->i_size &&
725 !(mode & FALLOC_FL_KEEP_SIZE)) {
726 i_size_write(inode, offset + to);
727 }
728 unlock_page(page);
729 page_cache_release(page);
730 if (error)
731 goto out;
732 curr++;
733 offset += PAGE_CACHE_SIZE;
734 from = 0;
735 }
736
737 gfs2_dinode_out(ip, dibh->b_data);
738 mark_inode_dirty(inode);
739
740 brelse(dibh);
741
742out:
743 return error;
744}
745
746static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
747 unsigned int *data_blocks, unsigned int *ind_blocks)
748{
749 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
750 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
751 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
752
753 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
754 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
755 max_data -= tmp;
756 }
757 /* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
758 so it might end up with fewer data blocks */
759 if (max_data <= *data_blocks)
760 return;
761 *data_blocks = max_data;
762 *ind_blocks = max_blocks - max_data;
763 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
764 if (*len > max) {
765 *len = max;
766 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
767 }
768}
769
770static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
771 loff_t len)
772{
773 struct inode *inode = file->f_path.dentry->d_inode;
774 struct gfs2_sbd *sdp = GFS2_SB(inode);
775 struct gfs2_inode *ip = GFS2_I(inode);
776 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
777 loff_t bytes, max_bytes;
778 struct gfs2_alloc *al;
779 int error;
780 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
781 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
782
783 /* We only support the FALLOC_FL_KEEP_SIZE mode */
784 if (mode & ~FALLOC_FL_KEEP_SIZE)
785 return -EOPNOTSUPP;
786
787 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
788 sdp->sd_sb.sb_bsize_shift;
789
790 len = next - offset;
791 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
792 if (!bytes)
793 bytes = UINT_MAX;
794
795 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
796 error = gfs2_glock_nq(&ip->i_gh);
797 if (unlikely(error))
798 goto out_uninit;
799
800 if (!gfs2_write_alloc_required(ip, offset, len))
801 goto out_unlock;
802
803 while (len > 0) {
804 if (len < bytes)
805 bytes = len;
806 al = gfs2_alloc_get(ip);
807 if (!al) {
808 error = -ENOMEM;
809 goto out_unlock;
810 }
811
812 error = gfs2_quota_lock_check(ip);
813 if (error)
814 goto out_alloc_put;
815
816retry:
817 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
818
819 al->al_requested = data_blocks + ind_blocks;
820 error = gfs2_inplace_reserve(ip);
821 if (error) {
822 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
823 bytes >>= 1;
824 goto retry;
825 }
826 goto out_qunlock;
827 }
828 max_bytes = bytes;
829 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
830 al->al_requested = data_blocks + ind_blocks;
831
832 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
833 RES_RG_HDR + gfs2_rg_blocks(al);
834 if (gfs2_is_jdata(ip))
835 rblocks += data_blocks ? data_blocks : 1;
836
837 error = gfs2_trans_begin(sdp, rblocks,
838 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
839 if (error)
840 goto out_trans_fail;
841
842 error = fallocate_chunk(inode, offset, max_bytes, mode);
843 gfs2_trans_end(sdp);
844
845 if (error)
846 goto out_trans_fail;
847
848 len -= max_bytes;
849 offset += max_bytes;
850 gfs2_inplace_release(ip);
851 gfs2_quota_unlock(ip);
852 gfs2_alloc_put(ip);
853 }
854 goto out_unlock;
855
856out_trans_fail:
857 gfs2_inplace_release(ip);
858out_qunlock:
859 gfs2_quota_unlock(ip);
860out_alloc_put:
861 gfs2_alloc_put(ip);
862out_unlock:
863 gfs2_glock_dq(&ip->i_gh);
864out_uninit:
865 gfs2_holder_uninit(&ip->i_gh);
866 return error;
867}
868
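
/*
 * Aside, not part of the patch: when gfs2_inplace_reserve() above fails
 * with -ENOSPC, gfs2_fallocate() halves the chunk and retries down to a
 * single block. The same backoff standalone; reserve() and budget are
 * hypothetical stand-ins:
 */
#include <errno.h>
#include <stdio.h>

static long budget = 300;		/* pretend free space */

static int reserve(long bytes)
{
	if (bytes > budget)
		return -ENOSPC;
	budget -= bytes;
	return 0;
}

int main(void)
{
	long bytes = 1024, bsize = 64;	/* request and block size */
	int error;

retry:
	error = reserve(bytes);
	if (error == -ENOSPC && bytes > bsize) {
		bytes >>= 1;		/* ask for half as much */
		goto retry;
	}
	printf("got %ld bytes, error %d\n", error ? 0 : bytes, error);
	return 0;
}
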
613#ifdef CONFIG_GFS2_FS_LOCKING_DLM 869#ifdef CONFIG_GFS2_FS_LOCKING_DLM
614 870
615/** 871/**
@@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = {
765 .splice_read = generic_file_splice_read, 1021 .splice_read = generic_file_splice_read,
766 .splice_write = generic_file_splice_write, 1022 .splice_write = generic_file_splice_write,
767 .setlease = gfs2_setlease, 1023 .setlease = gfs2_setlease,
1024 .fallocate = gfs2_fallocate,
768}; 1025};
769 1026
770const struct file_operations gfs2_dir_fops = { 1027const struct file_operations gfs2_dir_fops = {
@@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = {
794 .splice_read = generic_file_splice_read, 1051 .splice_read = generic_file_splice_read,
795 .splice_write = generic_file_splice_write, 1052 .splice_write = generic_file_splice_write,
796 .setlease = generic_setlease, 1053 .setlease = generic_setlease,
1054 .fallocate = gfs2_fallocate,
797}; 1055};
798 1056
799const struct file_operations gfs2_dir_fops_nolock = { 1057const struct file_operations gfs2_dir_fops_nolock = {
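
/*
 * Aside, not part of the patch: with .fallocate wired into both gfs2
 * file_operations tables above, userspace reaches the new code through
 * fallocate(2). A minimal caller; per the hunks above, gfs2 accepts
 * only mode 0 or FALLOC_FL_KEEP_SIZE and returns -EOPNOTSUPP otherwise:
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* reserve 16 MiB of blocks without changing i_size */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}
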
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2232b3c780bd..7aa7d4f8984a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -74,16 +74,14 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
74} 74}
75 75
76/** 76/**
77 * GFS2 lookup code fills in vfs inode contents based on info obtained 77 * gfs2_set_iop - Sets inode operations
78 * from directory entry inside gfs2_inode_lookup(). This has caused issues 78 * @inode: The inode with correct i_mode filled in
79 * with NFS code path since its get_dentry routine doesn't have the relevant
80 * directory entry when gfs2_inode_lookup() is invoked. Part of the code
81 * segment inside gfs2_inode_lookup code needs to get moved around.
82 * 79 *
83 * Clears I_NEW as well. 80 * GFS2 lookup code fills in vfs inode contents based on info obtained
84 **/ 81 * from directory entry inside gfs2_inode_lookup().
82 */
85 83
86void gfs2_set_iop(struct inode *inode) 84static void gfs2_set_iop(struct inode *inode)
87{ 85{
88 struct gfs2_sbd *sdp = GFS2_SB(inode); 86 struct gfs2_sbd *sdp = GFS2_SB(inode);
89 umode_t mode = inode->i_mode; 87 umode_t mode = inode->i_mode;
@@ -106,8 +104,6 @@ void gfs2_set_iop(struct inode *inode)
106 inode->i_op = &gfs2_file_iops; 104 inode->i_op = &gfs2_file_iops;
107 init_special_inode(inode, inode->i_mode, inode->i_rdev); 105 init_special_inode(inode, inode->i_mode, inode->i_rdev);
108 } 106 }
109
110 unlock_new_inode(inode);
111} 107}
112 108
113/** 109/**
@@ -119,10 +115,8 @@ void gfs2_set_iop(struct inode *inode)
119 * Returns: A VFS inode, or an error 115 * Returns: A VFS inode, or an error
120 */ 116 */
121 117
122struct inode *gfs2_inode_lookup(struct super_block *sb, 118struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
123 unsigned int type, 119 u64 no_addr, u64 no_formal_ino)
124 u64 no_addr,
125 u64 no_formal_ino)
126{ 120{
127 struct inode *inode; 121 struct inode *inode;
128 struct gfs2_inode *ip; 122 struct gfs2_inode *ip;
@@ -152,51 +146,37 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
152 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); 146 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
153 if (unlikely(error)) 147 if (unlikely(error))
154 goto fail_iopen; 148 goto fail_iopen;
155 ip->i_iopen_gh.gh_gl->gl_object = ip;
156 149
150 ip->i_iopen_gh.gh_gl->gl_object = ip;
157 gfs2_glock_put(io_gl); 151 gfs2_glock_put(io_gl);
158 io_gl = NULL; 152 io_gl = NULL;
159 153
160 if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
161 goto gfs2_nfsbypass;
162
163 inode->i_mode = DT2IF(type);
164
165 /*
166 * We must read the inode in order to work out its type in
167 * this case. Note that this doesn't happen often as we normally
168 * know the type beforehand. This code path only occurs during
169 * unlinked inode recovery (where it is safe to do this glock,
170 * which is not true in the general case).
171 */
172 if (type == DT_UNKNOWN) { 154 if (type == DT_UNKNOWN) {
173 struct gfs2_holder gh; 155 /* Inode glock must be locked already */
174 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 156 error = gfs2_inode_refresh(GFS2_I(inode));
175 if (unlikely(error)) 157 if (error)
176 goto fail_glock; 158 goto fail_refresh;
177 /* Inode is now uptodate */ 159 } else {
178 gfs2_glock_dq_uninit(&gh); 160 inode->i_mode = DT2IF(type);
179 } 161 }
180 162
181 gfs2_set_iop(inode); 163 gfs2_set_iop(inode);
164 unlock_new_inode(inode);
182 } 165 }
183 166
184gfs2_nfsbypass:
185 return inode; 167 return inode;
186fail_glock: 168
187 gfs2_glock_dq(&ip->i_iopen_gh); 169fail_refresh:
170 ip->i_iopen_gh.gh_gl->gl_object = NULL;
171 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
188fail_iopen: 172fail_iopen:
189 if (io_gl) 173 if (io_gl)
190 gfs2_glock_put(io_gl); 174 gfs2_glock_put(io_gl);
191fail_put: 175fail_put:
192 if (inode->i_state & I_NEW) 176 ip->i_gl->gl_object = NULL;
193 ip->i_gl->gl_object = NULL;
194 gfs2_glock_put(ip->i_gl); 177 gfs2_glock_put(ip->i_gl);
195fail: 178fail:
196 if (inode->i_state & I_NEW) 179 iget_failed(inode);
197 iget_failed(inode);
198 else
199 iput(inode);
200 return ERR_PTR(error); 180 return ERR_PTR(error);
201} 181}
202 182
@@ -221,14 +201,6 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
221 if (IS_ERR(inode)) 201 if (IS_ERR(inode))
222 goto fail; 202 goto fail;
223 203
224 error = gfs2_inode_refresh(GFS2_I(inode));
225 if (error)
226 goto fail_iput;
227
228 /* Pick up the works we bypass in gfs2_inode_lookup */
229 if (inode->i_state & I_NEW)
230 gfs2_set_iop(inode);
231
232 /* Two extra checks for NFS only */ 204 /* Two extra checks for NFS only */
233 if (no_formal_ino) { 205 if (no_formal_ino) {
234 error = -ESTALE; 206 error = -ESTALE;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 732a183efdb3..3e00a66e7cbd 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,7 +96,6 @@ err:
96 return -EIO; 96 return -EIO;
97} 97}
98 98
99extern void gfs2_set_iop(struct inode *inode);
100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 99extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
101 u64 no_addr, u64 no_formal_ino); 100 u64 no_addr, u64 no_formal_ino);
102extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, 101extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 040b5a2e6556..d8b26ac2e20b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,8 +18,6 @@
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/crc32.h> 19#include <linux/crc32.h>
20#include <linux/fiemap.h> 20#include <linux/fiemap.h>
21#include <linux/swap.h>
22#include <linux/falloc.h>
23#include <asm/uaccess.h> 21#include <asm/uaccess.h>
24 22
25#include "gfs2.h" 23#include "gfs2.h"
@@ -1257,261 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1257 return ret; 1255 return ret;
1258} 1256}
1259 1257
1260static void empty_write_end(struct page *page, unsigned from,
1261 unsigned to)
1262{
1263 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
1264
1265 page_zero_new_buffers(page, from, to);
1266 flush_dcache_page(page);
1267 mark_page_accessed(page);
1268
1269 if (!gfs2_is_writeback(ip))
1270 gfs2_page_add_databufs(ip, page, from, to);
1271
1272 block_commit_write(page, from, to);
1273}
1274
1275
1276static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1277{
1278 unsigned start, end, next;
1279 struct buffer_head *bh, *head;
1280 int error;
1281
1282 if (!page_has_buffers(page)) {
1283 error = __block_write_begin(page, from, to - from, gfs2_block_map);
1284 if (unlikely(error))
1285 return error;
1286
1287 empty_write_end(page, from, to);
1288 return 0;
1289 }
1290
1291 bh = head = page_buffers(page);
1292 next = end = 0;
1293 while (next < from) {
1294 next += bh->b_size;
1295 bh = bh->b_this_page;
1296 }
1297 start = next;
1298 do {
1299 next += bh->b_size;
1300 if (buffer_mapped(bh)) {
1301 if (end) {
1302 error = __block_write_begin(page, start, end - start,
1303 gfs2_block_map);
1304 if (unlikely(error))
1305 return error;
1306 empty_write_end(page, start, end);
1307 end = 0;
1308 }
1309 start = next;
1310 }
1311 else
1312 end = next;
1313 bh = bh->b_this_page;
1314 } while (next < to);
1315
1316 if (end) {
1317 error = __block_write_begin(page, start, end - start, gfs2_block_map);
1318 if (unlikely(error))
1319 return error;
1320 empty_write_end(page, start, end);
1321 }
1322
1323 return 0;
1324}
1325
1326static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
1327 int mode)
1328{
1329 struct gfs2_inode *ip = GFS2_I(inode);
1330 struct buffer_head *dibh;
1331 int error;
1332 u64 start = offset >> PAGE_CACHE_SHIFT;
1333 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
1334 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
1335 pgoff_t curr;
1336 struct page *page;
1337 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
1338 unsigned int from, to;
1339
1340 if (!end_offset)
1341 end_offset = PAGE_CACHE_SIZE;
1342
1343 error = gfs2_meta_inode_buffer(ip, &dibh);
1344 if (unlikely(error))
1345 goto out;
1346
1347 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1348
1349 if (gfs2_is_stuffed(ip)) {
1350 error = gfs2_unstuff_dinode(ip, NULL);
1351 if (unlikely(error))
1352 goto out;
1353 }
1354
1355 curr = start;
1356 offset = start << PAGE_CACHE_SHIFT;
1357 from = start_offset;
1358 to = PAGE_CACHE_SIZE;
1359 while (curr <= end) {
1360 page = grab_cache_page_write_begin(inode->i_mapping, curr,
1361 AOP_FLAG_NOFS);
1362 if (unlikely(!page)) {
1363 error = -ENOMEM;
1364 goto out;
1365 }
1366
1367 if (curr == end)
1368 to = end_offset;
1369 error = write_empty_blocks(page, from, to);
1370 if (!error && offset + to > inode->i_size &&
1371 !(mode & FALLOC_FL_KEEP_SIZE)) {
1372 i_size_write(inode, offset + to);
1373 }
1374 unlock_page(page);
1375 page_cache_release(page);
1376 if (error)
1377 goto out;
1378 curr++;
1379 offset += PAGE_CACHE_SIZE;
1380 from = 0;
1381 }
1382
1383 gfs2_dinode_out(ip, dibh->b_data);
1384 mark_inode_dirty(inode);
1385
1386 brelse(dibh);
1387
1388out:
1389 return error;
1390}
1391
1392static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
1393 unsigned int *data_blocks, unsigned int *ind_blocks)
1394{
1395 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1396 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
1397 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
1398
1399 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
1400 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1401 max_data -= tmp;
1402 }
1403	/* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
1404	   so it might end up with fewer data blocks */
1405 if (max_data <= *data_blocks)
1406 return;
1407 *data_blocks = max_data;
1408 *ind_blocks = max_blocks - max_data;
1409 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
1410 if (*len > max) {
1411 *len = max;
1412 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
1413 }
1414}
1415
1416static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1417 loff_t len)
1418{
1419 struct gfs2_sbd *sdp = GFS2_SB(inode);
1420 struct gfs2_inode *ip = GFS2_I(inode);
1421 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
1422 loff_t bytes, max_bytes;
1423 struct gfs2_alloc *al;
1424 int error;
1425 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
1426 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
1427
1428 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1429 if (mode && (mode != FALLOC_FL_KEEP_SIZE))
1430 return -EOPNOTSUPP;
1431
1432 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
1433 sdp->sd_sb.sb_bsize_shift;
1434
1435 len = next - offset;
1436 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
1437 if (!bytes)
1438 bytes = UINT_MAX;
1439
1440 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
1441 error = gfs2_glock_nq(&ip->i_gh);
1442 if (unlikely(error))
1443 goto out_uninit;
1444
1445 if (!gfs2_write_alloc_required(ip, offset, len))
1446 goto out_unlock;
1447
1448 while (len > 0) {
1449 if (len < bytes)
1450 bytes = len;
1451 al = gfs2_alloc_get(ip);
1452 if (!al) {
1453 error = -ENOMEM;
1454 goto out_unlock;
1455 }
1456
1457 error = gfs2_quota_lock_check(ip);
1458 if (error)
1459 goto out_alloc_put;
1460
1461retry:
1462 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
1463
1464 al->al_requested = data_blocks + ind_blocks;
1465 error = gfs2_inplace_reserve(ip);
1466 if (error) {
1467 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
1468 bytes >>= 1;
1469 goto retry;
1470 }
1471 goto out_qunlock;
1472 }
1473 max_bytes = bytes;
1474 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
1475 al->al_requested = data_blocks + ind_blocks;
1476
1477 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
1478 RES_RG_HDR + gfs2_rg_blocks(al);
1479 if (gfs2_is_jdata(ip))
1480 rblocks += data_blocks ? data_blocks : 1;
1481
1482 error = gfs2_trans_begin(sdp, rblocks,
1483 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
1484 if (error)
1485 goto out_trans_fail;
1486
1487 error = fallocate_chunk(inode, offset, max_bytes, mode);
1488 gfs2_trans_end(sdp);
1489
1490 if (error)
1491 goto out_trans_fail;
1492
1493 len -= max_bytes;
1494 offset += max_bytes;
1495 gfs2_inplace_release(ip);
1496 gfs2_quota_unlock(ip);
1497 gfs2_alloc_put(ip);
1498 }
1499 goto out_unlock;
1500
1501out_trans_fail:
1502 gfs2_inplace_release(ip);
1503out_qunlock:
1504 gfs2_quota_unlock(ip);
1505out_alloc_put:
1506 gfs2_alloc_put(ip);
1507out_unlock:
1508 gfs2_glock_dq(&ip->i_gh);
1509out_uninit:
1510 gfs2_holder_uninit(&ip->i_gh);
1511 return error;
1512}
1513
1514
1515static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1258static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1516 u64 start, u64 len) 1259 u64 start, u64 len)
1517{ 1260{
@@ -1562,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = {
1562 .getxattr = gfs2_getxattr, 1305 .getxattr = gfs2_getxattr,
1563 .listxattr = gfs2_listxattr, 1306 .listxattr = gfs2_listxattr,
1564 .removexattr = gfs2_removexattr, 1307 .removexattr = gfs2_removexattr,
1565 .fallocate = gfs2_fallocate,
1566 .fiemap = gfs2_fiemap, 1308 .fiemap = gfs2_fiemap,
1567}; 1309};
1568 1310
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 16c2ecac7eb7..ec73ed70bae1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1336,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode)
1336 if (error) 1336 if (error)
1337 goto out_truncate; 1337 goto out_truncate;
1338 1338
1339 ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
1339 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1340 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1340 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 1341 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
1341 error = gfs2_glock_nq(&ip->i_iopen_gh); 1342 error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 52a0bcaa7b6d..b1991a2a08e0 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -397,8 +397,8 @@ int hfsplus_file_extend(struct inode *inode)
397 u32 start, len, goal; 397 u32 start, len, goal;
398 int res; 398 int res;
399 399
400 if (sbi->total_blocks - sbi->free_blocks + 8 > 400 if (sbi->alloc_file->i_size * 8 <
401 sbi->alloc_file->i_size * 8) { 401 sbi->total_blocks - sbi->free_blocks + 8) {
402 /* extend alloc file */ 402 /* extend alloc file */
403 printk(KERN_ERR "hfs: extend alloc file! " 403 printk(KERN_ERR "hfs: extend alloc file! "
404 "(%llu,%u,%u)\n", 404 "(%llu,%u,%u)\n",
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index d66ad113b1cc..40ad88c12c64 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -134,7 +134,7 @@ int hfs_part_find(struct super_block *sb,
134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, 134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
135 data, READ); 135 data, READ);
136 if (res) 136 if (res)
137 return res; 137 goto out;
138 138
139 switch (be16_to_cpu(*((__be16 *)data))) { 139 switch (be16_to_cpu(*((__be16 *)data))) {
140 case HFS_OLD_PMAP_MAGIC: 140 case HFS_OLD_PMAP_MAGIC:
@@ -147,7 +147,7 @@ int hfs_part_find(struct super_block *sb,
147 res = -ENOENT; 147 res = -ENOENT;
148 break; 148 break;
149 } 149 }
150 150out:
151 kfree(data); 151 kfree(data);
152 return res; 152 return res;
153} 153}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a3b4795f43c..b49b55584c84 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -338,20 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
338 struct inode *root, *inode; 338 struct inode *root, *inode;
339 struct qstr str; 339 struct qstr str;
340 struct nls_table *nls = NULL; 340 struct nls_table *nls = NULL;
341 int err = -EINVAL; 341 int err;
342 342
343 err = -EINVAL;
343 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 344 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
344 if (!sbi) 345 if (!sbi)
345 return -ENOMEM; 346 goto out;
346 347
347 sb->s_fs_info = sbi; 348 sb->s_fs_info = sbi;
348 mutex_init(&sbi->alloc_mutex); 349 mutex_init(&sbi->alloc_mutex);
349 mutex_init(&sbi->vh_mutex); 350 mutex_init(&sbi->vh_mutex);
350 hfsplus_fill_defaults(sbi); 351 hfsplus_fill_defaults(sbi);
352
353 err = -EINVAL;
351 if (!hfsplus_parse_options(data, sbi)) { 354 if (!hfsplus_parse_options(data, sbi)) {
352 printk(KERN_ERR "hfs: unable to parse mount options\n"); 355 printk(KERN_ERR "hfs: unable to parse mount options\n");
353 err = -EINVAL; 356 goto out_unload_nls;
354 goto cleanup;
355 } 357 }
356 358
357 /* temporarily use utf8 to correctly find the hidden dir below */ 359 /* temporarily use utf8 to correctly find the hidden dir below */
@@ -359,16 +361,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
359 sbi->nls = load_nls("utf8"); 361 sbi->nls = load_nls("utf8");
360 if (!sbi->nls) { 362 if (!sbi->nls) {
361 printk(KERN_ERR "hfs: unable to load nls for utf8\n"); 363 printk(KERN_ERR "hfs: unable to load nls for utf8\n");
362 err = -EINVAL; 364 goto out_unload_nls;
363 goto cleanup;
364 } 365 }
365 366
366 /* Grab the volume header */ 367 /* Grab the volume header */
367 if (hfsplus_read_wrapper(sb)) { 368 if (hfsplus_read_wrapper(sb)) {
368 if (!silent) 369 if (!silent)
369 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); 370 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
370 err = -EINVAL; 371 goto out_unload_nls;
371 goto cleanup;
372 } 372 }
373 vhdr = sbi->s_vhdr; 373 vhdr = sbi->s_vhdr;
374 374
@@ -377,7 +377,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || 377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { 378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
379 printk(KERN_ERR "hfs: wrong filesystem version\n"); 379 printk(KERN_ERR "hfs: wrong filesystem version\n");
380 goto cleanup; 380 goto out_free_vhdr;
381 } 381 }
382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); 382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); 383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
@@ -421,19 +421,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
422 if (!sbi->ext_tree) { 422 if (!sbi->ext_tree) {
423 printk(KERN_ERR "hfs: failed to load extents file\n"); 423 printk(KERN_ERR "hfs: failed to load extents file\n");
424 goto cleanup; 424 goto out_free_vhdr;
425 } 425 }
426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
427 if (!sbi->cat_tree) { 427 if (!sbi->cat_tree) {
428 printk(KERN_ERR "hfs: failed to load catalog file\n"); 428 printk(KERN_ERR "hfs: failed to load catalog file\n");
429 goto cleanup; 429 goto out_close_ext_tree;
430 } 430 }
431 431
432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); 432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
433 if (IS_ERR(inode)) { 433 if (IS_ERR(inode)) {
434 printk(KERN_ERR "hfs: failed to load allocation file\n"); 434 printk(KERN_ERR "hfs: failed to load allocation file\n");
435 err = PTR_ERR(inode); 435 err = PTR_ERR(inode);
436 goto cleanup; 436 goto out_close_cat_tree;
437 } 437 }
438 sbi->alloc_file = inode; 438 sbi->alloc_file = inode;
439 439
@@ -442,14 +442,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
442 if (IS_ERR(root)) { 442 if (IS_ERR(root)) {
443 printk(KERN_ERR "hfs: failed to load root directory\n"); 443 printk(KERN_ERR "hfs: failed to load root directory\n");
444 err = PTR_ERR(root); 444 err = PTR_ERR(root);
445 goto cleanup; 445 goto out_put_alloc_file;
446 }
447 sb->s_d_op = &hfsplus_dentry_operations;
448 sb->s_root = d_alloc_root(root);
449 if (!sb->s_root) {
450 iput(root);
451 err = -ENOMEM;
452 goto cleanup;
453 } 446 }
454 447
455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 448 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
@@ -459,46 +452,69 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
459 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { 452 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
460 hfs_find_exit(&fd); 453 hfs_find_exit(&fd);
461 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) 454 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
462 goto cleanup; 455 goto out_put_root;
463 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); 456 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
464 if (IS_ERR(inode)) { 457 if (IS_ERR(inode)) {
465 err = PTR_ERR(inode); 458 err = PTR_ERR(inode);
466 goto cleanup; 459 goto out_put_root;
467 } 460 }
468 sbi->hidden_dir = inode; 461 sbi->hidden_dir = inode;
469 } else 462 } else
470 hfs_find_exit(&fd); 463 hfs_find_exit(&fd);
471 464
472 if (sb->s_flags & MS_RDONLY) 465 if (!(sb->s_flags & MS_RDONLY)) {
473 goto out; 466 /*
467 * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
468 * all three are registered with Apple for our use
469 */
470 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
471 vhdr->modify_date = hfsp_now2mt();
472 be32_add_cpu(&vhdr->write_count, 1);
473 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
474 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
475 hfsplus_sync_fs(sb, 1);
474 476
475 /* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused 477 if (!sbi->hidden_dir) {
476 * all three are registered with Apple for our use 478 mutex_lock(&sbi->vh_mutex);
477 */ 479 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
478 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); 480 hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
479 vhdr->modify_date = hfsp_now2mt(); 481 sbi->hidden_dir);
480 be32_add_cpu(&vhdr->write_count, 1); 482 mutex_unlock(&sbi->vh_mutex);
481 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 483
482 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 484 hfsplus_mark_inode_dirty(sbi->hidden_dir,
483 hfsplus_sync_fs(sb, 1); 485 HFSPLUS_I_CAT_DIRTY);
484 486 }
485 if (!sbi->hidden_dir) {
486 mutex_lock(&sbi->vh_mutex);
487 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
488 hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
489 &str, sbi->hidden_dir);
490 mutex_unlock(&sbi->vh_mutex);
491
492 hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
493 } 487 }
494out: 488
489 sb->s_d_op = &hfsplus_dentry_operations;
490 sb->s_root = d_alloc_root(root);
491 if (!sb->s_root) {
492 err = -ENOMEM;
493 goto out_put_hidden_dir;
494 }
495
495 unload_nls(sbi->nls); 496 unload_nls(sbi->nls);
496 sbi->nls = nls; 497 sbi->nls = nls;
497 return 0; 498 return 0;
498 499
499cleanup: 500out_put_hidden_dir:
500 hfsplus_put_super(sb); 501 iput(sbi->hidden_dir);
502out_put_root:
503	iput(root);
504out_put_alloc_file:
505 iput(sbi->alloc_file);
506out_close_cat_tree:
507 hfs_btree_close(sbi->cat_tree);
508out_close_ext_tree:
509 hfs_btree_close(sbi->ext_tree);
510out_free_vhdr:
511 kfree(sbi->s_vhdr);
512 kfree(sbi->s_backup_vhdr);
513out_unload_nls:
514 unload_nls(sbi->nls);
501 unload_nls(nls); 515 unload_nls(nls);
516 kfree(sbi);
517out:
502 return err; 518 return err;
503} 519}
504 520
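
/*
 * Aside, not part of the patch: the hfsplus_fill_super() rework above
 * replaces one catch-all cleanup label with the usual ladder that
 * unwinds resources in reverse order of acquisition. The shape reduced
 * to a runnable sketch, with a buffer and a file as stand-in resources:
 */
#include <stdio.h>
#include <stdlib.h>

static int setup(const char *path)
{
	int err = -1;
	char *buf;
	FILE *f;

	buf = malloc(4096);
	if (!buf)
		goto out;

	f = fopen(path, "r");
	if (!f)
		goto out_free_buf;	/* undo malloc() only */

	/* treat an empty read as failure, just for the demo */
	if (fread(buf, 1, 4096, f) == 0)
		goto out_close_f;	/* undo fopen(), then malloc() */

	fclose(f);
	free(buf);
	return 0;

out_close_f:
	fclose(f);
out_free_buf:
	free(buf);
out:
	return err;
}

int main(void)
{
	return setup("/etc/hostname") ? 1 : 0;
}
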
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 196231794f64..3031d81f5f0f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -167,7 +167,7 @@ reread:
167 break; 167 break;
168 case cpu_to_be16(HFSP_WRAP_MAGIC): 168 case cpu_to_be16(HFSP_WRAP_MAGIC):
169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) 169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
170 goto out; 170 goto out_free_backup_vhdr;
171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; 171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; 172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
173 part_size = wd.embed_count * wd.ablk_size; 173 part_size = wd.embed_count * wd.ablk_size;
@@ -179,7 +179,7 @@ reread:
179 * (should do this only for cdrom/loop though) 179 * (should do this only for cdrom/loop though)
180 */ 180 */
181 if (hfs_part_find(sb, &part_start, &part_size)) 181 if (hfs_part_find(sb, &part_start, &part_size))
182 goto out; 182 goto out_free_backup_vhdr;
183 goto reread; 183 goto reread;
184 } 184 }
185 185
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f0da1cfd10..1ae35baa539e 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
281 attr->ia_size != i_size_read(inode)) { 281 attr->ia_size != i_size_read(inode)) {
282 error = vmtruncate(inode, attr->ia_size); 282 error = vmtruncate(inode, attr->ia_size);
283 if (error) 283 if (error)
284 return error; 284 goto out_unlock;
285 } 285 }
286 286
287 setattr_copy(inode, attr); 287 setattr_copy(inode, attr);
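
/*
 * Aside, not part of the patch: the hpfs_setattr() hunk turns an early
 * return into a goto so the lock taken at function entry (not shown in
 * the hunk) is always dropped. The bug shape and the fix, standalone
 * with pthreads (build with -pthread):
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_lock = PTHREAD_MUTEX_INITIALIZER;

static int do_setattr(int bad_size)
{
	int error = 0;

	pthread_mutex_lock(&i_lock);
	if (bad_size) {
		error = -EINVAL;
		goto out_unlock;	/* a bare "return error" here
					 * would leave i_lock held */
	}
	/* ... apply the attributes ... */
out_unlock:
	pthread_mutex_unlock(&i_lock);
	return error;
}

int main(void)
{
	printf("%d\n", do_setattr(1));	/* -22, and the lock is free */
	return 0;
}
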
diff --git a/fs/internal.h b/fs/internal.h
index 9687c2ee2735..0663568b1247 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,10 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
70extern void release_mounts(struct list_head *); 70extern void release_mounts(struct list_head *);
71extern void umount_tree(struct vfsmount *, int, struct list_head *); 71extern void umount_tree(struct vfsmount *, int, struct list_head *);
72extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); 72extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
73extern int finish_automount(struct vfsmount *, struct path *);
74
75extern void mnt_make_longterm(struct vfsmount *);
76extern void mnt_make_shortterm(struct vfsmount *);
73 77
74extern void __init mnt_init(void); 78extern void __init mnt_init(void);
75 79
diff --git a/fs/ioctl.c b/fs/ioctl.c
index d6cc16476620..1eebeb72b202 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -86,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
86 u64 phys, u64 len, u32 flags) 86 u64 phys, u64 len, u32 flags)
87{ 87{
88 struct fiemap_extent extent; 88 struct fiemap_extent extent;
89 struct fiemap_extent *dest = fieinfo->fi_extents_start; 89 struct fiemap_extent __user *dest = fieinfo->fi_extents_start;
90 90
91 /* only count the extents */ 91 /* only count the extents */
92 if (fieinfo->fi_extents_max == 0) { 92 if (fieinfo->fi_extents_max == 0) {
@@ -173,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb,
173static int ioctl_fiemap(struct file *filp, unsigned long arg) 173static int ioctl_fiemap(struct file *filp, unsigned long arg)
174{ 174{
175 struct fiemap fiemap; 175 struct fiemap fiemap;
176 struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
176 struct fiemap_extent_info fieinfo = { 0, }; 177 struct fiemap_extent_info fieinfo = { 0, };
177 struct inode *inode = filp->f_path.dentry->d_inode; 178 struct inode *inode = filp->f_path.dentry->d_inode;
178 struct super_block *sb = inode->i_sb; 179 struct super_block *sb = inode->i_sb;
@@ -182,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
182 if (!inode->i_op->fiemap) 183 if (!inode->i_op->fiemap)
183 return -EOPNOTSUPP; 184 return -EOPNOTSUPP;
184 185
185 if (copy_from_user(&fiemap, (struct fiemap __user *)arg, 186 if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
186 sizeof(struct fiemap)))
187 return -EFAULT; 187 return -EFAULT;
188 188
189 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) 189 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
@@ -196,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
196 196
197 fieinfo.fi_flags = fiemap.fm_flags; 197 fieinfo.fi_flags = fiemap.fm_flags;
198 fieinfo.fi_extents_max = fiemap.fm_extent_count; 198 fieinfo.fi_extents_max = fiemap.fm_extent_count;
199 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); 199 fieinfo.fi_extents_start = ufiemap->fm_extents;
200 200
201 if (fiemap.fm_extent_count != 0 && 201 if (fiemap.fm_extent_count != 0 &&
202 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, 202 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
@@ -209,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
209 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); 209 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
210 fiemap.fm_flags = fieinfo.fi_flags; 210 fiemap.fm_flags = fieinfo.fi_flags;
211 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; 211 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
212 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) 212 if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
213 error = -EFAULT; 213 error = -EFAULT;
214 214
215 return error; 215 return error;
@@ -273,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode,
273 len = isize; 273 len = isize;
274 } 274 }
275 275
276 /*
277 * Some filesystems can't deal with being asked to map less than
278 * blocksize, so make sure our len is at least block length.
279 */
280 if (logical_to_blk(inode, len) == 0)
281 len = blk_to_logical(inode, 1);
282
276 start_blk = logical_to_blk(inode, start); 283 start_blk = logical_to_blk(inode, start);
277 last_blk = logical_to_blk(inode, start + len - 1); 284 last_blk = logical_to_blk(inode, start + len - 1);
278 285
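
/*
 * Aside, not part of the patch: the __generic_block_fiemap() hunk above
 * clamps requests shorter than one block up to exactly one block. The
 * arithmetic standalone; the 4 KiB block size is hypothetical:
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t logical_to_blk(uint64_t logical, unsigned blkbits)
{
	return logical >> blkbits;
}

static uint64_t blk_to_logical(uint64_t blk, unsigned blkbits)
{
	return blk << blkbits;
}

int main(void)
{
	unsigned blkbits = 12;		/* 4 KiB blocks */
	uint64_t len = 512;		/* sub-block request */

	if (logical_to_blk(len, blkbits) == 0)
		len = blk_to_logical(1, blkbits);	/* clamp to a block */

	printf("len = %llu\n", (unsigned long long)len);	/* 4096 */
	return 0;
}
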
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9e4686900f18..97e73469b2c4 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -473,7 +473,8 @@ int __jbd2_log_space_left(journal_t *journal)
473} 473}
474 474
475/* 475/*
476 * Called under j_state_lock. Returns true if a transaction commit was started. 476 * Called with j_state_lock locked for writing.
477 * Returns true if a transaction commit was started.
477 */ 478 */
478int __jbd2_log_start_commit(journal_t *journal, tid_t target) 479int __jbd2_log_start_commit(journal_t *journal, tid_t target)
479{ 480{
@@ -520,11 +521,13 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
520{ 521{
521 transaction_t *transaction = NULL; 522 transaction_t *transaction = NULL;
522 tid_t tid; 523 tid_t tid;
524 int need_to_start = 0;
523 525
524 read_lock(&journal->j_state_lock); 526 read_lock(&journal->j_state_lock);
525 if (journal->j_running_transaction && !current->journal_info) { 527 if (journal->j_running_transaction && !current->journal_info) {
526 transaction = journal->j_running_transaction; 528 transaction = journal->j_running_transaction;
527 __jbd2_log_start_commit(journal, transaction->t_tid); 529 if (!tid_geq(journal->j_commit_request, transaction->t_tid))
530 need_to_start = 1;
528 } else if (journal->j_committing_transaction) 531 } else if (journal->j_committing_transaction)
529 transaction = journal->j_committing_transaction; 532 transaction = journal->j_committing_transaction;
530 533
@@ -535,6 +538,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
535 538
536 tid = transaction->t_tid; 539 tid = transaction->t_tid;
537 read_unlock(&journal->j_state_lock); 540 read_unlock(&journal->j_state_lock);
541 if (need_to_start)
542 jbd2_log_start_commit(journal, tid);
538 jbd2_log_wait_commit(journal, tid); 543 jbd2_log_wait_commit(journal, tid);
539 return 1; 544 return 1;
540} 545}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index faad2bd787c7..1d1191050f99 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -117,10 +117,10 @@ static inline void update_t_max_wait(transaction_t *transaction)
117static int start_this_handle(journal_t *journal, handle_t *handle, 117static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask) 118 int gfp_mask)
119{ 119{
120 transaction_t *transaction; 120 transaction_t *transaction, *new_transaction = NULL;
121 int needed; 121 tid_t tid;
122 int nblocks = handle->h_buffer_credits; 122 int needed, need_to_start;
123 transaction_t *new_transaction = NULL; 123 int nblocks = handle->h_buffer_credits;
124 124
125 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -222,8 +222,11 @@ repeat:
222 atomic_sub(nblocks, &transaction->t_outstanding_credits); 222 atomic_sub(nblocks, &transaction->t_outstanding_credits);
223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait, 223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
224 TASK_UNINTERRUPTIBLE); 224 TASK_UNINTERRUPTIBLE);
225 __jbd2_log_start_commit(journal, transaction->t_tid); 225 tid = transaction->t_tid;
226 need_to_start = !tid_geq(journal->j_commit_request, tid);
226 read_unlock(&journal->j_state_lock); 227 read_unlock(&journal->j_state_lock);
228 if (need_to_start)
229 jbd2_log_start_commit(journal, tid);
227 schedule(); 230 schedule();
228 finish_wait(&journal->j_wait_transaction_locked, &wait); 231 finish_wait(&journal->j_wait_transaction_locked, &wait);
229 goto repeat; 232 goto repeat;
@@ -442,7 +445,8 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
442{ 445{
443 transaction_t *transaction = handle->h_transaction; 446 transaction_t *transaction = handle->h_transaction;
444 journal_t *journal = transaction->t_journal; 447 journal_t *journal = transaction->t_journal;
445 int ret; 448 tid_t tid;
449 int need_to_start, ret;
446 450
447 /* If we've had an abort of any type, don't even think about 451 /* If we've had an abort of any type, don't even think about
448 * actually doing the restart! */ 452 * actually doing the restart! */
@@ -465,8 +469,11 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
465 spin_unlock(&transaction->t_handle_lock); 469 spin_unlock(&transaction->t_handle_lock);
466 470
467 jbd_debug(2, "restarting handle %p\n", handle); 471 jbd_debug(2, "restarting handle %p\n", handle);
468 __jbd2_log_start_commit(journal, transaction->t_tid); 472 tid = transaction->t_tid;
473 need_to_start = !tid_geq(journal->j_commit_request, tid);
469 read_unlock(&journal->j_state_lock); 474 read_unlock(&journal->j_state_lock);
475 if (need_to_start)
476 jbd2_log_start_commit(journal, tid);
470 477
471 lock_map_release(&handle->h_lockdep_map); 478 lock_map_release(&handle->h_lockdep_map);
472 handle->h_buffer_credits = nblocks; 479 handle->h_buffer_credits = nblocks;
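
/*
 * Aside, not part of the patch: all three jbd2 hunks follow one recipe.
 * Decide under the read lock whether a commit must be requested, drop
 * the lock, and only then call the helper that takes it for writing;
 * the helper re-checks, so the window in between is benign. A pthreads
 * sketch of that shape (tid comparison simplified; build with -pthread):
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t state_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long commit_request;	/* guarded by state_lock */

static void start_commit(unsigned long tid)
{
	pthread_rwlock_wrlock(&state_lock);
	if (commit_request < tid)	/* re-check under the write lock */
		commit_request = tid;
	pthread_rwlock_unlock(&state_lock);
}

static void force_commit(unsigned long running_tid)
{
	int need_to_start;

	pthread_rwlock_rdlock(&state_lock);
	need_to_start = commit_request < running_tid;
	pthread_rwlock_unlock(&state_lock);

	if (need_to_start)
		start_commit(running_tid);
}

int main(void)
{
	force_commit(7);
	printf("commit_request = %lu\n", commit_request);
	return 0;
}
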
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 85c6be2db02f..3005ec4520ad 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; 336 size = sizeof(struct jffs2_eraseblock) * c->nr_blocks;
337#ifndef __ECOS 337#ifndef __ECOS
338 if (jffs2_blocks_use_vmalloc(c)) 338 if (jffs2_blocks_use_vmalloc(c))
339 c->blocks = vmalloc(size); 339 c->blocks = vzalloc(size);
340 else 340 else
341#endif 341#endif
342 c->blocks = kmalloc(size, GFP_KERNEL); 342 c->blocks = kzalloc(size, GFP_KERNEL);
343 if (!c->blocks) 343 if (!c->blocks)
344 return -ENOMEM; 344 return -ENOMEM;
345 345
346 memset(c->blocks, 0, size);
347 for (i=0; i<c->nr_blocks; i++) { 346 for (i=0; i<c->nr_blocks; i++) {
348 INIT_LIST_HEAD(&c->blocks[i].list); 347 INIT_LIST_HEAD(&c->blocks[i].list);
349 c->blocks[i].offset = i * c->sector_size; 348 c->blocks[i].offset = i * c->sector_size;
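
/*
 * Aside, not part of the patch: the build.c hunk drops the explicit
 * memset() by switching to the zeroing allocators vzalloc()/kzalloc().
 * The userspace analogue of the before and after:
 */
#include <stdlib.h>
#include <string.h>

int main(void)
{
	size_t n = 16, size = 32;
	char *a = malloc(n * size);	/* old shape ... */
	char *b = calloc(n, size);	/* new shape: already zeroed */

	if (!a || !b)
		return 1;
	memset(a, 0, n * size);		/* ... plus the memset the patch
					 * deletes */
	free(a);
	free(b);
	return 0;
}
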
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index f864005de64c..0bc6a6c80a56 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -144,4 +144,4 @@ struct jffs2_sb_info {
144 void *os_priv; 144 void *os_priv;
145}; 145};
146 146
147#endif /* _JFFS2_FB_SB */ 147#endif /* _JFFS2_FS_SB */
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9b572ca40a49..4f9cc0482949 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 151 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
152 offset, je32_to_cpu(rx.hdr_crc), crc); 152 offset, je32_to_cpu(rx.hdr_crc), crc);
153 xd->flags |= JFFS2_XFLAGS_INVALID; 153 xd->flags |= JFFS2_XFLAGS_INVALID;
154 return EIO; 154 return -EIO;
155 } 155 }
156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); 156 totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK 157 if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
@@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
167 je32_to_cpu(rx.xid), xd->xid, 167 je32_to_cpu(rx.xid), xd->xid,
168 je32_to_cpu(rx.version), xd->version); 168 je32_to_cpu(rx.version), xd->version);
169 xd->flags |= JFFS2_XFLAGS_INVALID; 169 xd->flags |= JFFS2_XFLAGS_INVALID;
170 return EIO; 170 return -EIO;
171 } 171 }
172 xd->xprefix = rx.xprefix; 172 xd->xprefix = rx.xprefix;
173 xd->name_len = rx.name_len; 173 xd->name_len = rx.name_len;
@@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
230 ref_offset(xd->node), xd->data_crc, crc); 230 ref_offset(xd->node), xd->data_crc, crc);
231 kfree(data); 231 kfree(data);
232 xd->flags |= JFFS2_XFLAGS_INVALID; 232 xd->flags |= JFFS2_XFLAGS_INVALID;
233 return EIO; 233 return -EIO;
234 } 234 }
235 235
236 xd->flags |= JFFS2_XFLAGS_HOT; 236 xd->flags |= JFFS2_XFLAGS_HOT;
@@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
268 if (xd->xname) 268 if (xd->xname)
269 return 0; 269 return 0;
270 if (xd->flags & JFFS2_XFLAGS_INVALID) 270 if (xd->flags & JFFS2_XFLAGS_INVALID)
271 return EIO; 271 return -EIO;
272 if (unlikely(is_xattr_datum_unchecked(c, xd))) 272 if (unlikely(is_xattr_datum_unchecked(c, xd)))
273 rc = do_verify_xattr_datum(c, xd); 273 rc = do_verify_xattr_datum(c, xd);
274 if (!rc) 274 if (!rc)
@@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
460 if (crc != je32_to_cpu(rr.node_crc)) { 460 if (crc != je32_to_cpu(rr.node_crc)) {
461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", 461 JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
462 offset, je32_to_cpu(rr.node_crc), crc); 462 offset, je32_to_cpu(rr.node_crc), crc);
463 return EIO; 463 return -EIO;
464 } 464 }
465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK 465 if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF 466 || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
@@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, 470 offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, 471 je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
472 je32_to_cpu(rr.totlen), PAD(sizeof(rr))); 472 je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
473 return EIO; 473 return -EIO;
474 } 474 }
475 ref->ino = je32_to_cpu(rr.ino); 475 ref->ino = je32_to_cpu(rr.ino);
476 ref->xid = je32_to_cpu(rr.xid); 476 ref->xid = je32_to_cpu(rr.xid);
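
/*
 * Aside, not part of the patch: the xattr hunks flip EIO to -EIO,
 * matching the kernel convention that failure is a negative errno.
 * Callers test "rc < 0", so a positive EIO reads as success:
 */
#include <errno.h>
#include <stdio.h>

static int verify(int crc_ok)
{
	return crc_ok ? 0 : -EIO;	/* negative errno on failure */
}

int main(void)
{
	int rc = verify(0);

	if (rc < 0)			/* positive EIO would slip past */
		printf("failed: %d\n", rc);
	return 0;
}
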
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 5f1bcb2f06f3..b7c99bfb3da6 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -520,7 +520,7 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
520 struct nsm_handle *nsm, 520 struct nsm_handle *nsm,
521 const struct nlm_reboot *info) 521 const struct nlm_reboot *info)
522{ 522{
523 struct nlm_host *host = NULL; 523 struct nlm_host *host;
524 struct hlist_head *chain; 524 struct hlist_head *chain;
525 struct hlist_node *pos; 525 struct hlist_node *pos;
526 526
@@ -532,12 +532,13 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
532 host->h_state++; 532 host->h_state++;
533 533
534 nlm_get_host(host); 534 nlm_get_host(host);
535 goto out; 535 mutex_unlock(&nlm_host_mutex);
536 return host;
536 } 537 }
537 } 538 }
538out: 539
539 mutex_unlock(&nlm_host_mutex); 540 mutex_unlock(&nlm_host_mutex);
540 return host; 541 return NULL;
541} 542}
542 543
543/** 544/**
diff --git a/fs/locks.c b/fs/locks.c
index 08415b2a6d36..0f3998291f78 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -444,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl)
444 fl->fl_file->f_owner.signum = 0; 444 fl->fl_file->f_owner.signum = 0;
445} 445}
446 446
447static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
448{
449 return fl->fl_file == try->fl_file;
450}
451
452static const struct lock_manager_operations lease_manager_ops = { 447static const struct lock_manager_operations lease_manager_ops = {
453 .fl_break = lease_break_callback, 448 .fl_break = lease_break_callback,
454 .fl_release_private = lease_release_private_callback, 449 .fl_release_private = lease_release_private_callback,
455 .fl_mylease = lease_mylease_callback,
456 .fl_change = lease_modify, 450 .fl_change = lease_modify,
457}; 451};
458 452
@@ -1405,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1405 for (before = &inode->i_flock; 1399 for (before = &inode->i_flock;
1406 ((fl = *before) != NULL) && IS_LEASE(fl); 1400 ((fl = *before) != NULL) && IS_LEASE(fl);
1407 before = &fl->fl_next) { 1401 before = &fl->fl_next) {
1408 if (lease->fl_lmops->fl_mylease(fl, lease)) 1402 if (fl->fl_file == filp)
1409 my_before = before; 1403 my_before = before;
1410 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) 1404 else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
1411 /* 1405 /*
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..d78455a81ec9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
40 * status of that page is hard. See end_buffer_async_read() for the details. 40 * status of that page is hard. See end_buffer_async_read() for the details.
41 * There is no point in duplicating all that complexity. 41 * There is no point in duplicating all that complexity.
42 */ 42 */
43static void mpage_end_io_read(struct bio *bio, int err) 43static void mpage_end_io(struct bio *bio, int err)
44{ 44{
45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 45 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 46 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
50 50
51 if (--bvec >= bio->bi_io_vec) 51 if (--bvec >= bio->bi_io_vec)
52 prefetchw(&bvec->bv_page->flags); 52 prefetchw(&bvec->bv_page->flags);
53 53 if (bio_data_dir(bio) == READ) {
54 if (uptodate) { 54 if (uptodate) {
55 SetPageUptodate(page); 55 SetPageUptodate(page);
56 } else { 56 } else {
57 ClearPageUptodate(page); 57 ClearPageUptodate(page);
58 SetPageError(page); 58 SetPageError(page);
59 } 59 }
60 unlock_page(page); 60 unlock_page(page);
61 } while (bvec >= bio->bi_io_vec); 61 } else { /* bio_data_dir(bio) == WRITE */
62 bio_put(bio); 62 if (!uptodate) {
63} 63 SetPageError(page);
64 64 if (page->mapping)
65static void mpage_end_io_write(struct bio *bio, int err) 65 set_bit(AS_EIO, &page->mapping->flags);
66{ 66 }
67 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 67 end_page_writeback(page);
68 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
69
70 do {
71 struct page *page = bvec->bv_page;
72
73 if (--bvec >= bio->bi_io_vec)
74 prefetchw(&bvec->bv_page->flags);
75
76 if (!uptodate){
77 SetPageError(page);
78 if (page->mapping)
79 set_bit(AS_EIO, &page->mapping->flags);
80 } 68 }
81 end_page_writeback(page);
82 } while (bvec >= bio->bi_io_vec); 69 } while (bvec >= bio->bi_io_vec);
83 bio_put(bio); 70 bio_put(bio);
84} 71}
85 72
86static struct bio *mpage_bio_submit(int rw, struct bio *bio) 73static struct bio *mpage_bio_submit(int rw, struct bio *bio)
87{ 74{
88 bio->bi_end_io = mpage_end_io_read; 75 bio->bi_end_io = mpage_end_io;
89 if (rw == WRITE)
90 bio->bi_end_io = mpage_end_io_write;
91 submit_bio(rw, bio); 76 submit_bio(rw, bio);
92 return NULL; 77 return NULL;
93} 78}
diff --git a/fs/namei.c b/fs/namei.c
index 0b14f6910fc6..ec4b2d0190a8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -368,18 +368,6 @@ void path_get(struct path *path)
368EXPORT_SYMBOL(path_get); 368EXPORT_SYMBOL(path_get);
369 369
370/** 370/**
371 * path_get_long - get a long reference to a path
372 * @path: path to get the reference to
373 *
374 * Given a path increment the reference count to the dentry and the vfsmount.
375 */
376void path_get_long(struct path *path)
377{
378 mntget_long(path->mnt);
379 dget(path->dentry);
380}
381
382/**
383 * path_put - put a reference to a path 371 * path_put - put a reference to a path
384 * @path: path to put the reference to 372 * @path: path to put the reference to
385 * 373 *
@@ -393,18 +381,6 @@ void path_put(struct path *path)
393EXPORT_SYMBOL(path_put); 381EXPORT_SYMBOL(path_put);
394 382
395/** 383/**
396 * path_put_long - put a long reference to a path
397 * @path: path to put the reference to
398 *
399 * Given a path decrement the reference count to the dentry and the vfsmount.
400 */
401void path_put_long(struct path *path)
402{
403 dput(path->dentry);
404 mntput_long(path->mnt);
405}
406
407/**
408 * nameidata_drop_rcu - drop this nameidata out of rcu-walk 384 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
409 * @nd: nameidata pathwalk data to drop 385 * @nd: nameidata pathwalk data to drop
410 * Returns: 0 on success, -ECHILD on failure 386 * Returns: 0 on success, -ECHILD on failure
@@ -479,6 +455,14 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
479 struct fs_struct *fs = current->fs; 455 struct fs_struct *fs = current->fs;
480 struct dentry *parent = nd->path.dentry; 456 struct dentry *parent = nd->path.dentry;
481 457
458 /*
459 * It can be possible to revalidate the dentry that we started
460 * the path walk with. force_reval_path may also revalidate the
461 * dentry already committed to the nameidata.
462 */
463 if (unlikely(parent == dentry))
464 return nameidata_drop_rcu(nd);
465
482 BUG_ON(!(nd->flags & LOOKUP_RCU)); 466 BUG_ON(!(nd->flags & LOOKUP_RCU));
483 if (nd->root.mnt) { 467 if (nd->root.mnt) {
484 spin_lock(&fs->lock); 468 spin_lock(&fs->lock);
@@ -577,12 +561,23 @@ static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
577 */ 561 */
578void release_open_intent(struct nameidata *nd) 562void release_open_intent(struct nameidata *nd)
579{ 563{
580 if (nd->intent.open.file->f_path.dentry == NULL) 564 struct file *file = nd->intent.open.file;
581 put_filp(nd->intent.open.file); 565
582 else 566 if (file && !IS_ERR(file)) {
583 fput(nd->intent.open.file); 567 if (file->f_path.dentry == NULL)
568 put_filp(file);
569 else
570 fput(file);
571 }
584} 572}
585 573
574/*
575 * Call d_revalidate and handle filesystems that request rcu-walk
576 * to be dropped. This may be called and return in rcu-walk mode,
577 * regardless of success or error. If -ECHILD is returned, the caller
578 * must return -ECHILD back up the path walk stack so path walk may
579 * be restarted in ref-walk mode.
580 */
586static int d_revalidate(struct dentry *dentry, struct nameidata *nd) 581static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
587{ 582{
588 int status; 583 int status;
@@ -673,6 +668,9 @@ force_reval_path(struct path *path, struct nameidata *nd)
673 return 0; 668 return 0;
674 669
675 if (!status) { 670 if (!status) {
671 /* Don't d_invalidate in rcu-walk mode */
672 if (nameidata_drop_rcu(nd))
673 return -ECHILD;
676 d_invalidate(dentry); 674 d_invalidate(dentry);
677 status = -ESTALE; 675 status = -ESTALE;
678 } 676 }
@@ -761,7 +759,8 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
761 mntput(path->mnt); 759 mntput(path->mnt);
762} 760}
763 761
764static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 762static inline void path_to_nameidata(const struct path *path,
763 struct nameidata *nd)
765{ 764{
766 if (!(nd->flags & LOOKUP_RCU)) { 765 if (!(nd->flags & LOOKUP_RCU)) {
767 dput(nd->path.dentry); 766 dput(nd->path.dentry);
@@ -773,20 +772,16 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
773} 772}
774 773
775static __always_inline int 774static __always_inline int
776__do_follow_link(struct path *path, struct nameidata *nd, void **p) 775__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
777{ 776{
778 int error; 777 int error;
779 struct dentry *dentry = path->dentry; 778 struct dentry *dentry = link->dentry;
780 779
781 touch_atime(path->mnt, dentry); 780 touch_atime(link->mnt, dentry);
782 nd_set_link(nd, NULL); 781 nd_set_link(nd, NULL);
783 782
784 if (path->mnt != nd->path.mnt) { 783 if (link->mnt == nd->path.mnt)
785 path_to_nameidata(path, nd); 784 mntget(link->mnt);
786 nd->inode = nd->path.dentry->d_inode;
787 dget(dentry);
788 }
789 mntget(path->mnt);
790 785
791 nd->last_type = LAST_BIND; 786 nd->last_type = LAST_BIND;
792 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 787 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
@@ -877,54 +872,148 @@ int follow_up(struct path *path)
877} 872}
878 873
879/* 874/*
880 * serialization is taken care of in namespace.c 875 * Perform an automount
876 * - return -EISDIR to tell follow_managed() to stop and return the path we
877 * were called with.
881 */ 878 */
882static void __follow_mount_rcu(struct nameidata *nd, struct path *path, 879static int follow_automount(struct path *path, unsigned flags,
883 struct inode **inode) 880 bool *need_mntput)
884{ 881{
885 while (d_mountpoint(path->dentry)) { 882 struct vfsmount *mnt;
886 struct vfsmount *mounted; 883 int err;
887 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 884
888 if (!mounted) 885 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
889 return; 886 return -EREMOTE;
890 path->mnt = mounted; 887
891 path->dentry = mounted->mnt_root; 888 /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
892 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 889 * and this is the terminal part of the path.
893 *inode = path->dentry->d_inode; 890 */
891 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
892 return -EISDIR; /* we actually want to stop here */
893
894 /* We want to mount if someone is trying to open/create a file of any
895 * type under the mountpoint, wants to traverse through the mountpoint
896 * or wants to open the mounted directory.
897 *
898 * We don't want to mount if someone's just doing a stat and they've
899 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
900 * appended a '/' to the name.
901 */
902 if (!(flags & LOOKUP_FOLLOW) &&
903 !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
904 LOOKUP_OPEN | LOOKUP_CREATE)))
905 return -EISDIR;
906
907 current->total_link_count++;
908 if (current->total_link_count >= 40)
909 return -ELOOP;
910
911 mnt = path->dentry->d_op->d_automount(path);
912 if (IS_ERR(mnt)) {
913 /*
914 * The filesystem is allowed to return -EISDIR here to indicate
915 * it doesn't want to automount. For instance, autofs would do
916 * this so that its userspace daemon can mount on this dentry.
917 *
918 * However, we can only permit this if it's a terminal point in
919 * the path being looked up; if it wasn't then the remainder of
920 * the path is inaccessible and we should say so.
921 */
922 if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
923 return -EREMOTE;
924 return PTR_ERR(mnt);
894 } 925 }
895}
896 926
897static int __follow_mount(struct path *path) 927 if (!mnt) /* mount collision */
898{ 928 return 0;
899 int res = 0; 929
900 while (d_mountpoint(path->dentry)) { 930 err = finish_automount(mnt, path);
901 struct vfsmount *mounted = lookup_mnt(path); 931
902 if (!mounted) 932 switch (err) {
903 break; 933 case -EBUSY:
934 /* Someone else made a mount here whilst we were busy */
935 return 0;
936 case 0:
904 dput(path->dentry); 937 dput(path->dentry);
905 if (res) 938 if (*need_mntput)
906 mntput(path->mnt); 939 mntput(path->mnt);
907 path->mnt = mounted; 940 path->mnt = mnt;
908 path->dentry = dget(mounted->mnt_root); 941 path->dentry = dget(mnt->mnt_root);
909 res = 1; 942 *need_mntput = true;
943 return 0;
944 default:
945 return err;
910 } 946 }
911 return res; 947
912} 948}
913 949
914static void follow_mount(struct path *path) 950/*
951 * Handle a dentry that is managed in some way.
952 * - Flagged for transit management (autofs)
953 * - Flagged as mountpoint
954 * - Flagged as automount point
955 *
956 * This may only be called in refwalk mode.
957 *
958 * Serialization is taken care of in namespace.c
959 */
960static int follow_managed(struct path *path, unsigned flags)
915{ 961{
916 while (d_mountpoint(path->dentry)) { 962 unsigned managed;
917 struct vfsmount *mounted = lookup_mnt(path); 963 bool need_mntput = false;
918 if (!mounted) 964 int ret;
919 break; 965
920 dput(path->dentry); 966 /* Given that we're not holding a lock here, we retain the value in a
921 mntput(path->mnt); 967 * local variable for each dentry as we look at it so that we don't see
922 path->mnt = mounted; 968 * the components of that value change under us */
923 path->dentry = dget(mounted->mnt_root); 969 while (managed = ACCESS_ONCE(path->dentry->d_flags),
970 managed &= DCACHE_MANAGED_DENTRY,
971 unlikely(managed != 0)) {
972 /* Allow the filesystem to manage the transit without i_mutex
973 * being held. */
974 if (managed & DCACHE_MANAGE_TRANSIT) {
975 BUG_ON(!path->dentry->d_op);
976 BUG_ON(!path->dentry->d_op->d_manage);
977 ret = path->dentry->d_op->d_manage(path->dentry,
978 false, false);
979 if (ret < 0)
980 return ret == -EISDIR ? 0 : ret;
981 }
982
983 /* Transit to a mounted filesystem. */
984 if (managed & DCACHE_MOUNTED) {
985 struct vfsmount *mounted = lookup_mnt(path);
986 if (mounted) {
987 dput(path->dentry);
988 if (need_mntput)
989 mntput(path->mnt);
990 path->mnt = mounted;
991 path->dentry = dget(mounted->mnt_root);
992 need_mntput = true;
993 continue;
994 }
995
996 /* Something is mounted on this dentry in another
997 * namespace and/or whatever was mounted there in this
998 * namespace got unmounted before we managed to get the
999 * vfsmount_lock */
1000 }
1001
1002 /* Handle an automount point */
1003 if (managed & DCACHE_NEED_AUTOMOUNT) {
1004 ret = follow_automount(path, flags, &need_mntput);
1005 if (ret < 0)
1006 return ret == -EISDIR ? 0 : ret;
1007 continue;
1008 }
1009
1010 /* We didn't change the current path point */
1011 break;
924 } 1012 }
1013 return 0;
925} 1014}
926 1015
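d_manage() is the filesystem's per-dentry veto on the walk. As follow_managed() consumes it above, the return semantics read as sketched below; the body is illustrative only (example_wait_for_expire() is hypothetical, and this is not autofs's actual implementation).

	/* d_manage(dentry, mounting_here, rcu_walk):
	 *   0        let the walk transit the dentry
	 *   -EISDIR  stop transiting; use the dentry as it stands
	 *   other -ve errno  fail the lookup with that error
	 * In rcu-walk mode (__follow_mount_rcu() below) any negative return
	 * merely aborts rcu-walk; the walk is retried in ref-walk mode.
	 */
	static int example_d_manage(struct dentry *dentry, bool mounting_here,
				    bool rcu_walk)
	{
		if (rcu_walk)
			return -ECHILD;	/* cannot sleep here; force ref-walk */
		if (!mounting_here)
			example_wait_for_expire(dentry); /* hypothetical; may sleep */
		return 0;
	}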
927int follow_down(struct path *path) 1016int follow_down_one(struct path *path)
928{ 1017{
929 struct vfsmount *mounted; 1018 struct vfsmount *mounted;
930 1019
@@ -939,13 +1028,41 @@ int follow_down(struct path *path)
939 return 0; 1028 return 0;
940} 1029}
941 1030
1031/*
1032 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
1033 * meet a managed dentry and we're not walking to "..". True is returned to
1034 * continue, false to abort.
1035 */
1036static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1037 struct inode **inode, bool reverse_transit)
1038{
1039 while (d_mountpoint(path->dentry)) {
1040 struct vfsmount *mounted;
1041 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
1042 !reverse_transit &&
1043 path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
1044 return false;
1045 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1046 if (!mounted)
1047 break;
1048 path->mnt = mounted;
1049 path->dentry = mounted->mnt_root;
1050 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1051 *inode = path->dentry->d_inode;
1052 }
1053
1054 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1055 return reverse_transit;
1056 return true;
1057}
1058
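The reverse_transit flag is what separates the helper's two call sites:

	/* follow_dotdot_rcu(): __follow_mount_rcu(nd, &nd->path, &inode, true)
	 *   ".." walks skip the d_manage() transit check and tolerate an
	 *   automount point at the top of the pile.
	 * do_lookup():         __follow_mount_rcu(nd, path, inode, false)
	 *   forward walks return false on either condition, dropping the
	 *   caller out of rcu-walk so that follow_managed() can block.
	 */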
942static int follow_dotdot_rcu(struct nameidata *nd) 1059static int follow_dotdot_rcu(struct nameidata *nd)
943{ 1060{
944 struct inode *inode = nd->inode; 1061 struct inode *inode = nd->inode;
945 1062
946 set_root_rcu(nd); 1063 set_root_rcu(nd);
947 1064
948 while(1) { 1065 while (1) {
949 if (nd->path.dentry == nd->root.dentry && 1066 if (nd->path.dentry == nd->root.dentry &&
950 nd->path.mnt == nd->root.mnt) { 1067 nd->path.mnt == nd->root.mnt) {
951 break; 1068 break;
@@ -968,12 +1085,80 @@ static int follow_dotdot_rcu(struct nameidata *nd)
968 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 1085 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
969 inode = nd->path.dentry->d_inode; 1086 inode = nd->path.dentry->d_inode;
970 } 1087 }
971 __follow_mount_rcu(nd, &nd->path, &inode); 1088 __follow_mount_rcu(nd, &nd->path, &inode, true);
972 nd->inode = inode; 1089 nd->inode = inode;
973 1090
974 return 0; 1091 return 0;
975} 1092}
976 1093
1094/*
1095 * Follow down to the covering mount currently visible to userspace. At each
1096 * point, the filesystem owning that dentry may be queried as to whether the
1097 * caller is permitted to proceed or not.
1098 *
1099 * Care must be taken as namespace_sem may be held (indicated by mounting_here
1100 * being true).
1101 */
1102int follow_down(struct path *path, bool mounting_here)
1103{
1104 unsigned managed;
1105 int ret;
1106
1107 while (managed = ACCESS_ONCE(path->dentry->d_flags),
1108 unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1109 /* Allow the filesystem to manage the transit without i_mutex
1110 * being held.
1111 *
1112 * We indicate to the filesystem if someone is trying to mount
1113 * something here. This gives autofs the chance to deny anyone
1114 * other than its daemon the right to mount on its
1115 * superstructure.
1116 *
1117 * The filesystem may sleep at this point.
1118 */
1119 if (managed & DCACHE_MANAGE_TRANSIT) {
1120 BUG_ON(!path->dentry->d_op);
1121 BUG_ON(!path->dentry->d_op->d_manage);
1122 ret = path->dentry->d_op->d_manage(
1123 path->dentry, mounting_here, false);
1124 if (ret < 0)
1125 return ret == -EISDIR ? 0 : ret;
1126 }
1127
1128 /* Transit to a mounted filesystem. */
1129 if (managed & DCACHE_MOUNTED) {
1130 struct vfsmount *mounted = lookup_mnt(path);
1131 if (!mounted)
1132 break;
1133 dput(path->dentry);
1134 mntput(path->mnt);
1135 path->mnt = mounted;
1136 path->dentry = dget(mounted->mnt_root);
1137 continue;
1138 }
1139
1140 /* Don't handle automount points here */
1141 break;
1142 }
1143 return 0;
1144}
1145
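Callers that used to spin on the old helper now make one checked call. do_move_mount() and do_add_mount() in the fs/namespace.c part of this diff both adopt the pattern:

	/* Old idiom:
	 *	while (d_mountpoint(path->dentry) && follow_down(path))
	 *		;
	 * New idiom (true: the caller holds namespace_sem for a mount):
	 */
	err = follow_down(path, true);
	if (err < 0)
		goto out;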
1146/*
1147 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1148 */
1149static void follow_mount(struct path *path)
1150{
1151 while (d_mountpoint(path->dentry)) {
1152 struct vfsmount *mounted = lookup_mnt(path);
1153 if (!mounted)
1154 break;
1155 dput(path->dentry);
1156 mntput(path->mnt);
1157 path->mnt = mounted;
1158 path->dentry = dget(mounted->mnt_root);
1159 }
1160}
1161
977static void follow_dotdot(struct nameidata *nd) 1162static void follow_dotdot(struct nameidata *nd)
978{ 1163{
979 set_root(nd); 1164 set_root(nd);
@@ -1038,12 +1223,14 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1038 struct vfsmount *mnt = nd->path.mnt; 1223 struct vfsmount *mnt = nd->path.mnt;
1039 struct dentry *dentry, *parent = nd->path.dentry; 1224 struct dentry *dentry, *parent = nd->path.dentry;
1040 struct inode *dir; 1225 struct inode *dir;
1226 int err;
1227
1041 /* 1228 /*
1042 * See if the low-level filesystem might want 1229 * See if the low-level filesystem might want
1043 * to use its own hash.. 1230 * to use its own hash..
1044 */ 1231 */
1045 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1232 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1046 int err = parent->d_op->d_hash(parent, nd->inode, name); 1233 err = parent->d_op->d_hash(parent, nd->inode, name);
1047 if (err < 0) 1234 if (err < 0)
1048 return err; 1235 return err;
1049 } 1236 }
@@ -1070,22 +1257,30 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1070 nd->seq = seq; 1257 nd->seq = seq;
1071 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1258 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
1072 goto need_revalidate; 1259 goto need_revalidate;
1260done2:
1073 path->mnt = mnt; 1261 path->mnt = mnt;
1074 path->dentry = dentry; 1262 path->dentry = dentry;
1075 __follow_mount_rcu(nd, path, inode); 1263 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1076 } else { 1264 return 0;
1077 dentry = __d_lookup(parent, name); 1265 if (nameidata_drop_rcu(nd))
1078 if (!dentry) 1266 return -ECHILD;
1079 goto need_lookup; 1267 /* fallthru */
1268 }
1269 dentry = __d_lookup(parent, name);
1270 if (!dentry)
1271 goto need_lookup;
1080found: 1272found:
1081 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1273 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
1082 goto need_revalidate; 1274 goto need_revalidate;
1083done: 1275done:
1084 path->mnt = mnt; 1276 path->mnt = mnt;
1085 path->dentry = dentry; 1277 path->dentry = dentry;
1086 __follow_mount(path); 1278 err = follow_managed(path, nd->flags);
1087 *inode = path->dentry->d_inode; 1279 if (unlikely(err < 0)) {
1088 } 1280 path_put_conditional(path, nd);
1281 return err;
1282 }
1283 *inode = path->dentry->d_inode;
1089 return 0; 1284 return 0;
1090 1285
1091need_lookup: 1286need_lookup:
@@ -1124,6 +1319,8 @@ need_revalidate:
1124 goto need_lookup; 1319 goto need_lookup;
1125 if (IS_ERR(dentry)) 1320 if (IS_ERR(dentry))
1126 goto fail; 1321 goto fail;
1322 if (nd->flags & LOOKUP_RCU)
1323 goto done2;
1127 goto done; 1324 goto done;
1128 1325
1129fail: 1326fail:
@@ -1131,17 +1328,6 @@ fail:
1131} 1328}
1132 1329
1133/* 1330/*
1134 * This is a temporary kludge to deal with "automount" symlinks; proper
1135 * solution is to trigger them on follow_mount(), so that do_lookup()
1136 * would DTRT. To be killed before 2.6.34-final.
1137 */
1138static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
1139{
1140 return inode && unlikely(inode->i_op->follow_link) &&
1141 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
1142}
1143
1144/*
1145 * Name resolution. 1331 * Name resolution.
1146 * This is the basic name resolution function, turning a pathname into 1332 * This is the basic name resolution function, turning a pathname into
1147 * the final dentry. We expect 'base' to be positive and a directory. 1333 * the final dentry. We expect 'base' to be positive and a directory.
@@ -1279,7 +1465,8 @@ last_component:
1279 err = do_lookup(nd, &this, &next, &inode); 1465 err = do_lookup(nd, &this, &next, &inode);
1280 if (err) 1466 if (err)
1281 break; 1467 break;
1282 if (follow_on_final(inode, lookup_flags)) { 1468 if (inode && unlikely(inode->i_op->follow_link) &&
1469 (lookup_flags & LOOKUP_FOLLOW)) {
1283 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) 1470 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1284 return -ECHILD; 1471 return -ECHILD;
1285 BUG_ON(inode != next.dentry->d_inode); 1472 BUG_ON(inode != next.dentry->d_inode);
@@ -2082,8 +2269,6 @@ static struct file *finish_open(struct nameidata *nd,
2082 return filp; 2269 return filp;
2083 2270
2084exit: 2271exit:
2085 if (!IS_ERR(nd->intent.open.file))
2086 release_open_intent(nd);
2087 path_put(&nd->path); 2272 path_put(&nd->path);
2088 return ERR_PTR(error); 2273 return ERR_PTR(error);
2089} 2274}
@@ -2105,11 +2290,13 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2105 dir = nd->path.dentry; 2290 dir = nd->path.dentry;
2106 case LAST_DOT: 2291 case LAST_DOT:
2107 if (need_reval_dot(dir)) { 2292 if (need_reval_dot(dir)) {
2108 error = d_revalidate(nd->path.dentry, nd); 2293 int status = d_revalidate(nd->path.dentry, nd);
2109 if (!error) 2294 if (!status)
2110 error = -ESTALE; 2295 status = -ESTALE;
2111 if (error < 0) 2296 if (status < 0) {
2297 error = status;
2112 goto exit; 2298 goto exit;
2299 }
2113 } 2300 }
2114 /* fallthrough */ 2301 /* fallthrough */
2115 case LAST_ROOT: 2302 case LAST_ROOT:
@@ -2179,11 +2366,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2179 if (open_flag & O_EXCL) 2366 if (open_flag & O_EXCL)
2180 goto exit_dput; 2367 goto exit_dput;
2181 2368
2182 if (__follow_mount(path)) { 2369 error = follow_managed(path, nd->flags);
2183 error = -ELOOP; 2370 if (error < 0)
2184 if (open_flag & O_NOFOLLOW) 2371 goto exit_dput;
2185 goto exit_dput;
2186 }
2187 2372
2188 error = -ENOENT; 2373 error = -ENOENT;
2189 if (!path->dentry->d_inode) 2374 if (!path->dentry->d_inode)
@@ -2206,8 +2391,6 @@ exit_mutex_unlock:
2206exit_dput: 2391exit_dput:
2207 path_put_conditional(path, nd); 2392 path_put_conditional(path, nd);
2208exit: 2393exit:
2209 if (!IS_ERR(nd->intent.open.file))
2210 release_open_intent(nd);
2211 path_put(&nd->path); 2394 path_put(&nd->path);
2212 return ERR_PTR(error); 2395 return ERR_PTR(error);
2213} 2396}
@@ -2294,6 +2477,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
2294 } 2477 }
2295 audit_inode(pathname, nd.path.dentry); 2478 audit_inode(pathname, nd.path.dentry);
2296 filp = finish_open(&nd, open_flag, acc_mode); 2479 filp = finish_open(&nd, open_flag, acc_mode);
2480 release_open_intent(&nd);
2297 return filp; 2481 return filp;
2298 2482
2299creat: 2483creat:
@@ -2328,11 +2512,11 @@ reval:
2328 nd.flags = flags; 2512 nd.flags = flags;
2329 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2513 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2330 while (unlikely(!filp)) { /* trailing symlink */ 2514 while (unlikely(!filp)) { /* trailing symlink */
2331 struct path holder; 2515 struct path link = path;
2516 struct inode *linki = link.dentry->d_inode;
2332 void *cookie; 2517 void *cookie;
2333 error = -ELOOP; 2518 error = -ELOOP;
2334 /* S_ISDIR part is a temporary automount kludge */ 2519 if (!(nd.flags & LOOKUP_FOLLOW))
2335 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode))
2336 goto exit_dput; 2520 goto exit_dput;
2337 if (count++ == 32) 2521 if (count++ == 32)
2338 goto exit_dput; 2522 goto exit_dput;
@@ -2348,29 +2532,29 @@ reval:
2348 * just set LAST_BIND. 2532 * just set LAST_BIND.
2349 */ 2533 */
2350 nd.flags |= LOOKUP_PARENT; 2534 nd.flags |= LOOKUP_PARENT;
2351 error = security_inode_follow_link(path.dentry, &nd); 2535 error = security_inode_follow_link(link.dentry, &nd);
2352 if (error) 2536 if (error)
2353 goto exit_dput; 2537 goto exit_dput;
2354 error = __do_follow_link(&path, &nd, &cookie); 2538 error = __do_follow_link(&link, &nd, &cookie);
2355 if (unlikely(error)) { 2539 if (unlikely(error)) {
2356 if (!IS_ERR(cookie) && nd.inode->i_op->put_link) 2540 if (!IS_ERR(cookie) && linki->i_op->put_link)
2357 nd.inode->i_op->put_link(path.dentry, &nd, cookie); 2541 linki->i_op->put_link(link.dentry, &nd, cookie);
2358 /* nd.path had been dropped */ 2542 /* nd.path had been dropped */
2359 nd.path = path; 2543 nd.path = link;
2360 goto out_path; 2544 goto out_path;
2361 } 2545 }
2362 holder = path;
2363 nd.flags &= ~LOOKUP_PARENT; 2546 nd.flags &= ~LOOKUP_PARENT;
2364 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2547 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2365 if (nd.inode->i_op->put_link) 2548 if (linki->i_op->put_link)
2366 nd.inode->i_op->put_link(holder.dentry, &nd, cookie); 2549 linki->i_op->put_link(link.dentry, &nd, cookie);
2367 path_put(&holder); 2550 path_put(&link);
2368 } 2551 }
2369out: 2552out:
2370 if (nd.root.mnt) 2553 if (nd.root.mnt)
2371 path_put(&nd.root); 2554 path_put(&nd.root);
2372 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) 2555 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
2373 goto reval; 2556 goto reval;
2557 release_open_intent(&nd);
2374 return filp; 2558 return filp;
2375 2559
2376exit_dput: 2560exit_dput:
@@ -2378,8 +2562,6 @@ exit_dput:
2378out_path: 2562out_path:
2379 path_put(&nd.path); 2563 path_put(&nd.path);
2380out_filp: 2564out_filp:
2381 if (!IS_ERR(nd.intent.open.file))
2382 release_open_intent(&nd);
2383 filp = ERR_PTR(error); 2565 filp = ERR_PTR(error);
2384 goto out; 2566 goto out;
2385} 2567}
@@ -3392,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = {
3392}; 3574};
3393 3575
3394EXPORT_SYMBOL(user_path_at); 3576EXPORT_SYMBOL(user_path_at);
3577EXPORT_SYMBOL(follow_down_one);
3395EXPORT_SYMBOL(follow_down); 3578EXPORT_SYMBOL(follow_down);
3396EXPORT_SYMBOL(follow_up); 3579EXPORT_SYMBOL(follow_up);
3397EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ 3580EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
diff --git a/fs/namespace.c b/fs/namespace.c
index 3ddfd9046c44..7b0b95371696 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -183,7 +183,7 @@ static inline void mnt_dec_count(struct vfsmount *mnt)
183unsigned int mnt_get_count(struct vfsmount *mnt) 183unsigned int mnt_get_count(struct vfsmount *mnt)
184{ 184{
185#ifdef CONFIG_SMP 185#ifdef CONFIG_SMP
186 unsigned int count = atomic_read(&mnt->mnt_longrefs); 186 unsigned int count = 0;
187 int cpu; 187 int cpu;
188 188
189 for_each_possible_cpu(cpu) { 189 for_each_possible_cpu(cpu) {
@@ -217,7 +217,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
217 if (!mnt->mnt_pcp) 217 if (!mnt->mnt_pcp)
218 goto out_free_devname; 218 goto out_free_devname;
219 219
220 atomic_set(&mnt->mnt_longrefs, 1); 220 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
221#else 221#else
222 mnt->mnt_count = 1; 222 mnt->mnt_count = 1;
223 mnt->mnt_writers = 0; 223 mnt->mnt_writers = 0;
@@ -611,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
611 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); 611 list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
612} 612}
613 613
614static inline void __mnt_make_longterm(struct vfsmount *mnt)
615{
616#ifdef CONFIG_SMP
617 atomic_inc(&mnt->mnt_longterm);
618#endif
619}
620
621/* needs vfsmount lock for write */
622static inline void __mnt_make_shortterm(struct vfsmount *mnt)
623{
624#ifdef CONFIG_SMP
625 atomic_dec(&mnt->mnt_longterm);
626#endif
627}
628
614/* 629/*
615 * vfsmount lock must be held for write 630 * vfsmount lock must be held for write
616 */ 631 */
@@ -624,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt)
624 BUG_ON(parent == mnt); 639 BUG_ON(parent == mnt);
625 640
626 list_add_tail(&head, &mnt->mnt_list); 641 list_add_tail(&head, &mnt->mnt_list);
627 list_for_each_entry(m, &head, mnt_list) 642 list_for_each_entry(m, &head, mnt_list) {
628 m->mnt_ns = n; 643 m->mnt_ns = n;
644 __mnt_make_longterm(m);
645 }
646
629 list_splice(&head, n->list.prev); 647 list_splice(&head, n->list.prev);
630 648
631 list_add_tail(&mnt->mnt_hash, mount_hashtable + 649 list_add_tail(&mnt->mnt_hash, mount_hashtable +
@@ -734,51 +752,30 @@ static inline void mntfree(struct vfsmount *mnt)
734 deactivate_super(sb); 752 deactivate_super(sb);
735} 753}
736 754
737#ifdef CONFIG_SMP 755static void mntput_no_expire(struct vfsmount *mnt)
738static inline void __mntput(struct vfsmount *mnt, int longrefs)
739{ 756{
740 if (!longrefs) {
741put_again: 757put_again:
742 br_read_lock(vfsmount_lock); 758#ifdef CONFIG_SMP
743 if (likely(atomic_read(&mnt->mnt_longrefs))) { 759 br_read_lock(vfsmount_lock);
744 mnt_dec_count(mnt); 760 if (likely(atomic_read(&mnt->mnt_longterm))) {
745 br_read_unlock(vfsmount_lock); 761 mnt_dec_count(mnt);
746 return;
747 }
748 br_read_unlock(vfsmount_lock); 762 br_read_unlock(vfsmount_lock);
749 } else { 763 return;
750 BUG_ON(!atomic_read(&mnt->mnt_longrefs));
751 if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1))
752 return;
753 } 764 }
765 br_read_unlock(vfsmount_lock);
754 766
755 br_write_lock(vfsmount_lock); 767 br_write_lock(vfsmount_lock);
756 if (!longrefs) 768 mnt_dec_count(mnt);
757 mnt_dec_count(mnt);
758 else
759 atomic_dec(&mnt->mnt_longrefs);
760 if (mnt_get_count(mnt)) { 769 if (mnt_get_count(mnt)) {
761 br_write_unlock(vfsmount_lock); 770 br_write_unlock(vfsmount_lock);
762 return; 771 return;
763 } 772 }
764 if (unlikely(mnt->mnt_pinned)) {
765 mnt_add_count(mnt, mnt->mnt_pinned + 1);
766 mnt->mnt_pinned = 0;
767 br_write_unlock(vfsmount_lock);
768 acct_auto_close_mnt(mnt);
769 goto put_again;
770 }
771 br_write_unlock(vfsmount_lock);
772 mntfree(mnt);
773}
774#else 773#else
775static inline void __mntput(struct vfsmount *mnt, int longrefs)
776{
777put_again:
778 mnt_dec_count(mnt); 774 mnt_dec_count(mnt);
779 if (likely(mnt_get_count(mnt))) 775 if (likely(mnt_get_count(mnt)))
780 return; 776 return;
781 br_write_lock(vfsmount_lock); 777 br_write_lock(vfsmount_lock);
778#endif
782 if (unlikely(mnt->mnt_pinned)) { 779 if (unlikely(mnt->mnt_pinned)) {
783 mnt_add_count(mnt, mnt->mnt_pinned + 1); 780 mnt_add_count(mnt, mnt->mnt_pinned + 1);
784 mnt->mnt_pinned = 0; 781 mnt->mnt_pinned = 0;
@@ -789,12 +786,6 @@ put_again:
789 br_write_unlock(vfsmount_lock); 786 br_write_unlock(vfsmount_lock);
790 mntfree(mnt); 787 mntfree(mnt);
791} 788}
792#endif
793
794static void mntput_no_expire(struct vfsmount *mnt)
795{
796 __mntput(mnt, 0);
797}
798 789
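With mnt_longrefs gone, the set of long-term references is exactly what this diff touches:

	/* mnt_longterm is raised and lowered only where a mount gets tied
	 * to a namespace or an fs_struct:
	 *   commit_tree()   -> __mnt_make_longterm() on each grafted mount
	 *   create_mnt_ns() -> __mnt_make_longterm() on the new root
	 *   dup_mnt_ns()    -> __mnt_make_longterm() on every copy, and
	 *                      again for fs->root.mnt and fs->pwd.mnt
	 *   umount_tree()   -> __mnt_make_shortterm() as p->mnt_ns is cleared
	 * Everything else is a plain mntget()/mntput(), which on SMP stays
	 * on the per-cpu fast path for as long as mnt_longterm is non-zero.
	 */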
799void mntput(struct vfsmount *mnt) 790void mntput(struct vfsmount *mnt)
800{ 791{
@@ -802,7 +793,7 @@ void mntput(struct vfsmount *mnt)
802 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ 793 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
803 if (unlikely(mnt->mnt_expiry_mark)) 794 if (unlikely(mnt->mnt_expiry_mark))
804 mnt->mnt_expiry_mark = 0; 795 mnt->mnt_expiry_mark = 0;
805 __mntput(mnt, 0); 796 mntput_no_expire(mnt);
806 } 797 }
807} 798}
808EXPORT_SYMBOL(mntput); 799EXPORT_SYMBOL(mntput);
@@ -815,33 +806,6 @@ struct vfsmount *mntget(struct vfsmount *mnt)
815} 806}
816EXPORT_SYMBOL(mntget); 807EXPORT_SYMBOL(mntget);
817 808
818void mntput_long(struct vfsmount *mnt)
819{
820#ifdef CONFIG_SMP
821 if (mnt) {
822 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
823 if (unlikely(mnt->mnt_expiry_mark))
824 mnt->mnt_expiry_mark = 0;
825 __mntput(mnt, 1);
826 }
827#else
828 mntput(mnt);
829#endif
830}
831EXPORT_SYMBOL(mntput_long);
832
833struct vfsmount *mntget_long(struct vfsmount *mnt)
834{
835#ifdef CONFIG_SMP
836 if (mnt)
837 atomic_inc(&mnt->mnt_longrefs);
838 return mnt;
839#else
840 return mntget(mnt);
841#endif
842}
843EXPORT_SYMBOL(mntget_long);
844
845void mnt_pin(struct vfsmount *mnt) 809void mnt_pin(struct vfsmount *mnt)
846{ 810{
847 br_write_lock(vfsmount_lock); 811 br_write_lock(vfsmount_lock);
@@ -1216,7 +1180,7 @@ void release_mounts(struct list_head *head)
1216 dput(dentry); 1180 dput(dentry);
1217 mntput(m); 1181 mntput(m);
1218 } 1182 }
1219 mntput_long(mnt); 1183 mntput(mnt);
1220 } 1184 }
1221} 1185}
1222 1186
@@ -1226,19 +1190,21 @@ void release_mounts(struct list_head *head)
1226 */ 1190 */
1227void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1191void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1228{ 1192{
1193 LIST_HEAD(tmp_list);
1229 struct vfsmount *p; 1194 struct vfsmount *p;
1230 1195
1231 for (p = mnt; p; p = next_mnt(p, mnt)) 1196 for (p = mnt; p; p = next_mnt(p, mnt))
1232 list_move(&p->mnt_hash, kill); 1197 list_move(&p->mnt_hash, &tmp_list);
1233 1198
1234 if (propagate) 1199 if (propagate)
1235 propagate_umount(kill); 1200 propagate_umount(&tmp_list);
1236 1201
1237 list_for_each_entry(p, kill, mnt_hash) { 1202 list_for_each_entry(p, &tmp_list, mnt_hash) {
1238 list_del_init(&p->mnt_expire); 1203 list_del_init(&p->mnt_expire);
1239 list_del_init(&p->mnt_list); 1204 list_del_init(&p->mnt_list);
1240 __touch_mnt_namespace(p->mnt_ns); 1205 __touch_mnt_namespace(p->mnt_ns);
1241 p->mnt_ns = NULL; 1206 p->mnt_ns = NULL;
1207 __mnt_make_shortterm(p);
1242 list_del_init(&p->mnt_child); 1208 list_del_init(&p->mnt_child);
1243 if (p->mnt_parent != p) { 1209 if (p->mnt_parent != p) {
1244 p->mnt_parent->mnt_ghosts++; 1210 p->mnt_parent->mnt_ghosts++;
@@ -1246,6 +1212,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1246 } 1212 }
1247 change_mnt_propagation(p, MS_PRIVATE); 1213 change_mnt_propagation(p, MS_PRIVATE);
1248 } 1214 }
1215 list_splice(&tmp_list, kill);
1249} 1216}
1250 1217
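umount_tree() now assembles its victims on a private list and only splices them onto the caller's kill list once each mount is fully disconnected:

	LIST_HEAD(tmp_list);
	/* 1: collect the subtree onto the local list */
	for (p = mnt; p; p = next_mnt(p, mnt))
		list_move(&p->mnt_hash, &tmp_list);
	/* 2: propagation and per-mount teardown run against tmp_list,
	 *    including the new __mnt_make_shortterm(p) when mnt_ns clears */
	/* 3: publish the finished entries */
	list_splice(&tmp_list, kill);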
1251static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); 1218static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
@@ -1844,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name)
1844 return err; 1811 return err;
1845 1812
1846 down_write(&namespace_sem); 1813 down_write(&namespace_sem);
1847 while (d_mountpoint(path->dentry) && 1814 err = follow_down(path, true);
1848 follow_down(path)) 1815 if (err < 0)
1849 ; 1816 goto out;
1817
1850 err = -EINVAL; 1818 err = -EINVAL;
1851 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1852 goto out; 1820 goto out;
@@ -1904,6 +1872,8 @@ out:
1904 return err; 1872 return err;
1905} 1873}
1906 1874
1875static int do_add_mount(struct vfsmount *, struct path *, int);
1876
1907/* 1877/*
1908 * create a new mount for userspace and request it to be added into the 1878 * create a new mount for userspace and request it to be added into the
1909 * namespace's tree 1879 * namespace's tree
@@ -1912,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
1912 int mnt_flags, char *name, void *data) 1882 int mnt_flags, char *name, void *data)
1913{ 1883{
1914 struct vfsmount *mnt; 1884 struct vfsmount *mnt;
1885 int err;
1915 1886
1916 if (!type) 1887 if (!type)
1917 return -EINVAL; 1888 return -EINVAL;
@@ -1924,15 +1895,47 @@ static int do_new_mount(struct path *path, char *type, int flags,
1924 if (IS_ERR(mnt)) 1895 if (IS_ERR(mnt))
1925 return PTR_ERR(mnt); 1896 return PTR_ERR(mnt);
1926 1897
1927 return do_add_mount(mnt, path, mnt_flags, NULL); 1898 err = do_add_mount(mnt, path, mnt_flags);
1899 if (err)
1900 mntput(mnt);
1901 return err;
1902}
1903
1904int finish_automount(struct vfsmount *m, struct path *path)
1905{
1906 int err;
1907 /* The new mount record should have at least 2 refs to prevent it being
1908 * expired before we get a chance to add it
1909 */
1910 BUG_ON(mnt_get_count(m) < 2);
1911
1912 if (m->mnt_sb == path->mnt->mnt_sb &&
1913 m->mnt_root == path->dentry) {
1914 err = -ELOOP;
1915 goto fail;
1916 }
1917
1918 err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
1919 if (!err)
1920 return 0;
1921fail:
1922 /* remove m from any expiration list it may be on */
1923 if (!list_empty(&m->mnt_expire)) {
1924 down_write(&namespace_sem);
1925 br_write_lock(vfsmount_lock);
1926 list_del_init(&m->mnt_expire);
1927 br_write_unlock(vfsmount_lock);
1928 up_write(&namespace_sem);
1929 }
1930 mntput(m);
1931 mntput(m);
1932 return err;
1928} 1933}
1929 1934
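Read together with follow_automount() in the fs/namei.c part of this diff, the caller-side handling comes out as:

	/*	err = finish_automount(mnt, path);
	 *	switch (err) {
	 *	case -EBUSY:	lost the race, something is mounted here now;
	 *			just walk into whatever that is (return 0)
	 *	case 0:		swap *path over to the new mnt->mnt_root
	 *	default:	propagate the error
	 *	}
	 * The -ELOOP test above rejects grafting a mount onto its own root,
	 * and on any failure the paired mntput() calls drop both of the
	 * references the ->d_automount() hook was required to supply.
	 */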
1930/* 1935/*
1931 * add a mount into a namespace's mount tree 1936 * add a mount into a namespace's mount tree
1932 * - provide the option of adding the new mount to an expiration list
1933 */ 1937 */
1934int do_add_mount(struct vfsmount *newmnt, struct path *path, 1938static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1935 int mnt_flags, struct list_head *fslist)
1936{ 1939{
1937 int err; 1940 int err;
1938 1941
@@ -1940,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1940 1943
1941 down_write(&namespace_sem); 1944 down_write(&namespace_sem);
1942 /* Something was mounted here while we slept */ 1945 /* Something was mounted here while we slept */
1943 while (d_mountpoint(path->dentry) && 1946 err = follow_down(path, true);
1944 follow_down(path)) 1947 if (err < 0)
1945 ; 1948 goto unlock;
1949
1946 err = -EINVAL; 1950 err = -EINVAL;
1947 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) 1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1948 goto unlock; 1952 goto unlock;
@@ -1958,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1958 goto unlock; 1962 goto unlock;
1959 1963
1960 newmnt->mnt_flags = mnt_flags; 1964 newmnt->mnt_flags = mnt_flags;
1961 if ((err = graft_tree(newmnt, path))) 1965 err = graft_tree(newmnt, path);
1962 goto unlock;
1963
1964 if (fslist) /* add to the specified expiration list */
1965 list_add_tail(&newmnt->mnt_expire, fslist);
1966
1967 up_write(&namespace_sem);
1968 return 0;
1969 1966
1970unlock: 1967unlock:
1971 up_write(&namespace_sem); 1968 up_write(&namespace_sem);
1972 mntput_long(newmnt);
1973 return err; 1969 return err;
1974} 1970}
1975 1971
1976EXPORT_SYMBOL_GPL(do_add_mount); 1972/**
1973 * mnt_set_expiry - Put a mount on an expiration list
1974 * @mnt: The mount to list.
1975 * @expiry_list: The list to add the mount to.
1976 */
1977void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
1978{
1979 down_write(&namespace_sem);
1980 br_write_lock(vfsmount_lock);
1981
1982 list_add_tail(&mnt->mnt_expire, expiry_list);
1983
1984 br_write_unlock(vfsmount_lock);
1985 up_write(&namespace_sem);
1986}
1987EXPORT_SYMBOL(mnt_set_expiry);
1977 1988
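This replaces the fslist argument that do_add_mount() used to take. The first user is nfs_d_automount(), later in this diff:

	mntget(mnt);	/* prevent immediate expiration */
	mnt_set_expiry(mnt, &nfs_automount_list);
	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);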
1978/* 1989/*
1979 * process a list of expirable mountpoints with the intent of discarding any 1990 * process a list of expirable mountpoints with the intent of discarding any
@@ -2262,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void)
2262 return new_ns; 2273 return new_ns;
2263} 2274}
2264 2275
2276void mnt_make_longterm(struct vfsmount *mnt)
2277{
2278 __mnt_make_longterm(mnt);
2279}
2280
2281void mnt_make_shortterm(struct vfsmount *mnt)
2282{
2283#ifdef CONFIG_SMP
2284 if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
2285 return;
2286 br_write_lock(vfsmount_lock);
2287 atomic_dec(&mnt->mnt_longterm);
2288 br_write_unlock(vfsmount_lock);
2289#endif
2290}
2291
2265/* 2292/*
2266 * Allocate a new namespace structure and populate it with contents 2293 * Allocate a new namespace structure and populate it with contents
2267 * copied from the namespace of the passed in task structure. 2294 * copied from the namespace of the passed in task structure.
@@ -2299,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2299 q = new_ns->root; 2326 q = new_ns->root;
2300 while (p) { 2327 while (p) {
2301 q->mnt_ns = new_ns; 2328 q->mnt_ns = new_ns;
2329 __mnt_make_longterm(q);
2302 if (fs) { 2330 if (fs) {
2303 if (p == fs->root.mnt) { 2331 if (p == fs->root.mnt) {
2332 fs->root.mnt = mntget(q);
2333 __mnt_make_longterm(q);
2334 mnt_make_shortterm(p);
2304 rootmnt = p; 2335 rootmnt = p;
2305 fs->root.mnt = mntget_long(q);
2306 } 2336 }
2307 if (p == fs->pwd.mnt) { 2337 if (p == fs->pwd.mnt) {
2338 fs->pwd.mnt = mntget(q);
2339 __mnt_make_longterm(q);
2340 mnt_make_shortterm(p);
2308 pwdmnt = p; 2341 pwdmnt = p;
2309 fs->pwd.mnt = mntget_long(q);
2310 } 2342 }
2311 } 2343 }
2312 p = next_mnt(p, mnt_ns->root); 2344 p = next_mnt(p, mnt_ns->root);
@@ -2315,9 +2347,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2315 up_write(&namespace_sem); 2347 up_write(&namespace_sem);
2316 2348
2317 if (rootmnt) 2349 if (rootmnt)
2318 mntput_long(rootmnt); 2350 mntput(rootmnt);
2319 if (pwdmnt) 2351 if (pwdmnt)
2320 mntput_long(pwdmnt); 2352 mntput(pwdmnt);
2321 2353
2322 return new_ns; 2354 return new_ns;
2323} 2355}
@@ -2350,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2350 new_ns = alloc_mnt_ns(); 2382 new_ns = alloc_mnt_ns();
2351 if (!IS_ERR(new_ns)) { 2383 if (!IS_ERR(new_ns)) {
2352 mnt->mnt_ns = new_ns; 2384 mnt->mnt_ns = new_ns;
2385 __mnt_make_longterm(mnt);
2353 new_ns->root = mnt; 2386 new_ns->root = mnt;
2354 list_add(&new_ns->list, &new_ns->root->mnt_list); 2387 list_add(&new_ns->list, &new_ns->root->mnt_list);
2355 } 2388 }
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 199016528fcb..e3d294269058 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -135,33 +135,6 @@ out_err:
135 135
136#if defined(CONFIG_NFS_V4_1) 136#if defined(CONFIG_NFS_V4_1)
137/* 137/*
138 * * CB_SEQUENCE operations will fail until the callback sessionid is set.
139 * */
140int nfs4_set_callback_sessionid(struct nfs_client *clp)
141{
142 struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv;
143 struct nfs4_sessionid *bc_sid;
144
145 if (!serv->sv_bc_xprt)
146 return -EINVAL;
147
148 /* on success freed in xprt_free */
149 bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL);
150 if (!bc_sid)
151 return -ENOMEM;
152 memcpy(bc_sid->data, &clp->cl_session->sess_id.data,
153 NFS4_MAX_SESSIONID_LEN);
154 spin_lock_bh(&serv->sv_cb_lock);
155 serv->sv_bc_xprt->xpt_bc_sid = bc_sid;
156 spin_unlock_bh(&serv->sv_cb_lock);
157 dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__,
158 ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1],
159 ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3],
160 serv->sv_bc_xprt);
161 return 0;
162}
163
164/*
165 * The callback service for NFSv4.1 callbacks 138 * The callback service for NFSv4.1 callbacks
166 */ 139 */
167static int 140static int
@@ -266,10 +239,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
266 struct nfs_callback_data *cb_info) 239 struct nfs_callback_data *cb_info)
267{ 240{
268} 241}
269int nfs4_set_callback_sessionid(struct nfs_client *clp)
270{
271 return 0;
272}
273#endif /* CONFIG_NFS_V4_1 */ 242#endif /* CONFIG_NFS_V4_1 */
274 243
275/* 244/*
@@ -359,78 +328,58 @@ void nfs_callback_down(int minorversion)
359 mutex_unlock(&nfs_callback_mutex); 328 mutex_unlock(&nfs_callback_mutex);
360} 329}
361 330
362static int check_gss_callback_principal(struct nfs_client *clp, 331/* Boolean check of RPC_AUTH_GSS principal */
363 struct svc_rqst *rqstp) 332int
333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
364{ 334{
365 struct rpc_clnt *r = clp->cl_rpcclient; 335 struct rpc_clnt *r = clp->cl_rpcclient;
366 char *p = svc_gss_principal(rqstp); 336 char *p = svc_gss_principal(rqstp);
367 337
338 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
339 return 1;
340
368 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ 341 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
369 if (clp->cl_minorversion != 0) 342 if (clp->cl_minorversion != 0)
370 return SVC_DROP; 343 return 0;
371 /* 344 /*
372 * It might just be a normal user principal, in which case 345 * It might just be a normal user principal, in which case
373 * userspace won't bother to tell us the name at all. 346 * userspace won't bother to tell us the name at all.
374 */ 347 */
375 if (p == NULL) 348 if (p == NULL)
376 return SVC_DENIED; 349 return 0;
377 350
378 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ 351 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
379 352
380 if (memcmp(p, "nfs@", 4) != 0) 353 if (memcmp(p, "nfs@", 4) != 0)
381 return SVC_DENIED; 354 return 0;
382 p += 4; 355 p += 4;
383 if (strcmp(p, r->cl_server) != 0) 356 if (strcmp(p, r->cl_server) != 0)
384 return SVC_DENIED; 357 return 0;
385 return SVC_OK; 358 return 1;
386} 359}
387 360
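The helper also changes its return convention from svc status codes to a plain boolean, which is what lets the v4.0 path in nfs4_callback_compound() (fs/nfs/callback_xdr.c hunk below) fold the principal check into a single test:

	/* Old: SVC_OK / SVC_DENIED / SVC_DROP, consumed by pg_authenticate.
	 * New: 1 for an acceptable principal (non-GSS flavours included),
	 *      0 otherwise, so the call site reduces to:
	 */
	if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
		return rpc_drop_reply;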
388/* pg_authenticate method helper */ 361/*
389static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp) 362 * pg_authenticate method for nfsv4 callback threads.
390{ 363 *
391 struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp); 364 * The authflavor has been negotiated, so an incorrect flavor is a server
392 int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0; 365 * bug. Drop packets with incorrect authflavor.
393 366 *
394 dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc); 367 * All other checking done after NFS decoding where the nfs_client can be
395 if (svc_is_backchannel(rqstp)) 368 * found in nfs4_callback_compound
396 /* Sessionid (usually) set after CB_NULL ping */ 369 */
397 return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid,
398 is_cb_compound);
399 else
400 /* No callback identifier in pg_authenticate */
401 return nfs4_find_client_no_ident(svc_addr(rqstp));
402}
403
404/* pg_authenticate method for nfsv4 callback threads. */
405static int nfs_callback_authenticate(struct svc_rqst *rqstp) 370static int nfs_callback_authenticate(struct svc_rqst *rqstp)
406{ 371{
407 struct nfs_client *clp;
408 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
409 int ret = SVC_OK;
410
411 /* Don't talk to strangers */
412 clp = nfs_cb_find_client(rqstp);
413 if (clp == NULL)
414 return SVC_DROP;
415
416 dprintk("%s: %s NFSv4 callback!\n", __func__,
417 svc_print_addr(rqstp, buf, sizeof(buf)));
418
419 switch (rqstp->rq_authop->flavour) { 372 switch (rqstp->rq_authop->flavour) {
420 case RPC_AUTH_NULL: 373 case RPC_AUTH_NULL:
421 if (rqstp->rq_proc != CB_NULL) 374 if (rqstp->rq_proc != CB_NULL)
422 ret = SVC_DENIED; 375 return SVC_DROP;
423 break; 376 break;
424 case RPC_AUTH_UNIX: 377 case RPC_AUTH_GSS:
425 break; 378 /* No RPC_AUTH_GSS support yet in NFSv4.1 */
426 case RPC_AUTH_GSS: 379 if (svc_is_backchannel(rqstp))
427 ret = check_gss_callback_principal(clp, rqstp); 380 return SVC_DROP;
428 break;
429 default:
430 ret = SVC_DENIED;
431 } 381 }
432 nfs_put_client(clp); 382 return SVC_OK;
433 return ret;
434} 383}
435 384
436/* 385/*
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index d3b44f9bd747..46d93ce7311b 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -7,6 +7,7 @@
7 */ 7 */
8#ifndef __LINUX_FS_NFS_CALLBACK_H 8#ifndef __LINUX_FS_NFS_CALLBACK_H
9#define __LINUX_FS_NFS_CALLBACK_H 9#define __LINUX_FS_NFS_CALLBACK_H
10#include <linux/sunrpc/svc.h>
10 11
11#define NFS4_CALLBACK 0x40000000 12#define NFS4_CALLBACK 0x40000000
12#define NFS4_CALLBACK_XDRSIZE 2048 13#define NFS4_CALLBACK_XDRSIZE 2048
@@ -37,7 +38,6 @@ enum nfs4_callback_opnum {
37struct cb_process_state { 38struct cb_process_state {
38 __be32 drc_status; 39 __be32 drc_status;
39 struct nfs_client *clp; 40 struct nfs_client *clp;
40 struct nfs4_sessionid *svc_sid; /* v4.1 callback service sessionid */
41}; 41};
42 42
43struct cb_compound_hdr_arg { 43struct cb_compound_hdr_arg {
@@ -168,7 +168,7 @@ extern unsigned nfs4_callback_layoutrecall(
168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); 168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
169extern void nfs4_cb_take_slot(struct nfs_client *clp); 169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170#endif /* CONFIG_NFS_V4_1 */ 170#endif /* CONFIG_NFS_V4_1 */
171 171extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, 172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
173 struct cb_getattrres *res, 173 struct cb_getattrres *res,
174 struct cb_process_state *cps); 174 struct cb_process_state *cps);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 4bb91cb2620d..89587573fe50 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -373,17 +373,11 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
373{ 373{
374 struct nfs_client *clp; 374 struct nfs_client *clp;
375 int i; 375 int i;
376 __be32 status; 376 __be32 status = htonl(NFS4ERR_BADSESSION);
377 377
378 cps->clp = NULL; 378 cps->clp = NULL;
379 379
380 status = htonl(NFS4ERR_BADSESSION); 380 clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
381 /* Incoming session must match the callback session */
382 if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN))
383 goto out;
384
385 clp = nfs4_find_client_sessionid(args->csa_addr,
386 &args->csa_sessionid, 1);
387 if (clp == NULL) 381 if (clp == NULL)
388 goto out; 382 goto out;
389 383
@@ -414,9 +408,9 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
414 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 408 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
415 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 409 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
416 nfs4_cb_take_slot(clp); 410 nfs4_cb_take_slot(clp);
417 cps->clp = clp; /* put in nfs4_callback_compound */
418 411
419out: 412out:
413 cps->clp = clp; /* put in nfs4_callback_compound */
420 for (i = 0; i < args->csa_nrclists; i++) 414 for (i = 0; i < args->csa_nrclists; i++)
421 kfree(args->csa_rclists[i].rcl_refcalls); 415 kfree(args->csa_rclists[i].rcl_refcalls);
422 kfree(args->csa_rclists); 416 kfree(args->csa_rclists);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 23112c263f81..14e0f9371d14 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -794,10 +794,9 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
794 794
795 if (hdr_arg.minorversion == 0) { 795 if (hdr_arg.minorversion == 0) {
796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); 796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
797 if (!cps.clp) 797 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
798 return rpc_drop_reply; 798 return rpc_drop_reply;
799 } else 799 }
800 cps.svc_sid = bc_xprt_sid(rqstp);
801 800
802 hdr_res.taglen = hdr_arg.taglen; 801 hdr_res.taglen = hdr_arg.taglen;
803 hdr_res.tag = hdr_arg.tag; 802 hdr_res.tag = hdr_arg.tag;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 192f2f860265..bd3ca32879e7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1206,16 +1206,11 @@ nfs4_find_client_ident(int cb_ident)
1206 * For CB_COMPOUND calls, find a client by IP address, protocol version, 1206 * For CB_COMPOUND calls, find a client by IP address, protocol version,
1207 * minorversion, and sessionID 1207 * minorversion, and sessionID
1208 * 1208 *
1209 * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service
1210 * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL
1211 * can arrive before the callback sessionid is set. For CB_NULL calls,
1212 * find a client by IP address protocol version, and minorversion.
1213 *
1214 * Returns NULL if no such client 1209 * Returns NULL if no such client
1215 */ 1210 */
1216struct nfs_client * 1211struct nfs_client *
1217nfs4_find_client_sessionid(const struct sockaddr *addr, 1212nfs4_find_client_sessionid(const struct sockaddr *addr,
1218 struct nfs4_sessionid *sid, int is_cb_compound) 1213 struct nfs4_sessionid *sid)
1219{ 1214{
1220 struct nfs_client *clp; 1215 struct nfs_client *clp;
1221 1216
@@ -1227,9 +1222,9 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
1227 if (!nfs4_has_session(clp)) 1222 if (!nfs4_has_session(clp))
1228 continue; 1223 continue;
1229 1224
1230 /* Match sessionid unless cb_null call*/ 1225 /* Match sessionid*/
1231 if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data, 1226 if (memcmp(clp->cl_session->sess_id.data,
1232 sid->data, NFS4_MAX_SESSIONID_LEN) != 0)) 1227 sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
1233 continue; 1228 continue;
1234 1229
1235 atomic_inc(&clp->cl_count); 1230 atomic_inc(&clp->cl_count);
@@ -1244,7 +1239,7 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
1244 1239
1245struct nfs_client * 1240struct nfs_client *
1246nfs4_find_client_sessionid(const struct sockaddr *addr, 1241nfs4_find_client_sessionid(const struct sockaddr *addr,
1247 struct nfs4_sessionid *sid, int is_cb_compound) 1242 struct nfs4_sessionid *sid)
1248{ 1243{
1249 return NULL; 1244 return NULL;
1250} 1245}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 364e4328f392..bbbc6bf5cb2e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -23,8 +23,6 @@
23 23
24static void nfs_do_free_delegation(struct nfs_delegation *delegation) 24static void nfs_do_free_delegation(struct nfs_delegation *delegation)
25{ 25{
26 if (delegation->cred)
27 put_rpccred(delegation->cred);
28 kfree(delegation); 26 kfree(delegation);
29} 27}
30 28
@@ -37,6 +35,10 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
37 35
38static void nfs_free_delegation(struct nfs_delegation *delegation) 36static void nfs_free_delegation(struct nfs_delegation *delegation)
39{ 37{
38 if (delegation->cred) {
39 put_rpccred(delegation->cred);
40 delegation->cred = NULL;
41 }
40 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 42 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
41} 43}
42 44
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 95b081bc9e25..2c3eb33b904d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -970,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
970{ 970{
971 struct nfs_server *server = NFS_SERVER(inode); 971 struct nfs_server *server = NFS_SERVER(inode);
972 972
973 if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) 973 if (IS_AUTOMOUNT(inode))
974 return 0; 974 return 0;
975 if (nd != NULL) { 975 if (nd != NULL) {
976 /* VFS wants an on-the-wire revalidation */ 976 /* VFS wants an on-the-wire revalidation */
@@ -1173,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = {
1173 .d_revalidate = nfs_lookup_revalidate, 1173 .d_revalidate = nfs_lookup_revalidate,
1174 .d_delete = nfs_dentry_delete, 1174 .d_delete = nfs_dentry_delete,
1175 .d_iput = nfs_dentry_iput, 1175 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount,
1176}; 1177};
1177 1178
1178static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1246,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1246 .d_revalidate = nfs_open_revalidate, 1247 .d_revalidate = nfs_open_revalidate,
1247 .d_delete = nfs_dentry_delete, 1248 .d_delete = nfs_dentry_delete,
1248 .d_iput = nfs_dentry_iput, 1249 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount,
1249}; 1251};
1250 1252
1251/* 1253/*
@@ -1406,11 +1408,15 @@ no_open:
1406static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) 1408static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1407{ 1409{
1408 struct dentry *parent = NULL; 1410 struct dentry *parent = NULL;
1409 struct inode *inode = dentry->d_inode; 1411 struct inode *inode;
1410 struct inode *dir; 1412 struct inode *dir;
1411 struct nfs_open_context *ctx; 1413 struct nfs_open_context *ctx;
1412 int openflags, ret = 0; 1414 int openflags, ret = 0;
1413 1415
1416 if (nd->flags & LOOKUP_RCU)
1417 return -ECHILD;
1418
1419 inode = dentry->d_inode;
1414 if (!is_atomic_open(nd) || d_mountpoint(dentry)) 1420 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1415 goto no_open; 1421 goto no_open;
1416 1422
@@ -1579,6 +1585,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1579{ 1585{
1580 struct iattr attr; 1586 struct iattr attr;
1581 int error; 1587 int error;
1588 int open_flags = 0;
1582 1589
1583 dfprintk(VFS, "NFS: create(%s/%ld), %s\n", 1590 dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
1584 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1591 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1586,7 +1593,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1586 attr.ia_mode = mode; 1593 attr.ia_mode = mode;
1587 attr.ia_valid = ATTR_MODE; 1594 attr.ia_valid = ATTR_MODE;
1588 1595
1589 error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); 1596 if ((nd->flags & LOOKUP_CREATE) != 0)
1597 open_flags = nd->intent.open.flags;
1598
1599 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
1590 if (error != 0) 1600 if (error != 0)
1591 goto out_err; 1601 goto out_err;
1592 return 0; 1602 return 0;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e6ace0d93c71..9943a75bb6d1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -407,15 +407,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
407 pos += vec->iov_len; 407 pos += vec->iov_len;
408 } 408 }
409 409
410 /*
411 * If no bytes were started, return the error, and let the
412 * generic layer handle the completion.
413 */
414 if (requested_bytes == 0) {
415 nfs_direct_req_release(dreq);
416 return result < 0 ? result : -EIO;
417 }
418
410 if (put_dreq(dreq)) 419 if (put_dreq(dreq))
411 nfs_direct_complete(dreq); 420 nfs_direct_complete(dreq);
412 421 return 0;
413 if (requested_bytes != 0)
414 return 0;
415
416 if (result < 0)
417 return result;
418 return -EIO;
419} 422}
420 423
421static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, 424static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
841 pos += vec->iov_len; 844 pos += vec->iov_len;
842 } 845 }
843 846
847 /*
848 * If no bytes were started, return the error, and let the
849 * generic layer handle the completion.
850 */
851 if (requested_bytes == 0) {
852 nfs_direct_req_release(dreq);
853 return result < 0 ? result : -EIO;
854 }
855
844 if (put_dreq(dreq)) 856 if (put_dreq(dreq))
845 nfs_direct_write_complete(dreq, dreq->inode); 857 nfs_direct_write_complete(dreq, dreq->inode);
846 858 return 0;
847 if (requested_bytes != 0)
848 return 0;
849
850 if (result < 0)
851 return result;
852 return -EIO;
853} 859}
854 860
855static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, 861static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ce00b704452c..1cc600e77bb4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -300,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
300 else 300 else
301 inode->i_op = &nfs_mountpoint_inode_operations; 301 inode->i_op = &nfs_mountpoint_inode_operations;
302 inode->i_fop = NULL; 302 inode->i_fop = NULL;
303 set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); 303 inode->i_flags |= S_AUTOMOUNT;
304 } 304 }
305 } else if (S_ISLNK(inode->i_mode)) 305 } else if (S_ISLNK(inode->i_mode))
306 inode->i_op = &nfs_symlink_inode_operations; 306 inode->i_op = &nfs_symlink_inode_operations;
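NFS drops its private NFS_INO_MOUNTPOINT bit for the generic flag. IS_AUTOMOUNT() is not part of this diff; presumably it is the stock i_flags test, making the inode mark the whole opt-in on this side:

	/* Assumed generic definition (include/linux/fs.h, not in this diff):
	 *	#define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
	 */
	inode->i_flags |= S_AUTOMOUNT;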
@@ -881,9 +881,10 @@ out:
881 return ret; 881 return ret;
882} 882}
883 883
884static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) 884static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
885{ 885{
886 struct nfs_inode *nfsi = NFS_I(inode); 886 struct nfs_inode *nfsi = NFS_I(inode);
887 unsigned long ret = 0;
887 888
888 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) 889 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
889 && (fattr->valid & NFS_ATTR_FATTR_CHANGE) 890 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -891,25 +892,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
891 nfsi->change_attr = fattr->change_attr; 892 nfsi->change_attr = fattr->change_attr;
892 if (S_ISDIR(inode->i_mode)) 893 if (S_ISDIR(inode->i_mode))
893 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 894 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
895 ret |= NFS_INO_INVALID_ATTR;
894 } 896 }
895 /* If we have atomic WCC data, we may update some attributes */ 897 /* If we have atomic WCC data, we may update some attributes */
896 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) 898 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
897 && (fattr->valid & NFS_ATTR_FATTR_CTIME) 899 && (fattr->valid & NFS_ATTR_FATTR_CTIME)
898 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 900 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
899 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 901 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
902 ret |= NFS_INO_INVALID_ATTR;
903 }
900 904
901 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) 905 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
902 && (fattr->valid & NFS_ATTR_FATTR_MTIME) 906 && (fattr->valid & NFS_ATTR_FATTR_MTIME)
903 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 907 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
904 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 908 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
905 if (S_ISDIR(inode->i_mode)) 909 if (S_ISDIR(inode->i_mode))
906 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 910 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
911 ret |= NFS_INO_INVALID_ATTR;
907 } 912 }
908 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) 913 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
909 && (fattr->valid & NFS_ATTR_FATTR_SIZE) 914 && (fattr->valid & NFS_ATTR_FATTR_SIZE)
910 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) 915 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
911 && nfsi->npages == 0) 916 && nfsi->npages == 0) {
912 i_size_write(inode, nfs_size_to_loff_t(fattr->size)); 917 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
918 ret |= NFS_INO_INVALID_ATTR;
919 }
920 return ret;
913} 921}
914 922
915/** 923/**
@@ -1208,7 +1216,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1208 /* Update the fsid? */ 1216 /* Update the fsid? */
1209 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && 1217 if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
1210 !nfs_fsid_equal(&server->fsid, &fattr->fsid) && 1218 !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
1211 !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) 1219 !IS_AUTOMOUNT(inode))
1212 server->fsid = fattr->fsid; 1220 server->fsid = fattr->fsid;
1213 1221
1214 /* 1222 /*
@@ -1223,7 +1231,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1223 | NFS_INO_REVAL_PAGECACHE); 1231 | NFS_INO_REVAL_PAGECACHE);
1224 1232
1225 /* Do atomic weak cache consistency updates */ 1233 /* Do atomic weak cache consistency updates */
1226 nfs_wcc_update_inode(inode, fattr); 1234 invalid |= nfs_wcc_update_inode(inode, fattr);
1227 1235
1228 /* More cache consistency checks */ 1236 /* More cache consistency checks */
1229 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { 1237 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bfa3a34af801..cf9fdbdabc67 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -133,8 +133,7 @@ extern void nfs_put_client(struct nfs_client *);
133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); 133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
134extern struct nfs_client *nfs4_find_client_ident(int); 134extern struct nfs_client *nfs4_find_client_ident(int);
135extern struct nfs_client * 135extern struct nfs_client *
136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *, 136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
137 int);
138extern struct nfs_server *nfs_create_server( 137extern struct nfs_server *nfs_create_server(
139 const struct nfs_parsed_mount_data *, 138 const struct nfs_parsed_mount_data *,
140 struct nfs_fh *); 139 struct nfs_fh *);
@@ -252,6 +251,7 @@ extern char *nfs_path(const char *base,
252 const struct dentry *droot, 251 const struct dentry *droot,
253 const struct dentry *dentry, 252 const struct dentry *dentry,
254 char *buffer, ssize_t buflen); 253 char *buffer, ssize_t buflen);
254extern struct vfsmount *nfs_d_automount(struct path *path);
255 255
256/* getroot.c */ 256/* getroot.c */
257extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 257extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 74aaf3963c10..f32b8603dca8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -97,9 +97,8 @@ Elong:
97} 97}
98 98
99/* 99/*
100 * nfs_follow_mountpoint - handle crossing a mountpoint on the server 100 * nfs_d_automount - Handle crossing a mountpoint on the server
101 * @dentry - dentry of mountpoint 101 * @path - The mountpoint
102 * @nd - nameidata info
103 * 102 *
104 * When we encounter a mountpoint on the server, we want to set up 103 * When we encounter a mountpoint on the server, we want to set up
105 * a mountpoint on the client too, to prevent inode numbers from 104 * a mountpoint on the client too, to prevent inode numbers from
@@ -109,87 +108,65 @@ Elong:
109 * situation, and that different filesystems may want to use 108 * situation, and that different filesystems may want to use
110 * different security flavours. 109 * different security flavours.
111 */ 110 */
112static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) 111struct vfsmount *nfs_d_automount(struct path *path)
113{ 112{
114 struct vfsmount *mnt; 113 struct vfsmount *mnt;
115 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 114 struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
116 struct dentry *parent; 115 struct dentry *parent;
117 struct nfs_fh *fh = NULL; 116 struct nfs_fh *fh = NULL;
118 struct nfs_fattr *fattr = NULL; 117 struct nfs_fattr *fattr = NULL;
119 int err; 118 int err;
120 119
121 dprintk("--> nfs_follow_mountpoint()\n"); 120 dprintk("--> nfs_d_automount()\n");
122 121
123 err = -ESTALE; 122 mnt = ERR_PTR(-ESTALE);
124 if (IS_ROOT(dentry)) 123 if (IS_ROOT(path->dentry))
125 goto out_err; 124 goto out_nofree;
126 125
127 err = -ENOMEM; 126 mnt = ERR_PTR(-ENOMEM);
128 fh = nfs_alloc_fhandle(); 127 fh = nfs_alloc_fhandle();
129 fattr = nfs_alloc_fattr(); 128 fattr = nfs_alloc_fattr();
130 if (fh == NULL || fattr == NULL) 129 if (fh == NULL || fattr == NULL)
131 goto out_err; 130 goto out;
132 131
133 dprintk("%s: enter\n", __func__); 132 dprintk("%s: enter\n", __func__);
134 dput(nd->path.dentry);
135 nd->path.dentry = dget(dentry);
136 133
137 /* Look it up again */ 134 /* Look it up again to get its attributes */
138 parent = dget_parent(nd->path.dentry); 135 parent = dget_parent(path->dentry);
139 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 136 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
140 &nd->path.dentry->d_name, 137 &path->dentry->d_name,
141 fh, fattr); 138 fh, fattr);
142 dput(parent); 139 dput(parent);
143 if (err != 0) 140 if (err != 0) {
144 goto out_err; 141 mnt = ERR_PTR(err);
142 goto out;
143 }
145 144
146 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 145 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
147 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); 146 mnt = nfs_do_refmount(path->mnt, path->dentry);
148 else 147 else
149 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, 148 mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr);
150 fattr);
151 err = PTR_ERR(mnt);
152 if (IS_ERR(mnt)) 149 if (IS_ERR(mnt))
153 goto out_err; 150 goto out;
154 151
155 mntget(mnt); 152 dprintk("%s: done, success\n", __func__);
156 err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, 153 mntget(mnt); /* prevent immediate expiration */
157 &nfs_automount_list); 154 mnt_set_expiry(mnt, &nfs_automount_list);
158 if (err < 0) {
159 mntput(mnt);
160 if (err == -EBUSY)
161 goto out_follow;
162 goto out_err;
163 }
164 path_put(&nd->path);
165 nd->path.mnt = mnt;
166 nd->path.dentry = dget(mnt->mnt_root);
167 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 155 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
156
168out: 157out:
169 nfs_free_fattr(fattr); 158 nfs_free_fattr(fattr);
170 nfs_free_fhandle(fh); 159 nfs_free_fhandle(fh);
171 dprintk("%s: done, returned %d\n", __func__, err); 160out_nofree:
172 161 dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt);
173 dprintk("<-- nfs_follow_mountpoint() = %d\n", err); 162 return mnt;
174 return ERR_PTR(err);
175out_err:
176 path_put(&nd->path);
177 goto out;
178out_follow:
179 while (d_mountpoint(nd->path.dentry) &&
180 follow_down(&nd->path))
181 ;
182 err = 0;
183 goto out;
184} 163}
185 164
186const struct inode_operations nfs_mountpoint_inode_operations = { 165const struct inode_operations nfs_mountpoint_inode_operations = {
187 .follow_link = nfs_follow_mountpoint,
188 .getattr = nfs_getattr, 166 .getattr = nfs_getattr,
189}; 167};
190 168
191const struct inode_operations nfs_referral_inode_operations = { 169const struct inode_operations nfs_referral_inode_operations = {
192 .follow_link = nfs_follow_mountpoint,
193}; 170};
194 171
195static void nfs_expire_automounts(struct work_struct *work) 172static void nfs_expire_automounts(struct work_struct *work)
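
Note on the hunk above: it converts NFS mountpoint crossing from the old ->follow_link() trick to the VFS ->d_automount() hook, so the filesystem just returns a vfsmount and the VFS performs the attach; that is why all the nameidata juggling and the do_add_mount() error paths disappear. A minimal sketch of the shape such a handler takes, built only from APIs visible in the hunk; the example_* names are illustrative stand-ins, not anything from this patch:

    /* Minimal ->d_automount() sketch (kernel-style; example_* are
     * hypothetical). The VFS calls this when a walk reaches a dentry
     * flagged as an automount point. */
    static struct vfsmount *example_d_automount(struct path *path)
    {
            struct vfsmount *mnt;

            mnt = example_build_submount(path);  /* hypothetical helper */
            if (IS_ERR(mnt))
                    return mnt;                  /* VFS handles the error */

            mntget(mnt);                         /* pin against instant expiry */
            mnt_set_expiry(mnt, &example_expiry_list);
            return mnt;                          /* VFS does the attach */
    }

    static const struct dentry_operations example_dops = {
            .d_automount = example_d_automount,
    };
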
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9f88c5f4c7e2..274342771655 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
311 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 311 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
312 goto out; 312 goto out;
313 313
314 /* We are doing this here, because XDR marshalling can only 314 /* We are doing this here because XDR marshalling does not
315 return -ENOMEM. */ 315 * return any results; it BUGs. */
316 status = -ENOSPC; 316 status = -ENOSPC;
317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) 317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
318 goto out; 318 goto out;
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 01c5e8b1941d..183c6b123d0f 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1328,10 +1328,13 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
1328 1328
1329 encode_nfs_fh3(xdr, NFS_FH(args->inode)); 1329 encode_nfs_fh3(xdr, NFS_FH(args->inode));
1330 encode_uint32(xdr, args->mask); 1330 encode_uint32(xdr, args->mask);
1331
1332 base = req->rq_slen;
1331 if (args->npages != 0) 1333 if (args->npages != 0)
1332 xdr_write_pages(xdr, args->pages, 0, args->len); 1334 xdr_write_pages(xdr, args->pages, 0, args->len);
1335 else
1336 xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
1333 1337
1334 base = req->rq_slen;
1335 error = nfsacl_encode(xdr->buf, base, args->inode, 1338 error = nfsacl_encode(xdr->buf, base, args->inode,
1336 (args->mask & NFS_ACL) ? 1339 (args->mask & NFS_ACL) ?
1337 args->acl_access : NULL, 1, 0); 1340 args->acl_access : NULL, 1, 0);
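
The reordering in this hunk matters because req->rq_slen grows when xdr_write_pages() appends the page data: the byte offset where the ACL will be encoded has to be snapshotted first. A toy standalone illustration of the same capture-before-append rule (the buffer model here is made up, not the sunrpc one):

    #include <stdio.h>
    #include <string.h>

    /* Toy model: a buffer whose recorded length grows on append. */
    static char buf[64];
    static size_t len;

    static void append(const char *s) { strcat(buf, s); len += strlen(s); }

    int main(void)
    {
            size_t off = len;        /* capture the offset BEFORE appending */
            append("pagedata");      /* len grows, like rq_slen does        */
            printf("encode trailer at %zu, not at %zu\n", off, len);
            return 0;
    }
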
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 51fe64ace55a..f5c9b125e8cc 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -214,7 +214,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
214 214
215 /* ipv6 length plus port is legal */ 215 /* ipv6 length plus port is legal */
216 if (rlen > INET6_ADDRSTRLEN + 8) { 216 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s Invalid address, length %d\n", __func__, 217 dprintk("%s: Invalid address, length %d\n", __func__,
218 rlen); 218 rlen);
219 goto out_err; 219 goto out_err;
220 } 220 }
@@ -225,6 +225,11 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
225 /* replace the port dots with dashes for the in4_pton() delimiter*/ 225 /* replace the port dots with dashes for the in4_pton() delimiter*/
226 for (i = 0; i < 2; i++) { 226 for (i = 0; i < 2; i++) {
227 char *res = strrchr(buf, '.'); 227 char *res = strrchr(buf, '.');
228 if (!res) {
229 dprintk("%s: Failed finding expected dots in port\n",
230 __func__);
231 goto out_free;
232 }
228 *res = '-'; 233 *res = '-';
229 } 234 }
230 235
@@ -240,7 +245,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
240 port = htons((tmp[0] << 8) | (tmp[1])); 245 port = htons((tmp[0] << 8) | (tmp[1]));
241 246
242 ds = nfs4_pnfs_ds_add(inode, ip_addr, port); 247 ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
243 dprintk("%s Decoded address and port %s\n", __func__, buf); 248 dprintk("%s: Decoded address and port %s\n", __func__, buf);
244out_free: 249out_free:
245 kfree(buf); 250 kfree(buf);
246out_err: 251out_err:
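
The two dprintk hunks above are cosmetic, but the strrchr() check closes a real hole: a malformed universal address with fewer than two trailing dot-separated port fields would make strrchr() return NULL and the loop dereference it. A standalone sketch of parsing the "a.b.c.d.p1.p2" form defensively; this is a userspace stand-in, not the kernel helpers the patch uses:

    #include <stdio.h>
    #include <string.h>

    /* Parse the port out of an NFSv4 universal address "a.b.c.d.p1.p2";
     * return -1 on malformed input instead of crashing on a missing dot. */
    static int uaddr_port(const char *uaddr)
    {
            unsigned int p1, p2;
            const char *dot2 = strrchr(uaddr, '.');
            const char *dot1;

            if (!dot2)
                    return -1;                   /* no port fields at all   */
            for (dot1 = dot2 - 1; dot1 > uaddr && *dot1 != '.'; dot1--)
                    ;
            if (*dot1 != '.')
                    return -1;                   /* only one trailing field */
            if (sscanf(dot1 + 1, "%u.%u", &p1, &p2) != 2 || p1 > 255 || p2 > 255)
                    return -1;
            return (int)((p1 << 8) | p2);        /* port in host order */
    }

    int main(void)
    {
            printf("%d\n", uaddr_port("10.0.0.1.8.1"));  /* 2049 */
            printf("%d\n", uaddr_port("bogus"));         /* -1   */
            return 0;
    }
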
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9d992b0346e3..78936a8f40ab 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -50,6 +50,7 @@
50#include <linux/module.h> 50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h> 51#include <linux/sunrpc/bc_xprt.h>
52#include <linux/xattr.h> 52#include <linux/xattr.h>
53#include <linux/utsname.h>
53 54
54#include "nfs4_fs.h" 55#include "nfs4_fs.h"
55#include "delegation.h" 56#include "delegation.h"
@@ -4572,27 +4573,16 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4572 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 4573 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
4573 args.verifier = &verifier; 4574 args.verifier = &verifier;
4574 4575
4575 while (1) { 4576 args.id_len = scnprintf(args.id, sizeof(args.id),
4576 args.id_len = scnprintf(args.id, sizeof(args.id), 4577 "%s/%s.%s/%u",
4577 "%s/%s %u", 4578 clp->cl_ipaddr,
4578 clp->cl_ipaddr, 4579 init_utsname()->nodename,
4579 rpc_peeraddr2str(clp->cl_rpcclient, 4580 init_utsname()->domainname,
4580 RPC_DISPLAY_ADDR), 4581 clp->cl_rpcclient->cl_auth->au_flavor);
4581 clp->cl_id_uniquifier);
4582
4583 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4584
4585 if (status != -NFS4ERR_CLID_INUSE)
4586 break;
4587
4588 if (signalled())
4589 break;
4590
4591 if (++clp->cl_id_uniquifier == 0)
4592 break;
4593 }
4594 4582
4595 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); 4583 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4584 if (!status)
4585 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4596 dprintk("<-- %s status= %d\n", __func__, status); 4586 dprintk("<-- %s status= %d\n", __func__, status);
4597 return status; 4587 return status;
4598} 4588}
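
With this hunk the retry-on-CLID_INUSE loop goes away because the EXCHANGE_ID client owner string is now derived from stable local identity (IP address, nodename.domainname, auth flavor) rather than a per-attempt uniquifier. A small sketch of the construction; the literal values below merely stand in for the clp fields, and the flavor number is an example:

    #include <stdio.h>

    int main(void)
    {
            char id[128];
            int len = snprintf(id, sizeof(id), "%s/%s.%s/%u",
                               "192.0.2.10",          /* cl_ipaddr       */
                               "client1", "example",  /* utsname fields  */
                               390003u);              /* example flavor  */
            printf("%s (%d bytes)\n", id, len);
            return 0;
    }
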
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2336d532cf66..e6742b57a04c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -232,12 +232,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
232 status = nfs4_proc_create_session(clp); 232 status = nfs4_proc_create_session(clp);
233 if (status != 0) 233 if (status != 0)
234 goto out; 234 goto out;
235 status = nfs4_set_callback_sessionid(clp);
236 if (status != 0) {
237 printk(KERN_WARNING "Sessionid not set. No callback service\n");
238 nfs_callback_down(1);
239 status = 0;
240 }
241 nfs41_setup_state_renewal(clp); 235 nfs41_setup_state_renewal(clp);
242 nfs_mark_client_ready(clp, NFS_CS_READY); 236 nfs_mark_client_ready(clp, NFS_CS_READY);
243out: 237out:
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2ab8e5cb8f59..4e2c168b6ee9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6086,11 +6086,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6086 __be32 *p = xdr_inline_decode(xdr, 4); 6086 __be32 *p = xdr_inline_decode(xdr, 4);
6087 if (unlikely(!p)) 6087 if (unlikely(!p))
6088 goto out_overflow; 6088 goto out_overflow;
6089 if (!ntohl(*p++)) { 6089 if (*p == xdr_zero) {
6090 p = xdr_inline_decode(xdr, 4); 6090 p = xdr_inline_decode(xdr, 4);
6091 if (unlikely(!p)) 6091 if (unlikely(!p))
6092 goto out_overflow; 6092 goto out_overflow;
6093 if (!ntohl(*p++)) 6093 if (*p == xdr_zero)
6094 return -EAGAIN; 6094 return -EAGAIN;
6095 entry->eof = 1; 6095 entry->eof = 1;
6096 return -EBADCOOKIE; 6096 return -EBADCOOKIE;
@@ -6101,7 +6101,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6101 goto out_overflow; 6101 goto out_overflow;
6102 entry->prev_cookie = entry->cookie; 6102 entry->prev_cookie = entry->cookie;
6103 p = xdr_decode_hyper(p, &entry->cookie); 6103 p = xdr_decode_hyper(p, &entry->cookie);
6104 entry->len = ntohl(*p++); 6104 entry->len = be32_to_cpup(p);
6105 6105
6106 p = xdr_inline_decode(xdr, entry->len); 6106 p = xdr_inline_decode(xdr, entry->len);
6107 if (unlikely(!p)) 6107 if (unlikely(!p))
@@ -6132,9 +6132,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6132 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) 6132 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
6133 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); 6133 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
6134 6134
6135 if (verify_attr_len(xdr, p, len) < 0)
6136 goto out_overflow;
6137
6138 return 0; 6135 return 0;
6139 6136
6140out_overflow: 6137out_overflow:
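
The xdr_zero / be32_to_cpup() hunks above are a byte-order hygiene pass: wire words are __be32, so an equality test can compare against a big-endian constant without swapping, and the conversion helper makes the single real swap explicit while dropping the old `ntohl(*p++)` side effect. A userspace sketch of the same idiom, with htonl/ntohl standing in for the kernel helpers:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    /* xdr_zero in the kernel is cpu_to_be32(0); equality against a
     * constant needs no byte swap (and 0 is 0 in either byte order). */
    static const uint32_t xdr_zero = 0;

    int main(void)
    {
            uint32_t wire = htonl(12);           /* a word as it arrives */

            if (wire == xdr_zero)                /* test without converting */
                    puts("value_follows == FALSE");
            printf("len = %u\n", ntohl(wire));   /* convert once, no *p++ */
            return 0;
    }
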
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bc4089769735..1b1bc1a0fb0a 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -951,7 +951,7 @@ pnfs_put_deviceid_cache(struct nfs_client *clp)
951{ 951{
952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache; 952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
953 953
954 dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); 954 dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { 955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
956 int i; 956 int i;
957 /* Verify cache is empty */ 957 /* Verify cache is empty */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 10d648ea128b..c8278f4046cb 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -932,7 +932,7 @@ out_bad:
932 while (!list_empty(&list)) { 932 while (!list_empty(&list)) {
933 data = list_entry(list.next, struct nfs_write_data, pages); 933 data = list_entry(list.next, struct nfs_write_data, pages);
934 list_del(&data->pages); 934 list_del(&data->pages);
935 nfs_writedata_release(data); 935 nfs_writedata_free(data);
936 } 936 }
937 nfs_redirty_request(req); 937 nfs_redirty_request(req);
938 return -ENOMEM; 938 return -ENOMEM;
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index fc1c52571c03..84c27d69d421 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,6 +42,11 @@ struct nfsacl_encode_desc {
42 gid_t gid; 42 gid_t gid;
43}; 43};
44 44
45struct nfsacl_simple_acl {
46 struct posix_acl acl;
47 struct posix_acl_entry ace[4];
48};
49
45static int 50static int
46xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) 51xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
47{ 52{
@@ -72,9 +77,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
72 return 0; 77 return 0;
73} 78}
74 79
75unsigned int 80/**
76nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, 81 * nfsacl_encode - Encode an NFSv3 ACL
77 struct posix_acl *acl, int encode_entries, int typeflag) 82 *
83 * @buf: destination xdr_buf to contain XDR encoded ACL
84 * @base: byte offset in xdr_buf where XDR'd ACL begins
85 * @inode: inode of file whose ACL this is
86 * @acl: posix_acl to encode
87 * @encode_entries: whether to encode ACEs as well
88 * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
89 *
90 * Returns size of encoded ACL in bytes or a negative errno value.
91 */
92int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
93 struct posix_acl *acl, int encode_entries, int typeflag)
78{ 94{
79 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; 95 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
80 struct nfsacl_encode_desc nfsacl_desc = { 96 struct nfsacl_encode_desc nfsacl_desc = {
@@ -88,17 +104,22 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
88 .uid = inode->i_uid, 104 .uid = inode->i_uid,
89 .gid = inode->i_gid, 105 .gid = inode->i_gid,
90 }; 106 };
107 struct nfsacl_simple_acl aclbuf;
91 int err; 108 int err;
92 struct posix_acl *acl2 = NULL;
93 109
94 if (entries > NFS_ACL_MAX_ENTRIES || 110 if (entries > NFS_ACL_MAX_ENTRIES ||
95 xdr_encode_word(buf, base, entries)) 111 xdr_encode_word(buf, base, entries))
96 return -EINVAL; 112 return -EINVAL;
97 if (encode_entries && acl && acl->a_count == 3) { 113 if (encode_entries && acl && acl->a_count == 3) {
98 /* Fake up an ACL_MASK entry. */ 114 struct posix_acl *acl2 = &aclbuf.acl;
99 acl2 = posix_acl_alloc(4, GFP_KERNEL); 115
100 if (!acl2) 116 /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
101 return -ENOMEM; 117 * invoked in contexts where a memory allocation failure is
118 * fatal. Fortunately this fake ACL is small enough to
119 * construct on the stack. */
120 memset(&aclbuf, 0, sizeof(aclbuf));
121 posix_acl_init(acl2, 4);
122
102 /* Insert entries in canonical order: other orders seem 123 /* Insert entries in canonical order: other orders seem
103 to confuse Solaris VxFS. */ 124 to confuse Solaris VxFS. */
104 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ 125 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */
@@ -109,8 +130,6 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
109 nfsacl_desc.acl = acl2; 130 nfsacl_desc.acl = acl2;
110 } 131 }
111 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); 132 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
112 if (acl2)
113 posix_acl_release(acl2);
114 if (!err) 133 if (!err)
115 err = 8 + nfsacl_desc.desc.elem_size * 134 err = 8 + nfsacl_desc.desc.elem_size *
116 nfsacl_desc.desc.array_len; 135 nfsacl_desc.desc.array_len;
@@ -224,9 +243,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
224 return 0; 243 return 0;
225} 244}
226 245
227unsigned int 246/**
228nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, 247 * nfsacl_decode - Decode an NFSv3 ACL
229 struct posix_acl **pacl) 248 *
249 * @buf: xdr_buf containing XDR'd ACL data to decode
250 * @base: byte offset in xdr_buf where XDR'd ACL begins
251 * @aclcnt: count of ACEs in decoded posix_acl
252 * @pacl: buffer in which to place decoded posix_acl
253 *
254 * Returns the length of the decoded ACL in bytes, or a negative errno value.
255 */
256int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
257 struct posix_acl **pacl)
230{ 258{
231 struct nfsacl_decode_desc nfsacl_desc = { 259 struct nfsacl_decode_desc nfsacl_desc = {
232 .desc = { 260 .desc = {
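
The nfsacl_encode() rewrite above trades posix_acl_alloc() for a fixed-size object embedded in a caller-owned struct, because the function can run where an allocation failure would be fatal and the fake ACL has a hard upper bound of four entries. A simplified standalone sketch of that stack-embedding pattern; the types are stand-ins, not the kernel's posix_acl:

    #include <string.h>

    struct small_ace { int tag, id, perm; };
    struct small_acl {
            int count;
            struct small_ace entries[4];  /* fixed maximum, so no allocator */
    };

    static void build_fake_acl(struct small_acl *out /* caller's stack */)
    {
            memset(out, 0, sizeof(*out)); /* sizeof(*out), not sizeof(out) */
            out->count = 4;
            /* entries[0..3] are then filled in canonical order */
    }
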
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
new file mode 100644
index 000000000000..34e5c40af5ef
--- /dev/null
+++ b/fs/nfsd/acl.h
@@ -0,0 +1,59 @@
1/*
2 * Common NFSv4 ACL handling definitions.
3 *
4 * Copyright (c) 2002 The Regents of the University of Michigan.
5 * All rights reserved.
6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFS4_ACL_H
36#define LINUX_NFS4_ACL_H
37
38#include <linux/posix_acl.h>
39
40/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
41 * fit in a page: */
42#define NFS4_ACL_MAX 170
43
44struct nfs4_acl *nfs4_acl_new(int);
45int nfs4_acl_get_whotype(char *, u32);
46int nfs4_acl_write_who(int who, char *p);
47int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
48 uid_t who, u32 mask);
49
50#define NFS4_ACL_TYPE_DEFAULT 0x01
51#define NFS4_ACL_DIR 0x02
52#define NFS4_ACL_OWNER 0x04
53
54struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
55 struct posix_acl *, unsigned int flags);
56int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
57 struct posix_acl **, unsigned int flags);
58
59#endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c0fcb7ab7f6d..8b31e5f8795d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * NFS exporting and validation. 2 * NFS exporting and validation.
4 * 3 *
@@ -1444,9 +1443,6 @@ static struct flags {
1444 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1443 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
1445 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, 1444 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
1446 { NFSEXP_V4ROOT, {"v4root", ""}}, 1445 { NFSEXP_V4ROOT, {"v4root", ""}},
1447#ifdef MSNFS
1448 { NFSEXP_MSNFS, {"msnfs", ""}},
1449#endif
1450 { 0, {"", ""}} 1446 { 0, {"", ""}}
1451}; 1447};
1452 1448
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
new file mode 100644
index 000000000000..2f3be1321534
--- /dev/null
+++ b/fs/nfsd/idmap.h
@@ -0,0 +1,62 @@
1/*
2 * Mapping of UID to name and vice versa.
3 *
4 * Copyright (c) 2002, 2003 The Regents of the University of
5 * Michigan. All rights reserved.
6 *
7 * Marius Aamodt Eriksen <marius@umich.edu>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
23 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
29 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35#ifndef LINUX_NFSD_IDMAP_H
36#define LINUX_NFSD_IDMAP_H
37
38#include <linux/in.h>
39#include <linux/sunrpc/svc.h>
40
41/* XXX from linux/nfs_idmap.h */
42#define IDMAP_NAMESZ 128
43
44#ifdef CONFIG_NFSD_V4
45int nfsd_idmap_init(void);
46void nfsd_idmap_shutdown(void);
47#else
48static inline int nfsd_idmap_init(void)
49{
50 return 0;
51}
52static inline void nfsd_idmap_shutdown(void)
53{
54}
55#endif
56
57__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
58__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
59int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
60int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
61
62#endif /* LINUX_NFSD_IDMAP_H */
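
Most of the new idmap.h is license boilerplate, but the CONFIG_NFSD_V4 block at the end uses a standard kernel header idiom worth noting: when the feature is compiled out, inline no-op stubs with identical signatures keep every call site free of #ifdefs, and the optimizer discards the calls. A generic sketch of the pattern, with CONFIG_FEATURE and feature_* as made-up names:

    #ifdef CONFIG_FEATURE
    int feature_init(void);
    void feature_shutdown(void);
    #else
    static inline int feature_init(void)
    {
            return 0;            /* succeed silently when compiled out */
    }
    static inline void feature_shutdown(void)
    {
    }
    #endif
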
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 5b7e3021e06b..2247fc91d5e9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
151 __be32 nfserr; 151 __be32 nfserr;
152 u32 max_blocksize = svc_max_payload(rqstp); 152 u32 max_blocksize = svc_max_payload(rqstp);
153 153
154 dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", 154 dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
155 SVCFH_fmt(&argp->fh), 155 SVCFH_fmt(&argp->fh),
156 (unsigned long) argp->count, 156 (unsigned long) argp->count,
157 (unsigned long) argp->offset); 157 (unsigned long long) argp->offset);
158 158
159 /* Obtain buffer pointer for payload. 159 /* Obtain buffer pointer for payload.
160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) 160 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
191 __be32 nfserr; 191 __be32 nfserr;
192 unsigned long cnt = argp->len; 192 unsigned long cnt = argp->len;
193 193
194 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 194 dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
195 SVCFH_fmt(&argp->fh), 195 SVCFH_fmt(&argp->fh),
196 argp->len, 196 argp->len,
197 (unsigned long) argp->offset, 197 (unsigned long long) argp->offset,
198 argp->stable? " stable" : ""); 198 argp->stable? " stable" : "");
199 199
200 fh_copy(&resp->fh, &argp->fh); 200 fh_copy(&resp->fh, &argp->fh);
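
Both nfs3proc.c hunks above fix the same printf-format truncation: argp->offset is 64 bits wide, and casting it to unsigned long silently drops the high word on 32-bit builds. A standalone demonstration of the difference:

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    int main(void)
    {
            uint64_t offset = 0x100000000ULL;  /* an offset beyond 4 GiB */

            printf("bad:  %lu\n", (unsigned long) offset);       /* 0 on 32-bit */
            printf("good: %llu\n", (unsigned long long) offset); /* always right */
            printf("also: %" PRIu64 "\n", offset);
            return 0;
    }
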
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index e48052615159..ad88f1c0a4c3 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,7 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/nfs4_acl.h> 39#include "acl.h"
40 40
41 41
42/* mode bit translations: */ 42/* mode bit translations: */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 21a63da305ff..3be975e18919 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -628,10 +628,8 @@ static int max_cb_time(void)
628 return max(nfsd4_lease/10, (time_t)1) * HZ; 628 return max(nfsd4_lease/10, (time_t)1) * HZ;
629} 629}
630 630
631/* Reference counting, callback cleanup, etc., all look racy as heck.
632 * And why is cl_cb_set an atomic? */
633 631
634int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn) 632static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
635{ 633{
636 struct rpc_timeout timeparms = { 634 struct rpc_timeout timeparms = {
637 .to_initval = max_cb_time(), 635 .to_initval = max_cb_time(),
@@ -641,6 +639,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
641 .net = &init_net, 639 .net = &init_net,
642 .address = (struct sockaddr *) &conn->cb_addr, 640 .address = (struct sockaddr *) &conn->cb_addr,
643 .addrsize = conn->cb_addrlen, 641 .addrsize = conn->cb_addrlen,
642 .saddress = (struct sockaddr *) &conn->cb_saddr,
644 .timeout = &timeparms, 643 .timeout = &timeparms,
645 .program = &cb_program, 644 .program = &cb_program,
646 .version = 0, 645 .version = 0,
@@ -657,6 +656,10 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
657 args.protocol = XPRT_TRANSPORT_TCP; 656 args.protocol = XPRT_TRANSPORT_TCP;
658 clp->cl_cb_ident = conn->cb_ident; 657 clp->cl_cb_ident = conn->cb_ident;
659 } else { 658 } else {
659 if (!conn->cb_xprt)
660 return -EINVAL;
661 clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
662 clp->cl_cb_session = ses;
660 args.bc_xprt = conn->cb_xprt; 663 args.bc_xprt = conn->cb_xprt;
661 args.prognumber = clp->cl_cb_session->se_cb_prog; 664 args.prognumber = clp->cl_cb_session->se_cb_prog;
662 args.protocol = XPRT_TRANSPORT_BC_TCP; 665 args.protocol = XPRT_TRANSPORT_BC_TCP;
@@ -679,14 +682,20 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
679 (int)clp->cl_name.len, clp->cl_name.data, reason); 682 (int)clp->cl_name.len, clp->cl_name.data, reason);
680} 683}
681 684
685static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
686{
687 clp->cl_cb_state = NFSD4_CB_DOWN;
688 warn_no_callback_path(clp, reason);
689}
690
682static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) 691static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
683{ 692{
684 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); 693 struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
685 694
686 if (task->tk_status) 695 if (task->tk_status)
687 warn_no_callback_path(clp, task->tk_status); 696 nfsd4_mark_cb_down(clp, task->tk_status);
688 else 697 else
689 atomic_set(&clp->cl_cb_set, 1); 698 clp->cl_cb_state = NFSD4_CB_UP;
690} 699}
691 700
692static const struct rpc_call_ops nfsd4_cb_probe_ops = { 701static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -709,6 +718,11 @@ int set_callback_cred(void)
709 718
710static struct workqueue_struct *callback_wq; 719static struct workqueue_struct *callback_wq;
711 720
721static void run_nfsd4_cb(struct nfsd4_callback *cb)
722{
723 queue_work(callback_wq, &cb->cb_work);
724}
725
712static void do_probe_callback(struct nfs4_client *clp) 726static void do_probe_callback(struct nfs4_client *clp)
713{ 727{
714 struct nfsd4_callback *cb = &clp->cl_cb_null; 728 struct nfsd4_callback *cb = &clp->cl_cb_null;
@@ -723,7 +737,7 @@ static void do_probe_callback(struct nfs4_client *clp)
723 737
724 cb->cb_ops = &nfsd4_cb_probe_ops; 738 cb->cb_ops = &nfsd4_cb_probe_ops;
725 739
726 queue_work(callback_wq, &cb->cb_work); 740 run_nfsd4_cb(cb);
727} 741}
728 742
729/* 743/*
@@ -732,14 +746,21 @@ static void do_probe_callback(struct nfs4_client *clp)
732 */ 746 */
733void nfsd4_probe_callback(struct nfs4_client *clp) 747void nfsd4_probe_callback(struct nfs4_client *clp)
734{ 748{
749 /* XXX: atomicity? Also, should we be using cl_cb_flags? */
750 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
735 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); 751 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
736 do_probe_callback(clp); 752 do_probe_callback(clp);
737} 753}
738 754
739void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) 755void nfsd4_probe_callback_sync(struct nfs4_client *clp)
740{ 756{
741 BUG_ON(atomic_read(&clp->cl_cb_set)); 757 nfsd4_probe_callback(clp);
758 flush_workqueue(callback_wq);
759}
742 760
761void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
762{
763 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
743 spin_lock(&clp->cl_lock); 764 spin_lock(&clp->cl_lock);
744 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); 765 memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
745 spin_unlock(&clp->cl_lock); 766 spin_unlock(&clp->cl_lock);
@@ -750,24 +771,14 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
750 * If the slot is available, then mark it busy. Otherwise, set the 771 * If the slot is available, then mark it busy. Otherwise, set the
751 * thread for sleeping on the callback RPC wait queue. 772 * thread for sleeping on the callback RPC wait queue.
752 */ 773 */
753static int nfsd41_cb_setup_sequence(struct nfs4_client *clp, 774static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
754 struct rpc_task *task)
755{ 775{
756 u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
757 int status = 0;
758
759 dprintk("%s: %u:%u:%u:%u\n", __func__,
760 ptr[0], ptr[1], ptr[2], ptr[3]);
761
762 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { 776 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
763 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); 777 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
764 dprintk("%s slot is busy\n", __func__); 778 dprintk("%s slot is busy\n", __func__);
765 status = -EAGAIN; 779 return false;
766 goto out;
767 } 780 }
768out: 781 return true;
769 dprintk("%s status=%d\n", __func__, status);
770 return status;
771} 782}
772 783
773/* 784/*
@@ -780,20 +791,19 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
780 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 791 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
781 struct nfs4_client *clp = dp->dl_client; 792 struct nfs4_client *clp = dp->dl_client;
782 u32 minorversion = clp->cl_minorversion; 793 u32 minorversion = clp->cl_minorversion;
783 int status = 0;
784 794
785 cb->cb_minorversion = minorversion; 795 cb->cb_minorversion = minorversion;
786 if (minorversion) { 796 if (minorversion) {
787 status = nfsd41_cb_setup_sequence(clp, task); 797 if (!nfsd41_cb_get_slot(clp, task))
788 if (status) {
789 if (status != -EAGAIN) {
790 /* terminate rpc task */
791 task->tk_status = status;
792 task->tk_action = NULL;
793 }
794 return; 798 return;
795 }
796 } 799 }
800 spin_lock(&clp->cl_lock);
801 if (list_empty(&cb->cb_per_client)) {
802 /* This is the first call, not a restart */
803 cb->cb_done = false;
804 list_add(&cb->cb_per_client, &clp->cl_callbacks);
805 }
806 spin_unlock(&clp->cl_lock);
797 rpc_call_start(task); 807 rpc_call_start(task);
798} 808}
799 809
@@ -829,15 +839,18 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
829 839
830 nfsd4_cb_done(task, calldata); 840 nfsd4_cb_done(task, calldata);
831 841
832 if (current_rpc_client == NULL) { 842 if (current_rpc_client != task->tk_client) {
833 /* We're shutting down; give up. */ 843 /* We're shutting down or changing cl_cb_client; leave
834 /* XXX: err, or is it ok just to fall through 844 * it to nfsd4_process_cb_update to restart the call if
835 * and rpc_restart_call? */ 845 * necessary. */
836 return; 846 return;
837 } 847 }
838 848
849 if (cb->cb_done)
850 return;
839 switch (task->tk_status) { 851 switch (task->tk_status) {
840 case 0: 852 case 0:
853 cb->cb_done = true;
841 return; 854 return;
842 case -EBADHANDLE: 855 case -EBADHANDLE:
843 case -NFS4ERR_BAD_STATEID: 856 case -NFS4ERR_BAD_STATEID:
@@ -846,32 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
846 break; 859 break;
847 default: 860 default:
848 /* Network partition? */ 861 /* Network partition? */
849 atomic_set(&clp->cl_cb_set, 0); 862 nfsd4_mark_cb_down(clp, task->tk_status);
850 warn_no_callback_path(clp, task->tk_status);
851 if (current_rpc_client != task->tk_client) {
852 /* queue a callback on the new connection: */
853 atomic_inc(&dp->dl_count);
854 nfsd4_cb_recall(dp);
855 return;
856 }
857 } 863 }
858 if (dp->dl_retries--) { 864 if (dp->dl_retries--) {
859 rpc_delay(task, 2*HZ); 865 rpc_delay(task, 2*HZ);
860 task->tk_status = 0; 866 task->tk_status = 0;
861 rpc_restart_call_prepare(task); 867 rpc_restart_call_prepare(task);
862 return; 868 return;
863 } else {
864 atomic_set(&clp->cl_cb_set, 0);
865 warn_no_callback_path(clp, task->tk_status);
866 } 869 }
870 nfsd4_mark_cb_down(clp, task->tk_status);
871 cb->cb_done = true;
867} 872}
868 873
869static void nfsd4_cb_recall_release(void *calldata) 874static void nfsd4_cb_recall_release(void *calldata)
870{ 875{
871 struct nfsd4_callback *cb = calldata; 876 struct nfsd4_callback *cb = calldata;
877 struct nfs4_client *clp = cb->cb_clp;
872 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); 878 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
873 879
874 nfs4_put_delegation(dp); 880 if (cb->cb_done) {
881 spin_lock(&clp->cl_lock);
882 list_del(&cb->cb_per_client);
883 spin_unlock(&clp->cl_lock);
884 nfs4_put_delegation(dp);
885 }
875} 886}
876 887
877static const struct rpc_call_ops nfsd4_cb_recall_ops = { 888static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -906,16 +917,33 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
906 flush_workqueue(callback_wq); 917 flush_workqueue(callback_wq);
907} 918}
908 919
909void nfsd4_release_cb(struct nfsd4_callback *cb) 920static void nfsd4_release_cb(struct nfsd4_callback *cb)
910{ 921{
911 if (cb->cb_ops->rpc_release) 922 if (cb->cb_ops->rpc_release)
912 cb->cb_ops->rpc_release(cb); 923 cb->cb_ops->rpc_release(cb);
913} 924}
914 925
915void nfsd4_process_cb_update(struct nfsd4_callback *cb) 926/* requires cl_lock: */
927static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
928{
929 struct nfsd4_session *s;
930 struct nfsd4_conn *c;
931
932 list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
933 list_for_each_entry(c, &s->se_conns, cn_persession) {
934 if (c->cn_flags & NFS4_CDFC4_BACK)
935 return c;
936 }
937 }
938 return NULL;
939}
940
941static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
916{ 942{
917 struct nfs4_cb_conn conn; 943 struct nfs4_cb_conn conn;
918 struct nfs4_client *clp = cb->cb_clp; 944 struct nfs4_client *clp = cb->cb_clp;
945 struct nfsd4_session *ses = NULL;
946 struct nfsd4_conn *c;
919 int err; 947 int err;
920 948
921 /* 949 /*
@@ -926,6 +954,10 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
926 rpc_shutdown_client(clp->cl_cb_client); 954 rpc_shutdown_client(clp->cl_cb_client);
927 clp->cl_cb_client = NULL; 955 clp->cl_cb_client = NULL;
928 } 956 }
957 if (clp->cl_cb_conn.cb_xprt) {
958 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
959 clp->cl_cb_conn.cb_xprt = NULL;
960 }
929 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags)) 961 if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
930 return; 962 return;
931 spin_lock(&clp->cl_lock); 963 spin_lock(&clp->cl_lock);
@@ -936,11 +968,22 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
936 BUG_ON(!clp->cl_cb_flags); 968 BUG_ON(!clp->cl_cb_flags);
937 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags); 969 clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
938 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); 970 memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
971 c = __nfsd4_find_backchannel(clp);
972 if (c) {
973 svc_xprt_get(c->cn_xprt);
974 conn.cb_xprt = c->cn_xprt;
975 ses = c->cn_session;
976 }
939 spin_unlock(&clp->cl_lock); 977 spin_unlock(&clp->cl_lock);
940 978
941 err = setup_callback_client(clp, &conn); 979 err = setup_callback_client(clp, &conn, ses);
942 if (err) 980 if (err) {
943 warn_no_callback_path(clp, err); 981 warn_no_callback_path(clp, err);
982 return;
983 }
984 /* Yay, the callback channel's back! Restart any callbacks: */
985 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
986 run_nfsd4_cb(cb);
944} 987}
945 988
946void nfsd4_do_callback_rpc(struct work_struct *w) 989void nfsd4_do_callback_rpc(struct work_struct *w)
@@ -965,10 +1008,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
965void nfsd4_cb_recall(struct nfs4_delegation *dp) 1008void nfsd4_cb_recall(struct nfs4_delegation *dp)
966{ 1009{
967 struct nfsd4_callback *cb = &dp->dl_recall; 1010 struct nfsd4_callback *cb = &dp->dl_recall;
1011 struct nfs4_client *clp = dp->dl_client;
968 1012
969 dp->dl_retries = 1; 1013 dp->dl_retries = 1;
970 cb->cb_op = dp; 1014 cb->cb_op = dp;
971 cb->cb_clp = dp->dl_client; 1015 cb->cb_clp = clp;
972 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; 1016 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
973 cb->cb_msg.rpc_argp = cb; 1017 cb->cb_msg.rpc_argp = cb;
974 cb->cb_msg.rpc_resp = cb; 1018 cb->cb_msg.rpc_resp = cb;
@@ -977,5 +1021,8 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
977 cb->cb_ops = &nfsd4_cb_recall_ops; 1021 cb->cb_ops = &nfsd4_cb_recall_ops;
978 dp->dl_retries = 1; 1022 dp->dl_retries = 1;
979 1023
980 queue_work(callback_wq, &dp->dl_recall.cb_work); 1024 INIT_LIST_HEAD(&cb->cb_per_client);
1025 cb->cb_done = true;
1026
1027 run_nfsd4_cb(&dp->dl_recall);
981} 1028}
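
The largest change in nfs4callback.c is the cb_done / cb_per_client bookkeeping: each in-flight callback is parked on a per-client list until it genuinely completes, so that when nfsd4_process_cb_update() rebuilds the channel it can simply requeue whatever is still pending instead of the old ad-hoc re-dispatch from the done handler. A much-simplified sketch of that pattern; names echo the patch but the types are stand-ins, run_cb() is a stub for the real queue_work(), and this is kernel-style code, not buildable outside the tree:

    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct client {
            spinlock_t lock;
            struct list_head callbacks;        /* all not-yet-done callbacks */
    };

    struct cb {
            struct list_head cb_per_client;
            bool cb_done;
    };

    static void run_cb(struct cb *cb) { /* queue_work() in the real code */ }

    static void cb_prepare(struct client *clp, struct cb *cb)
    {
            spin_lock(&clp->lock);
            if (list_empty(&cb->cb_per_client)) {  /* first run, not a restart */
                    cb->cb_done = false;
                    list_add(&cb->cb_per_client, &clp->callbacks);
            }
            spin_unlock(&clp->lock);
    }

    static void channel_rebuilt(struct client *clp)
    {
            struct cb *cb;

            list_for_each_entry(cb, &clp->callbacks, cb_per_client)
                    run_cb(cb);                 /* requeue anything pending */
    }
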
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index f0695e815f0e..6d2c397d458b 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -33,10 +33,11 @@
33 */ 33 */
34 34
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/nfsd_idmap.h>
37#include <linux/seq_file.h> 36#include <linux/seq_file.h>
38#include <linux/sched.h> 37#include <linux/sched.h>
39#include <linux/slab.h> 38#include <linux/slab.h>
39#include "idmap.h"
40#include "nfsd.h"
40 41
41/* 42/*
42 * Cache entry 43 * Cache entry
@@ -514,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp)
514 return clp->name; 515 return clp->name;
515} 516}
516 517
517static int 518static __be32
518idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, 519idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
519 uid_t *id) 520 uid_t *id)
520{ 521{
@@ -524,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
524 int ret; 525 int ret;
525 526
526 if (namelen + 1 > sizeof(key.name)) 527 if (namelen + 1 > sizeof(key.name))
527 return -EINVAL; 528 return nfserr_badowner;
528 memcpy(key.name, name, namelen); 529 memcpy(key.name, name, namelen);
529 key.name[namelen] = '\0'; 530 key.name[namelen] = '\0';
530 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); 531 strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
531 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item); 532 ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
532 if (ret == -ENOENT) 533 if (ret == -ENOENT)
533 ret = -ESRCH; /* nfserr_badname */ 534 return nfserr_badowner;
534 if (ret) 535 if (ret)
535 return ret; 536 return nfserrno(ret);
536 *id = item->id; 537 *id = item->id;
537 cache_put(&item->h, &nametoid_cache); 538 cache_put(&item->h, &nametoid_cache);
538 return 0; 539 return 0;
@@ -560,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
560 return ret; 561 return ret;
561} 562}
562 563
563int 564__be32
564nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, 565nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
565 __u32 *id) 566 __u32 *id)
566{ 567{
567 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id); 568 return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
568} 569}
569 570
570int 571__be32
571nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, 572nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
572 __u32 *id) 573 __u32 *id)
573{ 574{
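
The int to __be32 conversions above make the idmap entry points return wire-order NFS status codes directly (nfserr_badowner, or nfserrno() for an unexpected errno) instead of leaking negative errnos into XDR code. A userspace sketch of that convert-once-at-the-boundary rule; errno_to_nfs() is a crude stand-in for nfserrno(), not the real mapping:

    #include <stdint.h>
    #include <errno.h>
    #include <arpa/inet.h>

    #define NFSERR_BADOWNER htonl(10039)   /* wire-order protocol status */

    static uint32_t errno_to_nfs(int err)  /* crude nfserrno() stand-in */
    {
            return htonl((uint32_t)(-err));
    }

    static uint32_t map_lookup_result(int err)
    {
            if (err == -ENOENT)
                    return NFSERR_BADOWNER; /* unknown name: protocol error */
            if (err)
                    return errno_to_nfs(err);
            return 0;                       /* nfs_ok */
    }
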
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0cdfd022bb7b..db52546143d1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
604 return status; 604 return status;
605} 605}
606 606
607static __be32 607static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
608nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
609 void *arg)
610{ 608{
611 struct svc_fh tmp_fh; 609 struct svc_fh tmp_fh;
612 __be32 ret; 610 __be32 ret;
@@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
615 ret = exp_pseudoroot(rqstp, &tmp_fh); 613 ret = exp_pseudoroot(rqstp, &tmp_fh);
616 if (ret) 614 if (ret)
617 return ret; 615 return ret;
618 if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) { 616 if (tmp_fh.fh_dentry == fh->fh_dentry) {
619 fh_put(&tmp_fh); 617 fh_put(&tmp_fh);
620 return nfserr_noent; 618 return nfserr_noent;
621 } 619 }
622 fh_put(&tmp_fh); 620 fh_put(&tmp_fh);
623 return nfsd_lookup(rqstp, &cstate->current_fh, 621 return nfsd_lookup(rqstp, fh, "..", 2, fh);
624 "..", 2, &cstate->current_fh); 622}
623
624static __be32
625nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
626 void *arg)
627{
628 return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
625} 629}
626 630
627static __be32 631static __be32
@@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
769 } else 773 } else
770 secinfo->si_exp = exp; 774 secinfo->si_exp = exp;
771 dput(dentry); 775 dput(dentry);
776 if (cstate->minorversion)
777 /* See rfc 5661 section 2.6.3.1.1.8 */
778 fh_put(&cstate->current_fh);
772 return err; 779 return err;
773} 780}
774 781
775static __be32 782static __be32
783nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
784 struct nfsd4_secinfo_no_name *sin)
785{
786 __be32 err;
787
788 switch (sin->sin_style) {
789 case NFS4_SECINFO_STYLE4_CURRENT_FH:
790 break;
791 case NFS4_SECINFO_STYLE4_PARENT:
792 err = nfsd4_do_lookupp(rqstp, &cstate->current_fh);
793 if (err)
794 return err;
795 break;
796 default:
797 return nfserr_inval;
798 }
799 exp_get(cstate->current_fh.fh_export);
800 sin->sin_exp = cstate->current_fh.fh_export;
801 fh_put(&cstate->current_fh);
802 return nfs_ok;
803}
804
805static __be32
776nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 806nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
777 struct nfsd4_setattr *setattr) 807 struct nfsd4_setattr *setattr)
778{ 808{
@@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum);
974 * Also note, enforced elsewhere: 1004 * Also note, enforced elsewhere:
975 * - SEQUENCE other than as first op results in 1005 * - SEQUENCE other than as first op results in
976 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) 1006 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
977 * - BIND_CONN_TO_SESSION must be the only op in its compound 1007 * - BIND_CONN_TO_SESSION must be the only op in its compound.
978 * (Will be enforced in nfsd4_bind_conn_to_session().) 1008 * (Enforced in nfsd4_bind_conn_to_session().)
979 * - DESTROY_SESSION must be the final operation in a compound, if 1009 * - DESTROY_SESSION must be the final operation in a compound, if
980 * sessionid's in SEQUENCE and DESTROY_SESSION are the same. 1010 * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
981 * (Enforced in nfsd4_destroy_session().) 1011 * (Enforced in nfsd4_destroy_session().)
@@ -1126,10 +1156,6 @@ encode_op:
1126 1156
1127 nfsd4_increment_op_stats(op->opnum); 1157 nfsd4_increment_op_stats(op->opnum);
1128 } 1158 }
1129 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1130 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1131 status = nfserr_jukebox;
1132 }
1133 1159
1134 resp->cstate.status = status; 1160 resp->cstate.status = status;
1135 fh_put(&resp->cstate.current_fh); 1161 fh_put(&resp->cstate.current_fh);
@@ -1300,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1300 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1326 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1301 .op_name = "OP_EXCHANGE_ID", 1327 .op_name = "OP_EXCHANGE_ID",
1302 }, 1328 },
1329 [OP_BIND_CONN_TO_SESSION] = {
1330 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
1331 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1332 .op_name = "OP_BIND_CONN_TO_SESSION",
1333 },
1303 [OP_CREATE_SESSION] = { 1334 [OP_CREATE_SESSION] = {
1304 .op_func = (nfsd4op_func)nfsd4_create_session, 1335 .op_func = (nfsd4op_func)nfsd4_create_session,
1305 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1336 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
@@ -1320,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = {
1320 .op_flags = ALLOWED_WITHOUT_FH, 1351 .op_flags = ALLOWED_WITHOUT_FH,
1321 .op_name = "OP_RECLAIM_COMPLETE", 1352 .op_name = "OP_RECLAIM_COMPLETE",
1322 }, 1353 },
1354 [OP_SECINFO_NO_NAME] = {
1355 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
1356 .op_name = "OP_SECINFO_NO_NAME",
1357 },
1323}; 1358};
1324 1359
1325static const char *nfsd4_op_name(unsigned opnum) 1360static const char *nfsd4_op_name(unsigned opnum)
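
Both OP_* additions above are single designated-initializer entries in the sparse nfsd4_ops[] array, which is what makes wiring up a new NFSv4.1 operation this compact: the operation number is the array index and unlisted slots stay zeroed. A generic standalone sketch of the idiom, with simplified types (the real table also carries op_flags):

    #include <stdio.h>

    enum { OP_A = 3, OP_B = 42, OP_MAX };

    struct op {
            int (*func)(void *arg);
            const char *name;
    };

    static int do_a(void *arg) { (void)arg; return 0; }
    static int do_b(void *arg) { (void)arg; return 0; }

    static const struct op ops[OP_MAX] = {
            [OP_A] = { .func = do_a, .name = "OP_A" },
            [OP_B] = { .func = do_b, .name = "OP_B" },
            /* unlisted opnums stay zeroed => "unknown operation" */
    };

    int main(void)
    {
            printf("%s\n", ops[OP_B].name);
            return 0;
    }
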
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7e26caab2a26..ffb59ef6f82f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child)
302{ 302{
303 int status; 303 int status;
304 304
305 /* note: we currently use this path only for minorversion 0 */
306 if (nfs4_has_reclaimed_state(child->d_name.name, false)) 305 if (nfs4_has_reclaimed_state(child->d_name.name, false))
307 return 0; 306 return 0;
308 307
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fbd18c3074bb..d98d0213285d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -230,7 +230,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
230 dp->dl_client = clp; 230 dp->dl_client = clp;
231 get_nfs4_file(fp); 231 get_nfs4_file(fp);
232 dp->dl_file = fp; 232 dp->dl_file = fp;
233 nfs4_file_get_access(fp, O_RDONLY); 233 dp->dl_vfs_file = find_readable_file(fp);
234 get_file(dp->dl_vfs_file);
234 dp->dl_flock = NULL; 235 dp->dl_flock = NULL;
235 dp->dl_type = type; 236 dp->dl_type = type;
236 dp->dl_stateid.si_boot = boot_time; 237 dp->dl_stateid.si_boot = boot_time;
@@ -252,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
252 if (atomic_dec_and_test(&dp->dl_count)) { 253 if (atomic_dec_and_test(&dp->dl_count)) {
253 dprintk("NFSD: freeing dp %p\n",dp); 254 dprintk("NFSD: freeing dp %p\n",dp);
254 put_nfs4_file(dp->dl_file); 255 put_nfs4_file(dp->dl_file);
256 fput(dp->dl_vfs_file);
255 kmem_cache_free(deleg_slab, dp); 257 kmem_cache_free(deleg_slab, dp);
256 num_delegations--; 258 num_delegations--;
257 } 259 }
@@ -265,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
265static void 267static void
266nfs4_close_delegation(struct nfs4_delegation *dp) 268nfs4_close_delegation(struct nfs4_delegation *dp)
267{ 269{
268 struct file *filp = find_readable_file(dp->dl_file);
269
270 dprintk("NFSD: close_delegation dp %p\n",dp); 270 dprintk("NFSD: close_delegation dp %p\n",dp);
271 /* XXX: do we even need this check?: */
271 if (dp->dl_flock) 272 if (dp->dl_flock)
272 vfs_setlease(filp, F_UNLCK, &dp->dl_flock); 273 vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
273 nfs4_file_put_access(dp->dl_file, O_RDONLY);
274} 274}
275 275
276/* Called under the state lock. */ 276/* Called under the state lock. */
@@ -642,6 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
642 free_conn(c); 642 free_conn(c);
643 } 643 }
644 spin_unlock(&clp->cl_lock); 644 spin_unlock(&clp->cl_lock);
645 nfsd4_probe_callback(clp);
645} 646}
646 647
647static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) 648static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
@@ -679,15 +680,12 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn)
679 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); 680 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
680} 681}
681 682
682static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) 683static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir)
683{ 684{
684 struct nfsd4_conn *conn; 685 struct nfsd4_conn *conn;
685 u32 flags = NFS4_CDFC4_FORE;
686 int ret; 686 int ret;
687 687
688 if (ses->se_flags & SESSION4_BACK_CHAN) 688 conn = alloc_conn(rqstp, dir);
689 flags |= NFS4_CDFC4_BACK;
690 conn = alloc_conn(rqstp, flags);
691 if (!conn) 689 if (!conn)
692 return nfserr_jukebox; 690 return nfserr_jukebox;
693 nfsd4_hash_conn(conn, ses); 691 nfsd4_hash_conn(conn, ses);
@@ -698,6 +696,17 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
698 return nfs_ok; 696 return nfs_ok;
699} 697}
700 698
699static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses)
700{
701 u32 dir = NFS4_CDFC4_FORE;
702
703 if (ses->se_flags & SESSION4_BACK_CHAN)
704 dir |= NFS4_CDFC4_BACK;
705
706 return nfsd4_new_conn(rqstp, ses, dir);
707}
708
709/* must be called under client_lock */
701static void nfsd4_del_conns(struct nfsd4_session *s) 710static void nfsd4_del_conns(struct nfsd4_session *s)
702{ 711{
703 struct nfs4_client *clp = s->se_client; 712 struct nfs4_client *clp = s->se_client;
@@ -749,6 +758,8 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
749 */ 758 */
750 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); 759 slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
751 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); 760 numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
761 if (numslots < 1)
762 return NULL;
752 763
753 new = alloc_session(slotsize, numslots); 764 new = alloc_session(slotsize, numslots);
754 if (!new) { 765 if (!new) {
@@ -769,25 +780,30 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
769 idx = hash_sessionid(&new->se_sessionid); 780 idx = hash_sessionid(&new->se_sessionid);
770 spin_lock(&client_lock); 781 spin_lock(&client_lock);
771 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 782 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
783 spin_lock(&clp->cl_lock);
772 list_add(&new->se_perclnt, &clp->cl_sessions); 784 list_add(&new->se_perclnt, &clp->cl_sessions);
785 spin_unlock(&clp->cl_lock);
773 spin_unlock(&client_lock); 786 spin_unlock(&client_lock);
774 787
775 status = nfsd4_new_conn(rqstp, new); 788 status = nfsd4_new_conn_from_crses(rqstp, new);
776 /* whoops: benny points out, status is ignored! (err, or bogus) */ 789 /* whoops: benny points out, status is ignored! (err, or bogus) */
777 if (status) { 790 if (status) {
778 free_session(&new->se_ref); 791 free_session(&new->se_ref);
779 return NULL; 792 return NULL;
780 } 793 }
781 if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) { 794 if (cses->flags & SESSION4_BACK_CHAN) {
782 struct sockaddr *sa = svc_addr(rqstp); 795 struct sockaddr *sa = svc_addr(rqstp);
783 796 /*
784 clp->cl_cb_session = new; 797 * This is a little silly; with sessions there's no real
785 clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt; 798 * use for the callback address. Use the peer address
786 svc_xprt_get(rqstp->rq_xprt); 799 * as a reasonable default for now, but consider fixing
800 * the rpc client not to require an address in the
801 * future:
802 */
787 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); 803 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
788 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); 804 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
789 nfsd4_probe_callback(clp);
790 } 805 }
806 nfsd4_probe_callback(clp);
791 return new; 807 return new;
792} 808}
793 809
@@ -817,7 +833,9 @@ static void
817unhash_session(struct nfsd4_session *ses) 833unhash_session(struct nfsd4_session *ses)
818{ 834{
819 list_del(&ses->se_hash); 835 list_del(&ses->se_hash);
836 spin_lock(&ses->se_client->cl_lock);
820 list_del(&ses->se_perclnt); 837 list_del(&ses->se_perclnt);
838 spin_unlock(&ses->se_client->cl_lock);
821} 839}
822 840
823/* must be called under the client_lock */ 841/* must be called under the client_lock */
@@ -923,8 +941,10 @@ unhash_client_locked(struct nfs4_client *clp)
923 941
924 mark_client_expired(clp); 942 mark_client_expired(clp);
925 list_del(&clp->cl_lru); 943 list_del(&clp->cl_lru);
944 spin_lock(&clp->cl_lock);
926 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) 945 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
927 list_del_init(&ses->se_hash); 946 list_del_init(&ses->se_hash);
947 spin_unlock(&clp->cl_lock);
928} 948}
929 949
930static void 950static void
@@ -1051,12 +1071,13 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1051 1071
1052 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 1072 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
1053 atomic_set(&clp->cl_refcount, 0); 1073 atomic_set(&clp->cl_refcount, 0);
1054 atomic_set(&clp->cl_cb_set, 0); 1074 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
1055 INIT_LIST_HEAD(&clp->cl_idhash); 1075 INIT_LIST_HEAD(&clp->cl_idhash);
1056 INIT_LIST_HEAD(&clp->cl_strhash); 1076 INIT_LIST_HEAD(&clp->cl_strhash);
1057 INIT_LIST_HEAD(&clp->cl_openowners); 1077 INIT_LIST_HEAD(&clp->cl_openowners);
1058 INIT_LIST_HEAD(&clp->cl_delegations); 1078 INIT_LIST_HEAD(&clp->cl_delegations);
1059 INIT_LIST_HEAD(&clp->cl_lru); 1079 INIT_LIST_HEAD(&clp->cl_lru);
1080 INIT_LIST_HEAD(&clp->cl_callbacks);
1060 spin_lock_init(&clp->cl_lock); 1081 spin_lock_init(&clp->cl_lock);
1061 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); 1082 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
1062 clp->cl_time = get_seconds(); 1083 clp->cl_time = get_seconds();
@@ -1132,54 +1153,55 @@ find_unconfirmed_client(clientid_t *clid)
1132 return NULL; 1153 return NULL;
1133} 1154}
1134 1155
1135/* 1156static bool clp_used_exchangeid(struct nfs4_client *clp)
1136 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
1137 * parameter. Matching is based on the fact the at least one of the
1138 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
1139 *
1140 * FIXME: we need to unify the clientid namespaces for nfsv4.x
1141 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
1142 * and SET_CLIENTID{,_CONFIRM}
1143 */
1144static inline int
1145match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
1146{ 1157{
1147 bool has_exchange_flags = (clp->cl_exchange_flags != 0); 1158 return clp->cl_exchange_flags != 0;
1148 return use_exchange_id == has_exchange_flags; 1159}
1149}
1150 1160
1151static struct nfs4_client * 1161static struct nfs4_client *
1152find_confirmed_client_by_str(const char *dname, unsigned int hashval, 1162find_confirmed_client_by_str(const char *dname, unsigned int hashval)
1153 bool use_exchange_id)
1154{ 1163{
1155 struct nfs4_client *clp; 1164 struct nfs4_client *clp;
1156 1165
1157 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 1166 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
1158 if (same_name(clp->cl_recdir, dname) && 1167 if (same_name(clp->cl_recdir, dname))
1159 match_clientid_establishment(clp, use_exchange_id))
1160 return clp; 1168 return clp;
1161 } 1169 }
1162 return NULL; 1170 return NULL;
1163} 1171}
1164 1172
1165static struct nfs4_client * 1173static struct nfs4_client *
1166find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, 1174find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
1167 bool use_exchange_id)
1168{ 1175{
1169 struct nfs4_client *clp; 1176 struct nfs4_client *clp;
1170 1177
1171 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 1178 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
1172 if (same_name(clp->cl_recdir, dname) && 1179 if (same_name(clp->cl_recdir, dname))
1173 match_clientid_establishment(clp, use_exchange_id))
1174 return clp; 1180 return clp;
1175 } 1181 }
1176 return NULL; 1182 return NULL;
1177} 1183}
1178 1184
1185static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
1186{
1187 switch (family) {
1188 case AF_INET:
1189 ((struct sockaddr_in *)sa)->sin_family = AF_INET;
1190 ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
1191 return;
1192 case AF_INET6:
1193 ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
1194 ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
1195 return;
1196 }
1197}
1198
1179static void 1199static void
1180gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) 1200gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
1181{ 1201{
1182 struct nfs4_cb_conn *conn = &clp->cl_cb_conn; 1202 struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
1203 struct sockaddr *sa = svc_addr(rqstp);
1204 u32 scopeid = rpc_get_scope_id(sa);
1183 unsigned short expected_family; 1205 unsigned short expected_family;
1184 1206
1185 /* Currently, we only support tcp and tcp6 for the callback channel */ 1207 /* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1205,6 +1227,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
1205 1227
1206 conn->cb_prog = se->se_callback_prog; 1228 conn->cb_prog = se->se_callback_prog;
1207 conn->cb_ident = se->se_callback_ident; 1229 conn->cb_ident = se->se_callback_ident;
1230 rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
1208 return; 1231 return;
1209out_err: 1232out_err:
1210 conn->cb_addr.ss_family = AF_UNSPEC; 1233 conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1344,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1344 case SP4_NONE: 1367 case SP4_NONE:
1345 break; 1368 break;
1346 case SP4_SSV: 1369 case SP4_SSV:
1347 return nfserr_encr_alg_unsupp; 1370 return nfserr_serverfault;
1348 default: 1371 default:
1349 BUG(); /* checked by xdr code */ 1372 BUG(); /* checked by xdr code */
1350 case SP4_MACH_CRED: 1373 case SP4_MACH_CRED:
@@ -1361,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1361 nfs4_lock_state(); 1384 nfs4_lock_state();
1362 status = nfs_ok; 1385 status = nfs_ok;
1363 1386
1364 conf = find_confirmed_client_by_str(dname, strhashval, true); 1387 conf = find_confirmed_client_by_str(dname, strhashval);
1365 if (conf) { 1388 if (conf) {
1389 if (!clp_used_exchangeid(conf)) {
1390 status = nfserr_clid_inuse; /* XXX: ? */
1391 goto out;
1392 }
1366 if (!same_verf(&verf, &conf->cl_verifier)) { 1393 if (!same_verf(&verf, &conf->cl_verifier)) {
1367 /* 18.35.4 case 8 */ 1394 /* 18.35.4 case 8 */
1368 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { 1395 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
@@ -1403,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1403 goto out; 1430 goto out;
1404 } 1431 }
1405 1432
1406 unconf = find_unconfirmed_client_by_str(dname, strhashval, true); 1433 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1407 if (unconf) { 1434 if (unconf) {
1408 /* 1435 /*
1409 * Possible retry or client restart. Per 18.35.4 case 4, 1436 * Possible retry or client restart. Per 18.35.4 case 4,
@@ -1560,6 +1587,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1560 status = nfs_ok; 1587 status = nfs_ok;
1561 memcpy(cr_ses->sessionid.data, new->se_sessionid.data, 1588 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
1562 NFS4_MAX_SESSIONID_LEN); 1589 NFS4_MAX_SESSIONID_LEN);
1590 memcpy(&cr_ses->fore_channel, &new->se_fchannel,
1591 sizeof(struct nfsd4_channel_attrs));
1563 cs_slot->sl_seqid++; 1592 cs_slot->sl_seqid++;
1564 cr_ses->seqid = cs_slot->sl_seqid; 1593 cr_ses->seqid = cs_slot->sl_seqid;
1565 1594
@@ -1581,6 +1610,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
1581 return argp->opcnt == resp->opcnt; 1610 return argp->opcnt == resp->opcnt;
1582} 1611}
1583 1612
1613static __be32 nfsd4_map_bcts_dir(u32 *dir)
1614{
1615 switch (*dir) {
1616 case NFS4_CDFC4_FORE:
1617 case NFS4_CDFC4_BACK:
1618 return nfs_ok;
1619 case NFS4_CDFC4_FORE_OR_BOTH:
1620 case NFS4_CDFC4_BACK_OR_BOTH:
1621 *dir = NFS4_CDFC4_BOTH;
1622 return nfs_ok;
1623	}
1624 return nfserr_inval;
1625}
1626
1627__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1628 struct nfsd4_compound_state *cstate,
1629 struct nfsd4_bind_conn_to_session *bcts)
1630{
1631 __be32 status;
1632
1633 if (!nfsd4_last_compound_op(rqstp))
1634 return nfserr_not_only_op;
1635 spin_lock(&client_lock);
1636 cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
1637 /* Sorta weird: we only need the refcnt'ing because new_conn acquires
1638	 * client_lock itself: */
1639 if (cstate->session) {
1640 nfsd4_get_session(cstate->session);
1641 atomic_inc(&cstate->session->se_client->cl_refcount);
1642 }
1643 spin_unlock(&client_lock);
1644 if (!cstate->session)
1645 return nfserr_badsession;
1646
1647	status = nfsd4_map_bcts_dir(&bcts->dir);
1648	if (!status)
1649		nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
1650	return status;
1651}
1584static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1652static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
1585{ 1653{
1586 if (!session) 1654 if (!session)
@@ -1619,8 +1687,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
1619 spin_unlock(&client_lock); 1687 spin_unlock(&client_lock);
1620 1688
1621 nfs4_lock_state(); 1689 nfs4_lock_state();
1622 /* wait for callbacks */ 1690 nfsd4_probe_callback_sync(ses->se_client);
1623 nfsd4_shutdown_callback(ses->se_client);
1624 nfs4_unlock_state(); 1691 nfs4_unlock_state();
1625 1692
1626 nfsd4_del_conns(ses); 1693 nfsd4_del_conns(ses);
@@ -1733,8 +1800,12 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1733out: 1800out:
1734 /* Hold a session reference until done processing the compound. */ 1801 /* Hold a session reference until done processing the compound. */
1735 if (cstate->session) { 1802 if (cstate->session) {
1803 struct nfs4_client *clp = session->se_client;
1804
1736 nfsd4_get_session(cstate->session); 1805 nfsd4_get_session(cstate->session);
1737 atomic_inc(&session->se_client->cl_refcount); 1806 atomic_inc(&clp->cl_refcount);
1807 if (clp->cl_cb_state == NFSD4_CB_DOWN)
1808 seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
1738 } 1809 }
1739 kfree(conn); 1810 kfree(conn);
1740 spin_unlock(&client_lock); 1811 spin_unlock(&client_lock);
@@ -1775,7 +1846,6 @@ __be32
1775nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1846nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1776 struct nfsd4_setclientid *setclid) 1847 struct nfsd4_setclientid *setclid)
1777{ 1848{
1778 struct sockaddr *sa = svc_addr(rqstp);
1779 struct xdr_netobj clname = { 1849 struct xdr_netobj clname = {
1780 .len = setclid->se_namelen, 1850 .len = setclid->se_namelen,
1781 .data = setclid->se_name, 1851 .data = setclid->se_name,
@@ -1801,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1801 strhashval = clientstr_hashval(dname); 1871 strhashval = clientstr_hashval(dname);
1802 1872
1803 nfs4_lock_state(); 1873 nfs4_lock_state();
1804 conf = find_confirmed_client_by_str(dname, strhashval, false); 1874 conf = find_confirmed_client_by_str(dname, strhashval);
1805 if (conf) { 1875 if (conf) {
1806 /* RFC 3530 14.2.33 CASE 0: */ 1876 /* RFC 3530 14.2.33 CASE 0: */
1807 status = nfserr_clid_inuse; 1877 status = nfserr_clid_inuse;
1878 if (clp_used_exchangeid(conf))
1879 goto out;
1808 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { 1880 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1809 char addr_str[INET6_ADDRSTRLEN]; 1881 char addr_str[INET6_ADDRSTRLEN];
1810 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, 1882 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
@@ -1819,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1819 * has a description of SETCLIENTID request processing consisting 1891 * has a description of SETCLIENTID request processing consisting
1820 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1892 * of 5 bullet points, labeled as CASE0 - CASE4 below.
1821 */ 1893 */
1822 unconf = find_unconfirmed_client_by_str(dname, strhashval, false); 1894 unconf = find_unconfirmed_client_by_str(dname, strhashval);
1823 status = nfserr_resource; 1895 status = nfserr_resource;
1824 if (!conf) { 1896 if (!conf) {
1825 /* 1897 /*
@@ -1876,7 +1948,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1876 * for consistent minorversion use throughout: 1948 * for consistent minorversion use throughout:
1877 */ 1949 */
1878 new->cl_minorversion = 0; 1950 new->cl_minorversion = 0;
1879 gen_callback(new, setclid, rpc_get_scope_id(sa)); 1951 gen_callback(new, setclid, rqstp);
1880 add_to_unconfirmed(new, strhashval); 1952 add_to_unconfirmed(new, strhashval);
1881 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 1953 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
1882 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 1954 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1935,7 +2007,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1935 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 2007 if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
1936 status = nfserr_clid_inuse; 2008 status = nfserr_clid_inuse;
1937 else { 2009 else {
1938 atomic_set(&conf->cl_cb_set, 0);
1939 nfsd4_change_callback(conf, &unconf->cl_cb_conn); 2010 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
1940 nfsd4_probe_callback(conf); 2011 nfsd4_probe_callback(conf);
1941 expire_client(unconf); 2012 expire_client(unconf);
@@ -1964,7 +2035,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1964 unsigned int hash = 2035 unsigned int hash =
1965 clientstr_hashval(unconf->cl_recdir); 2036 clientstr_hashval(unconf->cl_recdir);
1966 conf = find_confirmed_client_by_str(unconf->cl_recdir, 2037 conf = find_confirmed_client_by_str(unconf->cl_recdir,
1967 hash, false); 2038 hash);
1968 if (conf) { 2039 if (conf) {
1969 nfsd4_remove_clid_dir(conf); 2040 nfsd4_remove_clid_dir(conf);
1970 expire_client(conf); 2041 expire_client(conf);
@@ -2300,41 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2300 nfsd4_cb_recall(dp); 2371 nfsd4_cb_recall(dp);
2301} 2372}
2302 2373
2303/*
2304 * The file_lock is being reaped.
2305 *
2306 * Called by locks_free_lock() with lock_flocks() held.
2307 */
2308static
2309void nfsd_release_deleg_cb(struct file_lock *fl)
2310{
2311 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2312
2313 dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count));
2314
2315 if (!(fl->fl_flags & FL_LEASE) || !dp)
2316 return;
2317 dp->dl_flock = NULL;
2318}
2319
2320/*
2321 * Called from setlease() with lock_flocks() held
2322 */
2323static
2324int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
2325{
2326 struct nfs4_delegation *onlistd =
2327 (struct nfs4_delegation *)onlist->fl_owner;
2328 struct nfs4_delegation *tryd =
2329 (struct nfs4_delegation *)try->fl_owner;
2330
2331 if (onlist->fl_lmops != try->fl_lmops)
2332 return 0;
2333
2334 return onlistd->dl_client == tryd->dl_client;
2335}
2336
2337
2338static 2374static
2339int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) 2375int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2340{ 2376{
@@ -2346,8 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2346 2382
2347static const struct lock_manager_operations nfsd_lease_mng_ops = { 2383static const struct lock_manager_operations nfsd_lease_mng_ops = {
2348 .fl_break = nfsd_break_deleg_cb, 2384 .fl_break = nfsd_break_deleg_cb,
2349 .fl_release_private = nfsd_release_deleg_cb,
2350 .fl_mylease = nfsd_same_client_deleg_cb,
2351 .fl_change = nfsd_change_deleg_cb, 2385 .fl_change = nfsd_change_deleg_cb,
2352}; 2386};
2353 2387
@@ -2514,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
2514 if (!fp->fi_fds[oflag]) { 2548 if (!fp->fi_fds[oflag]) {
2515 status = nfsd_open(rqstp, cur_fh, S_IFREG, access, 2549 status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
2516 &fp->fi_fds[oflag]); 2550 &fp->fi_fds[oflag]);
2517 if (status == nfserr_dropit)
2518 status = nfserr_jukebox;
2519 if (status) 2551 if (status)
2520 return status; 2552 return status;
2521 } 2553 }
@@ -2596,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open)
2596 open->op_stateowner->so_client->cl_firststate = 1; 2628 open->op_stateowner->so_client->cl_firststate = 1;
2597} 2629}
2598 2630
2631/* Should we give out recallable state? */
2632static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
2633{
2634 if (clp->cl_cb_state == NFSD4_CB_UP)
2635 return true;
2636 /*
2637 * In the sessions case, since we don't have to establish a
2638 * separate connection for callbacks, we assume it's OK
2639 * until we hear otherwise:
2640 */
2641 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
2642}
2643
2599/* 2644/*
2600 * Attempt to hand out a delegation. 2645 * Attempt to hand out a delegation.
2601 */ 2646 */
@@ -2604,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2604{ 2649{
2605 struct nfs4_delegation *dp; 2650 struct nfs4_delegation *dp;
2606 struct nfs4_stateowner *sop = stp->st_stateowner; 2651 struct nfs4_stateowner *sop = stp->st_stateowner;
2607 int cb_up = atomic_read(&sop->so_client->cl_cb_set); 2652 int cb_up;
2608 struct file_lock *fl; 2653 struct file_lock *fl;
2609 int status, flag = 0; 2654 int status, flag = 0;
2610 2655
2656 cb_up = nfsd4_cb_channel_good(sop->so_client);
2611 flag = NFS4_OPEN_DELEGATE_NONE; 2657 flag = NFS4_OPEN_DELEGATE_NONE;
2612 open->op_recall = 0; 2658 open->op_recall = 0;
2613 switch (open->op_claim_type) { 2659 switch (open->op_claim_type) {
@@ -2655,7 +2701,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2655 dp->dl_flock = fl; 2701 dp->dl_flock = fl;
2656 2702
2657 /* vfs_setlease checks to see if delegation should be handed out. 2703 /* vfs_setlease checks to see if delegation should be handed out.
2658 * the lock_manager callbacks fl_mylease and fl_change are used 2704 * the lock_manager callback fl_change is used
2659 */ 2705 */
2660 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { 2706 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2661 dprintk("NFSD: setlease failed [%d], no delegation\n", status); 2707 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
@@ -2794,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2794 renew_client(clp); 2840 renew_client(clp);
2795 status = nfserr_cb_path_down; 2841 status = nfserr_cb_path_down;
2796 if (!list_empty(&clp->cl_delegations) 2842 if (!list_empty(&clp->cl_delegations)
2797 && !atomic_read(&clp->cl_cb_set)) 2843 && clp->cl_cb_state != NFSD4_CB_UP)
2798 goto out; 2844 goto out;
2799 status = nfs_ok; 2845 status = nfs_ok;
2800out: 2846out:
@@ -3081,9 +3127,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
3081 if (status) 3127 if (status)
3082 goto out; 3128 goto out;
3083 renew_client(dp->dl_client); 3129 renew_client(dp->dl_client);
3084 if (filpp) 3130 if (filpp) {
3085 *filpp = find_readable_file(dp->dl_file); 3131 *filpp = find_readable_file(dp->dl_file);
3086 BUG_ON(!*filpp); 3132 BUG_ON(!*filpp);
3133 }
3087 } else { /* open or lock stateid */ 3134 } else { /* open or lock stateid */
3088 stp = find_stateid(stateid, flags); 3135 stp = find_stateid(stateid, flags);
3089 if (!stp) 3136 if (!stp)
@@ -4107,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
4107 unsigned int strhashval = clientstr_hashval(name); 4154 unsigned int strhashval = clientstr_hashval(name);
4108 struct nfs4_client *clp; 4155 struct nfs4_client *clp;
4109 4156
4110 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); 4157 clp = find_confirmed_client_by_str(name, strhashval);
4111 return clp ? 1 : 0; 4158 return clp ? 1 : 0;
4112} 4159}
4113 4160
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f35a94a04026..956629b9cdc9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,13 +44,14 @@
44#include <linux/namei.h> 44#include <linux/namei.h>
45#include <linux/statfs.h> 45#include <linux/statfs.h>
46#include <linux/utsname.h> 46#include <linux/utsname.h>
47#include <linux/nfsd_idmap.h>
48#include <linux/nfs4_acl.h>
49#include <linux/sunrpc/svcauth_gss.h> 47#include <linux/sunrpc/svcauth_gss.h>
50 48
49#include "idmap.h"
50#include "acl.h"
51#include "xdr4.h" 51#include "xdr4.h"
52#include "vfs.h" 52#include "vfs.h"
53 53
54
54#define NFSDDBG_FACILITY NFSDDBG_XDR 55#define NFSDDBG_FACILITY NFSDDBG_XDR
55 56
56/* 57/*
@@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
288 len += XDR_QUADLEN(dummy32) << 2; 289 len += XDR_QUADLEN(dummy32) << 2;
289 READMEM(buf, dummy32); 290 READMEM(buf, dummy32);
290 ace->whotype = nfs4_acl_get_whotype(buf, dummy32); 291 ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
291 host_err = 0; 292 status = nfs_ok;
292 if (ace->whotype != NFS4_ACL_WHO_NAMED) 293 if (ace->whotype != NFS4_ACL_WHO_NAMED)
293 ace->who = 0; 294 ace->who = 0;
294 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) 295 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
295 host_err = nfsd_map_name_to_gid(argp->rqstp, 296 status = nfsd_map_name_to_gid(argp->rqstp,
296 buf, dummy32, &ace->who); 297 buf, dummy32, &ace->who);
297 else 298 else
298 host_err = nfsd_map_name_to_uid(argp->rqstp, 299 status = nfsd_map_name_to_uid(argp->rqstp,
299 buf, dummy32, &ace->who); 300 buf, dummy32, &ace->who);
300 if (host_err) 301 if (status)
301 goto out_nfserr; 302 return status;
302 } 303 }
303 } else 304 } else
304 *acl = NULL; 305 *acl = NULL;
@@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
420 DECODE_TAIL; 421 DECODE_TAIL;
421} 422}
422 423
424static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
425{
426 DECODE_HEAD;
427 u32 dummy;
428
429 READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
430 COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
431 READ32(bcts->dir);
432 /* XXX: Perhaps Tom Tucker could help us figure out how we
433 * should be using ctsa_use_conn_in_rdma_mode: */
434 READ32(dummy);
435
436 DECODE_TAIL;
437}
438
423static __be32 439static __be32
424nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) 440nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
425{ 441{
@@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
847} 863}
848 864
849static __be32 865static __be32
866nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
867 struct nfsd4_secinfo_no_name *sin)
868{
869 DECODE_HEAD;
870
871 READ_BUF(4);
872 READ32(sin->sin_style);
873 DECODE_TAIL;
874}
875
876static __be32
850nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 877nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
851{ 878{
852 __be32 status; 879 __be32 status;
@@ -1005,7 +1032,7 @@ static __be32
1005nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, 1032nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1006 struct nfsd4_exchange_id *exid) 1033 struct nfsd4_exchange_id *exid)
1007{ 1034{
1008 int dummy; 1035 int dummy, tmp;
1009 DECODE_HEAD; 1036 DECODE_HEAD;
1010 1037
1011 READ_BUF(NFS4_VERIFIER_SIZE); 1038 READ_BUF(NFS4_VERIFIER_SIZE);
@@ -1053,15 +1080,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1053 1080
1054 /* ssp_hash_algs<> */ 1081 /* ssp_hash_algs<> */
1055 READ_BUF(4); 1082 READ_BUF(4);
1056 READ32(dummy); 1083 READ32(tmp);
1057 READ_BUF(dummy); 1084 while (tmp--) {
1058 p += XDR_QUADLEN(dummy); 1085 READ_BUF(4);
1086 READ32(dummy);
1087 READ_BUF(dummy);
1088 p += XDR_QUADLEN(dummy);
1089 }
1059 1090
1060 /* ssp_encr_algs<> */ 1091 /* ssp_encr_algs<> */
1061 READ_BUF(4); 1092 READ_BUF(4);
1062 READ32(dummy); 1093 READ32(tmp);
1063 READ_BUF(dummy); 1094 while (tmp--) {
1064 p += XDR_QUADLEN(dummy); 1095 READ_BUF(4);
1096 READ32(dummy);
1097 READ_BUF(dummy);
1098 p += XDR_QUADLEN(dummy);
1099 }
1065 1100
1066 /* ssp_window and ssp_num_gss_handles */ 1101 /* ssp_window and ssp_num_gss_handles */
1067 READ_BUF(8); 1102 READ_BUF(8);
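
The rework above stops treating each of ssp_hash_algs<> and ssp_encr_algs<> as a single opaque blob and instead walks the array: read the element count, then for every element read its length and skip the 4-byte-padded payload. A hedged user-space sketch of that XDR pattern over a flat buffer (the function, its parameters, and the error handling are illustrative, not nfsd code):

/* Illustrative sketch, not part of the patch: skip one XDR array of
 * opaque<> items -- a 4-byte count, then per element a 4-byte length
 * followed by the payload rounded up to a 4-byte boundary. */
#include <arpa/inet.h>	/* ntohl */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static int xdr_skip_opaque_array(const uint8_t *buf, size_t len, size_t *pos)
{
	uint32_t count, olen;

	if (*pos > len || len - *pos < 4)
		return -1;
	memcpy(&count, buf + *pos, 4);
	count = ntohl(count);
	*pos += 4;
	while (count--) {
		if (len - *pos < 4)
			return -1;	/* analogue of READ_BUF failing */
		memcpy(&olen, buf + *pos, 4);
		olen = ntohl(olen);
		*pos += 4;
		if (olen > len - *pos)
			return -1;
		/* payload plus XDR padding to a 4-byte boundary */
		*pos += olen + ((4 - (olen & 3)) & 3);
		if (*pos > len)
			return -1;
	}
	return 0;
}
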
@@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1339 1374
1340 /* new operations for NFSv4.1 */ 1375 /* new operations for NFSv4.1 */
1341 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, 1376 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
1342 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp, 1377 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
1343 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, 1378 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1344 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, 1379 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
1345 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1380 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
@@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1350 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1385 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1351 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1386 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1387 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1353 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, 1388 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1354 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1389 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1355 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1390 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
1356 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, 1391 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2309,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2309 case nfserr_resource: 2344 case nfserr_resource:
2310 nfserr = nfserr_toosmall; 2345 nfserr = nfserr_toosmall;
2311 goto fail; 2346 goto fail;
2312 case nfserr_dropit:
2313 goto fail;
2314 case nfserr_noent: 2347 case nfserr_noent:
2315 goto skip_entry; 2348 goto skip_entry;
2316 default: 2349 default:
@@ -2365,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2365 return nfserr; 2398 return nfserr;
2366} 2399}
2367 2400
2401static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
2402{
2403 __be32 *p;
2404
2405 if (!nfserr) {
2406 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
2407 WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
2408 WRITE32(bcts->dir);
2409 /* XXX: ? */
2410 WRITE32(0);
2411 ADJUST_ARGS();
2412 }
2413 return nfserr;
2414}
2415
2368static __be32 2416static __be32
2369nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) 2417nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
2370{ 2418{
@@ -2826,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
2826} 2874}
2827 2875
2828static __be32 2876static __be32
2829nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, 2877nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
2830 struct nfsd4_secinfo *secinfo) 2878 __be32 nfserr, struct svc_export *exp)
2831{ 2879{
2832 int i = 0; 2880 int i = 0;
2833 struct svc_export *exp = secinfo->si_exp;
2834 u32 nflavs; 2881 u32 nflavs;
2835 struct exp_flavor_info *flavs; 2882 struct exp_flavor_info *flavs;
2836 struct exp_flavor_info def_flavs[2]; 2883 struct exp_flavor_info def_flavs[2];
@@ -2892,6 +2939,20 @@ out:
2892 return nfserr; 2939 return nfserr;
2893} 2940}
2894 2941
2942static __be32
2943nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
2944 struct nfsd4_secinfo *secinfo)
2945{
2946 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp);
2947}
2948
2949static __be32
2950nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
2951 struct nfsd4_secinfo_no_name *secinfo)
2952{
2953 return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp);
2954}
2955
2895/* 2956/*
2896 * The SETATTR encode routine is special -- it always encodes a bitmap, 2957 * The SETATTR encode routine is special -- it always encodes a bitmap,
2897 * regardless of the error status. 2958 * regardless of the error status.
@@ -3076,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3076 WRITE32(seq->seqid); 3137 WRITE32(seq->seqid);
3077 WRITE32(seq->slotid); 3138 WRITE32(seq->slotid);
3078 WRITE32(seq->maxslots); 3139 WRITE32(seq->maxslots);
3079 /* 3140 /* For now: target_maxslots = maxslots */
3080 * FIXME: for now:
3081 * target_maxslots = maxslots
3082 * status_flags = 0
3083 */
3084 WRITE32(seq->maxslots); 3141 WRITE32(seq->maxslots);
3085 WRITE32(0); 3142 WRITE32(seq->status_flags);
3086 3143
3087 ADJUST_ARGS(); 3144 ADJUST_ARGS();
3088 resp->cstate.datap = p; /* DRC cache data pointer */ 3145 resp->cstate.datap = p; /* DRC cache data pointer */
@@ -3143,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3143 3200
3144 /* NFSv4.1 operations */ 3201 /* NFSv4.1 operations */
3145 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, 3202 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3146 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 3203 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
3147 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, 3204 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3148 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, 3205 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3149 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, 3206 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
@@ -3154,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3154 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 3211 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3155 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 3212 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3156 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 3213 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3157 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, 3214 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3158 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 3215 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3159 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 3216 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3160 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 3217 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4514ebbee4d6..33b3e2b06779 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -8,12 +8,12 @@
8#include <linux/namei.h> 8#include <linux/namei.h>
9#include <linux/ctype.h> 9#include <linux/ctype.h>
10 10
11#include <linux/nfsd_idmap.h>
12#include <linux/sunrpc/svcsock.h> 11#include <linux/sunrpc/svcsock.h>
13#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
16 15
16#include "idmap.h"
17#include "nfsd.h" 17#include "nfsd.h"
18#include "cache.h" 18#include "cache.h"
19 19
@@ -127,6 +127,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
127 127
128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 128static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
129{ 129{
130#ifdef CONFIG_NFSD_DEPRECATED
130 static int warned; 131 static int warned;
131 if (file->f_dentry->d_name.name[0] == '.' && !warned) { 132 if (file->f_dentry->d_name.name[0] == '.' && !warned) {
132 printk(KERN_INFO 133 printk(KERN_INFO
@@ -135,6 +136,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size
135 current->comm, file->f_dentry->d_name.name); 136 current->comm, file->f_dentry->d_name.name);
136 warned = 1; 137 warned = 1;
137 } 138 }
139#endif
138 if (! file->private_data) { 140 if (! file->private_data) {
139 /* An attempt to read a transaction file without writing 141 /* An attempt to read a transaction file without writing
140 * causes a 0-byte write so that the file can return 142 * causes a 0-byte write so that the file can return
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 6b641cf2c19a..7ecfa2420307 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void);
158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) 158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) 159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) 160#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
161#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER)
161#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) 162#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
162#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) 163#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
163#define nfserr_grace cpu_to_be32(NFSERR_GRACE) 164#define nfserr_grace cpu_to_be32(NFSERR_GRACE)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08e17264784b..e15dc45fc5ec 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -735,9 +735,9 @@ nfserrno (int errno)
735 { nfserr_stale, -ESTALE }, 735 { nfserr_stale, -ESTALE },
736 { nfserr_jukebox, -ETIMEDOUT }, 736 { nfserr_jukebox, -ETIMEDOUT },
737 { nfserr_jukebox, -ERESTARTSYS }, 737 { nfserr_jukebox, -ERESTARTSYS },
738 { nfserr_dropit, -EAGAIN }, 738 { nfserr_jukebox, -EAGAIN },
739 { nfserr_dropit, -ENOMEM }, 739 { nfserr_jukebox, -EWOULDBLOCK },
740 { nfserr_badname, -ESRCH }, 740 { nfserr_jukebox, -ENOMEM },
741 { nfserr_io, -ETXTBSY }, 741 { nfserr_io, -ETXTBSY },
742 { nfserr_notsupp, -EOPNOTSUPP }, 742 { nfserr_notsupp, -EOPNOTSUPP },
743 { nfserr_toosmall, -ETOOSMALL }, 743 { nfserr_toosmall, -ETOOSMALL },
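
These rows belong to a translation table that the enclosing nfserrno() helper scans to turn a host errno into a wire status; the change simply retires nfserr_dropit as a mapping target in favor of nfserr_jukebox. A rough kernel-idiom sketch of that kind of lookup, with the loop and fallback assumed rather than quoted from this hunk:

/* Illustrative sketch, not part of the patch: the shape of the lookup
 * that consumes the table rows above (fallback assumed). */
static __be32 nfserrno_sketch(int errno)
{
	static const struct {
		__be32 nfserr;
		int syserr;
	} tbl[] = {
		{ nfserr_jukebox, -ETIMEDOUT },
		{ nfserr_jukebox, -EAGAIN },	/* previously nfserr_dropit */
		{ nfserr_jukebox, -ENOMEM },	/* previously nfserr_dropit */
		{ nfserr_io, -ETXTBSY },
	};
	int i;

	for (i = 0; i < ARRAY_SIZE(tbl); i++)
		if (tbl[i].syserr == errno)
			return tbl[i].nfserr;
	return nfserr_io;	/* assumed default for unmapped errnos */
}
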
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2bae1d86f5f2..18743c4d8bca 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
608 /* Now call the procedure handler, and encode NFS status. */ 608 /* Now call the procedure handler, and encode NFS status. */
609 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 609 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
610 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 610 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
611 if (nfserr == nfserr_dropit) { 611 if (nfserr == nfserr_dropit || rqstp->rq_dropme) {
612 dprintk("nfsd: Dropping request; may be revisited later\n"); 612 dprintk("nfsd: Dropping request; may be revisited later\n");
613 nfsd_cache_update(rqstp, RC_NOCACHE, NULL); 613 nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
614 return 0; 614 return 0;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 39adc27b0685..3074656ba7bf 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,10 +68,12 @@ typedef struct {
68struct nfsd4_callback { 68struct nfsd4_callback {
69 void *cb_op; 69 void *cb_op;
70 struct nfs4_client *cb_clp; 70 struct nfs4_client *cb_clp;
71 struct list_head cb_per_client;
71 u32 cb_minorversion; 72 u32 cb_minorversion;
72 struct rpc_message cb_msg; 73 struct rpc_message cb_msg;
73 const struct rpc_call_ops *cb_ops; 74 const struct rpc_call_ops *cb_ops;
74 struct work_struct cb_work; 75 struct work_struct cb_work;
76 bool cb_done;
75}; 77};
76 78
77struct nfs4_delegation { 79struct nfs4_delegation {
@@ -81,6 +83,7 @@ struct nfs4_delegation {
81 atomic_t dl_count; /* ref count */ 83 atomic_t dl_count; /* ref count */
82 struct nfs4_client *dl_client; 84 struct nfs4_client *dl_client;
83 struct nfs4_file *dl_file; 85 struct nfs4_file *dl_file;
86 struct file *dl_vfs_file;
84 struct file_lock *dl_flock; 87 struct file_lock *dl_flock;
85 u32 dl_type; 88 u32 dl_type;
86 time_t dl_time; 89 time_t dl_time;
@@ -95,6 +98,7 @@ struct nfs4_delegation {
95struct nfs4_cb_conn { 98struct nfs4_cb_conn {
96 /* SETCLIENTID info */ 99 /* SETCLIENTID info */
97 struct sockaddr_storage cb_addr; 100 struct sockaddr_storage cb_addr;
101 struct sockaddr_storage cb_saddr;
98 size_t cb_addrlen; 102 size_t cb_addrlen;
99 u32 cb_prog; /* used only in 4.0 case; 103 u32 cb_prog; /* used only in 4.0 case;
100 per-session otherwise */ 104 per-session otherwise */
@@ -146,6 +150,11 @@ struct nfsd4_create_session {
146 u32 gid; 150 u32 gid;
147}; 151};
148 152
153struct nfsd4_bind_conn_to_session {
154 struct nfs4_sessionid sessionid;
155 u32 dir;
156};
157
149/* The single slot clientid cache structure */ 158/* The single slot clientid cache structure */
150struct nfsd4_clid_slot { 159struct nfsd4_clid_slot {
151 u32 sl_seqid; 160 u32 sl_seqid;
@@ -235,9 +244,13 @@ struct nfs4_client {
235 unsigned long cl_cb_flags; 244 unsigned long cl_cb_flags;
236 struct rpc_clnt *cl_cb_client; 245 struct rpc_clnt *cl_cb_client;
237 u32 cl_cb_ident; 246 u32 cl_cb_ident;
238 atomic_t cl_cb_set; 247#define NFSD4_CB_UP 0
248#define NFSD4_CB_UNKNOWN 1
249#define NFSD4_CB_DOWN 2
250 int cl_cb_state;
239 struct nfsd4_callback cl_cb_null; 251 struct nfsd4_callback cl_cb_null;
240 struct nfsd4_session *cl_cb_session; 252 struct nfsd4_session *cl_cb_session;
253 struct list_head cl_callbacks; /* list of in-progress callbacks */
241 254
242 /* for all client information that callback code might need: */ 255 /* for all client information that callback code might need: */
243 spinlock_t cl_lock; 256 spinlock_t cl_lock;
@@ -454,6 +467,7 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
454extern void nfs4_free_stateowner(struct kref *kref); 467extern void nfs4_free_stateowner(struct kref *kref);
455extern int set_callback_cred(void); 468extern int set_callback_cred(void);
456extern void nfsd4_probe_callback(struct nfs4_client *clp); 469extern void nfsd4_probe_callback(struct nfs4_client *clp);
470extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
457extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 471extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
458extern void nfsd4_do_callback_rpc(struct work_struct *); 472extern void nfsd4_do_callback_rpc(struct work_struct *);
459extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 473extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 230b79fbf005..641117f2188d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,4 +1,3 @@
1#define MSNFS /* HACK HACK */
2/* 1/*
3 * File operations used by nfsd. Some of these have been ripped from 2 * File operations used by nfsd. Some of these have been ripped from
4 * other parts of the kernel because they weren't exported, others 3 * other parts of the kernel because they weren't exported, others
@@ -35,8 +34,8 @@
35#endif /* CONFIG_NFSD_V3 */ 34#endif /* CONFIG_NFSD_V3 */
36 35
37#ifdef CONFIG_NFSD_V4 36#ifdef CONFIG_NFSD_V4
38#include <linux/nfs4_acl.h> 37#include "acl.h"
39#include <linux/nfsd_idmap.h> 38#include "idmap.h"
40#endif /* CONFIG_NFSD_V4 */ 39#endif /* CONFIG_NFSD_V4 */
41 40
42#include "nfsd.h" 41#include "nfsd.h"
@@ -88,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
88 .dentry = dget(dentry)}; 87 .dentry = dget(dentry)};
89 int err = 0; 88 int err = 0;
90 89
91 while (d_mountpoint(path.dentry) && follow_down(&path)) 90 err = follow_down(&path, false);
92 ; 91 if (err < 0)
92 goto out;
93 93
94 exp2 = rqst_exp_get_by_name(rqstp, &path); 94 exp2 = rqst_exp_get_by_name(rqstp, &path);
95 if (IS_ERR(exp2)) { 95 if (IS_ERR(exp2)) {
@@ -273,6 +273,13 @@ out:
273 return err; 273 return err;
274} 274}
275 275
276static int nfsd_break_lease(struct inode *inode)
277{
278 if (!S_ISREG(inode->i_mode))
279 return 0;
280 return break_lease(inode, O_WRONLY | O_NONBLOCK);
281}
282
276/* 283/*
277 * Commit metadata changes to stable storage. 284 * Commit metadata changes to stable storage.
278 */ 285 */
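
nfsd_break_lease() above is the server-side half of the lease protocol that user space drives with fcntl(F_SETLEASE). A minimal sketch of a lease holder, for readers who have not seen that API (the file name and signal handling are illustrative):

/* Illustrative sketch, not part of the patch. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_break(int sig) { (void)sig; }	/* SIGIO = lease broken */

int main(void)
{
	int fd;

	signal(SIGIO, on_break);
	fd = open("leased.dat", O_RDONLY);
	if (fd < 0 || fcntl(fd, F_SETLEASE, F_RDLCK) < 0) {
		perror("F_SETLEASE");
		return 1;
	}
	/* A conflicting open -- e.g. an nfsd path calling
	 * nfsd_break_lease() above -- delivers SIGIO; the holder then
	 * has /proc/sys/fs/lease-break-time seconds to give it up. */
	pause();
	fcntl(fd, F_SETLEASE, F_UNLCK);
	return 0;
}
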
@@ -375,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
375 goto out; 382 goto out;
376 } 383 }
377 384
378 /*
379 * If we are changing the size of the file, then
380 * we need to break all leases.
381 */
382 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
383 if (host_err == -EWOULDBLOCK)
384 host_err = -ETIMEDOUT;
385 if (host_err) /* ENOMEM or EWOULDBLOCK */
386 goto out_nfserr;
387
388 host_err = get_write_access(inode); 385 host_err = get_write_access(inode);
389 if (host_err) 386 if (host_err)
390 goto out_nfserr; 387 goto out_nfserr;
@@ -425,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
425 422
426 err = nfserr_notsync; 423 err = nfserr_notsync;
427 if (!check_guard || guardtime == inode->i_ctime.tv_sec) { 424 if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
425 host_err = nfsd_break_lease(inode);
426 if (host_err)
427 goto out_nfserr;
428 fh_lock(fhp); 428 fh_lock(fhp);
429
429 host_err = notify_change(dentry, iap); 430 host_err = notify_change(dentry, iap);
430 err = nfserrno(host_err); 431 err = nfserrno(host_err);
431 fh_unlock(fhp); 432 fh_unlock(fhp);
@@ -752,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
752 */ 753 */
753 if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) 754 if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
754 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); 755 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
755 if (host_err == -EWOULDBLOCK)
756 host_err = -ETIMEDOUT;
757 if (host_err) /* NOMEM or WOULDBLOCK */ 756 if (host_err) /* NOMEM or WOULDBLOCK */
758 goto out_nfserr; 757 goto out_nfserr;
759 758
@@ -874,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
874 return __splice_from_pipe(pipe, sd, nfsd_splice_actor); 873 return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
875} 874}
876 875
877static inline int svc_msnfs(struct svc_fh *ffhp)
878{
879#ifdef MSNFS
880 return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
881#else
882 return 0;
883#endif
884}
885
886static __be32 876static __be32
887nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 877nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
888 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 878 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -895,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
895 err = nfserr_perm; 885 err = nfserr_perm;
896 inode = file->f_path.dentry->d_inode; 886 inode = file->f_path.dentry->d_inode;
897 887
898 if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
899 goto out;
900
901 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 888 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
902 struct splice_desc sd = { 889 struct splice_desc sd = {
903 .len = 0, 890 .len = 0,
@@ -922,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
922 fsnotify_access(file); 909 fsnotify_access(file);
923 } else 910 } else
924 err = nfserrno(host_err); 911 err = nfserrno(host_err);
925out:
926 return err; 912 return err;
927} 913}
928 914
@@ -987,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
987 int stable = *stablep; 973 int stable = *stablep;
988 int use_wgather; 974 int use_wgather;
989 975
990#ifdef MSNFS
991 err = nfserr_perm;
992
993 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
994 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
995 goto out;
996#endif
997
998 dentry = file->f_path.dentry; 976 dentry = file->f_path.dentry;
999 inode = dentry->d_inode; 977 inode = dentry->d_inode;
1000 exp = fhp->fh_export; 978 exp = fhp->fh_export;
@@ -1045,7 +1023,6 @@ out_nfserr:
1045 err = 0; 1023 err = 0;
1046 else 1024 else
1047 err = nfserrno(host_err); 1025 err = nfserrno(host_err);
1048out:
1049 return err; 1026 return err;
1050} 1027}
1051 1028
@@ -1665,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1665 err = nfserrno(host_err); 1642 err = nfserrno(host_err);
1666 goto out_dput; 1643 goto out_dput;
1667 } 1644 }
1645 err = nfserr_noent;
1646 if (!dold->d_inode)
1647 goto out_drop_write;
1648 host_err = nfsd_break_lease(dold->d_inode);
1649 if (host_err)
1650 goto out_drop_write;
1668 host_err = vfs_link(dold, dirp, dnew); 1651 host_err = vfs_link(dold, dirp, dnew);
1669 if (!host_err) { 1652 if (!host_err) {
1670 err = nfserrno(commit_metadata(ffhp)); 1653 err = nfserrno(commit_metadata(ffhp));
@@ -1676,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1676 else 1659 else
1677 err = nfserrno(host_err); 1660 err = nfserrno(host_err);
1678 } 1661 }
1662out_drop_write:
1679 mnt_drop_write(tfhp->fh_export->ex_path.mnt); 1663 mnt_drop_write(tfhp->fh_export->ex_path.mnt);
1680out_dput: 1664out_dput:
1681 dput(dnew); 1665 dput(dnew);
@@ -1750,12 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1750 if (ndentry == trap) 1734 if (ndentry == trap)
1751 goto out_dput_new; 1735 goto out_dput_new;
1752 1736
1753 if (svc_msnfs(ffhp) &&
1754 ((odentry->d_count > 1) || (ndentry->d_count > 1))) {
1755 host_err = -EPERM;
1756 goto out_dput_new;
1757 }
1758
1759 host_err = -EXDEV; 1737 host_err = -EXDEV;
1760 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) 1738 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
1761 goto out_dput_new; 1739 goto out_dput_new;
@@ -1763,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1763 if (host_err) 1741 if (host_err)
1764 goto out_dput_new; 1742 goto out_dput_new;
1765 1743
1744 host_err = nfsd_break_lease(odentry->d_inode);
1745 if (host_err)
1746 goto out_drop_write;
1766 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1747 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1767 if (!host_err) { 1748 if (!host_err) {
1768 host_err = commit_metadata(tfhp); 1749 host_err = commit_metadata(tfhp);
1769 if (!host_err) 1750 if (!host_err)
1770 host_err = commit_metadata(ffhp); 1751 host_err = commit_metadata(ffhp);
1771 } 1752 }
1772 1753out_drop_write:
1773 mnt_drop_write(ffhp->fh_export->ex_path.mnt); 1754 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
1774
1775 out_dput_new: 1755 out_dput_new:
1776 dput(ndentry); 1756 dput(ndentry);
1777 out_dput_old: 1757 out_dput_old:
@@ -1834,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1834 if (host_err) 1814 if (host_err)
1835 goto out_nfserr; 1815 goto out_nfserr;
1836 1816
1837 if (type != S_IFDIR) { /* It's UNLINK */ 1817 host_err = nfsd_break_lease(rdentry->d_inode);
1838#ifdef MSNFS 1818 if (host_err)
1839 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1819 goto out_put;
1840 (rdentry->d_count > 1)) { 1820 if (type != S_IFDIR)
1841 host_err = -EPERM;
1842 } else
1843#endif
1844 host_err = vfs_unlink(dirp, rdentry); 1821 host_err = vfs_unlink(dirp, rdentry);
1845 } else { /* It's RMDIR */ 1822 else
1846 host_err = vfs_rmdir(dirp, rdentry); 1823 host_err = vfs_rmdir(dirp, rdentry);
1847 } 1824out_put:
1848
1849 dput(rdentry); 1825 dput(rdentry);
1850 1826
1851 if (!host_err) 1827 if (!host_err)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 60fce3dc5cb5..366401e1a536 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -311,6 +311,11 @@ struct nfsd4_secinfo {
311 struct svc_export *si_exp; /* response */ 311 struct svc_export *si_exp; /* response */
312}; 312};
313 313
314struct nfsd4_secinfo_no_name {
315 u32 sin_style; /* request */
316 struct svc_export *sin_exp; /* response */
317};
318
314struct nfsd4_setattr { 319struct nfsd4_setattr {
315 stateid_t sa_stateid; /* request */ 320 stateid_t sa_stateid; /* request */
316 u32 sa_bmval[3]; /* request */ 321 u32 sa_bmval[3]; /* request */
@@ -373,8 +378,8 @@ struct nfsd4_sequence {
373 u32 cachethis; /* request */ 378 u32 cachethis; /* request */
374#if 0 379#if 0
375 u32 target_maxslots; /* response */ 380 u32 target_maxslots; /* response */
376 u32 status_flags; /* response */
377#endif /* not yet */ 381#endif /* not yet */
382 u32 status_flags; /* response */
378}; 383};
379 384
380struct nfsd4_destroy_session { 385struct nfsd4_destroy_session {
@@ -422,6 +427,7 @@ struct nfsd4_op {
422 427
423 /* NFSv4.1 */ 428 /* NFSv4.1 */
424 struct nfsd4_exchange_id exchange_id; 429 struct nfsd4_exchange_id exchange_id;
430 struct nfsd4_bind_conn_to_session bind_conn_to_session;
425 struct nfsd4_create_session create_session; 431 struct nfsd4_create_session create_session;
426 struct nfsd4_destroy_session destroy_session; 432 struct nfsd4_destroy_session destroy_session;
427 struct nfsd4_sequence sequence; 433 struct nfsd4_sequence sequence;
@@ -518,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
518 struct nfsd4_sequence *seq); 524 struct nfsd4_sequence *seq);
519extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 525extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
520 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 526 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
527extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
521extern __be32 nfsd4_create_session(struct svc_rqst *, 528extern __be32 nfsd4_create_session(struct svc_rqst *,
522 struct nfsd4_compound_state *, 529 struct nfsd4_compound_state *,
523 struct nfsd4_create_session *); 530 struct nfsd4_create_session *);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0994f6a76c07..58fd707174e1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -704,7 +704,8 @@ skip_mount_setup:
704 sbp[0]->s_state = 704 sbp[0]->s_state =
705 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); 705 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
706 /* synchronize sbp[1] with sbp[0] */ 706 /* synchronize sbp[1] with sbp[0] */
707 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 707 if (sbp[1])
708 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
708 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 709 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
709} 710}
710 711
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b572b6727181..326e7475a22a 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. 2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2002 Richard Russon 5 * Copyright (c) 2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -2576,6 +2576,8 @@ mft_rec_already_initialized:
2576 flush_dcache_page(page); 2576 flush_dcache_page(page);
2577 SetPageUptodate(page); 2577 SetPageUptodate(page);
2578 if (base_ni) { 2578 if (base_ni) {
2579 MFT_RECORD *m_tmp;
2580
2579 /* 2581 /*
2580 * Setup the base mft record in the extent mft record. This 2582 * Setup the base mft record in the extent mft record. This
2581 * completes initialization of the allocated extent mft record 2583 * completes initialization of the allocated extent mft record
@@ -2588,11 +2590,11 @@ mft_rec_already_initialized:
2588 * attach it to the base inode @base_ni and map, pin, and lock 2590 * attach it to the base inode @base_ni and map, pin, and lock
2589 * its, i.e. the allocated, mft record. 2591 * its, i.e. the allocated, mft record.
2590 */ 2592 */
2591 m = map_extent_mft_record(base_ni, bit, &ni); 2593 m_tmp = map_extent_mft_record(base_ni, bit, &ni);
2592 if (IS_ERR(m)) { 2594 if (IS_ERR(m_tmp)) {
2593 ntfs_error(vol->sb, "Failed to map allocated extent " 2595 ntfs_error(vol->sb, "Failed to map allocated extent "
2594 "mft record 0x%llx.", (long long)bit); 2596 "mft record 0x%llx.", (long long)bit);
2595 err = PTR_ERR(m); 2597 err = PTR_ERR(m_tmp);
2596 /* Set the mft record itself not in use. */ 2598 /* Set the mft record itself not in use. */
2597 m->flags &= cpu_to_le16( 2599 m->flags &= cpu_to_le16(
2598 ~le16_to_cpu(MFT_RECORD_IN_USE)); 2600 ~le16_to_cpu(MFT_RECORD_IN_USE));
@@ -2603,6 +2605,7 @@ mft_rec_already_initialized:
2603 ntfs_unmap_page(page); 2605 ntfs_unmap_page(page);
2604 goto undo_mftbmp_alloc; 2606 goto undo_mftbmp_alloc;
2605 } 2607 }
2608 BUG_ON(m != m_tmp);
2606 /* 2609 /*
2607 * Make sure the allocated mft record is written out to disk. 2610 * Make sure the allocated mft record is written out to disk.
2608 * No need to set the inode dirty because the caller is going 2611 * No need to set the inode dirty because the caller is going
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index ab152c00cd3a..77a8de5f7119 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,6 @@
1config OCFS2_FS 1config OCFS2_FS
2 tristate "OCFS2 file system support" 2 tristate "OCFS2 file system support"
3 depends on NET && SYSFS 3 depends on NET && SYSFS && CONFIGFS_FS
4 select CONFIGFS_FS
5 select JBD2 4 select JBD2
6 select CRC32 5 select CRC32
7 select QUOTA 6 select QUOTA
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 63e3fca266e0..a6651956482e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1989,20 +1989,20 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1989 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1989 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1990} 1990}
1991 1991
1992static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1992static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
1993 loff_t len) 1993 loff_t len)
1994{ 1994{
1995 struct inode *inode = file->f_path.dentry->d_inode;
1995 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1996 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1996 struct ocfs2_space_resv sr; 1997 struct ocfs2_space_resv sr;
1997 int change_size = 1; 1998 int change_size = 1;
1998 int cmd = OCFS2_IOC_RESVSP64; 1999 int cmd = OCFS2_IOC_RESVSP64;
1999 2000
2001 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2002 return -EOPNOTSUPP;
2000 if (!ocfs2_writes_unwritten_extents(osb)) 2003 if (!ocfs2_writes_unwritten_extents(osb))
2001 return -EOPNOTSUPP; 2004 return -EOPNOTSUPP;
2002 2005
2003 if (S_ISDIR(inode->i_mode))
2004 return -ENODEV;
2005
2006 if (mode & FALLOC_FL_KEEP_SIZE) 2006 if (mode & FALLOC_FL_KEEP_SIZE)
2007 change_size = 0; 2007 change_size = 0;
2008 2008
@@ -2610,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = {
2610 .getxattr = generic_getxattr, 2610 .getxattr = generic_getxattr,
2611 .listxattr = ocfs2_listxattr, 2611 .listxattr = ocfs2_listxattr,
2612 .removexattr = generic_removexattr, 2612 .removexattr = generic_removexattr,
2613 .fallocate = ocfs2_fallocate,
2614 .fiemap = ocfs2_fiemap, 2613 .fiemap = ocfs2_fiemap,
2615}; 2614};
2616 2615
@@ -2642,6 +2641,7 @@ const struct file_operations ocfs2_fops = {
2642 .flock = ocfs2_flock, 2641 .flock = ocfs2_flock,
2643 .splice_read = ocfs2_file_splice_read, 2642 .splice_read = ocfs2_file_splice_read,
2644 .splice_write = ocfs2_file_splice_write, 2643 .splice_write = ocfs2_file_splice_write,
2644 .fallocate = ocfs2_fallocate,
2645}; 2645};
2646 2646
2647const struct file_operations ocfs2_dops = { 2647const struct file_operations ocfs2_dops = {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 06d1f749ca89..38f986d2447e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -993,8 +993,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
993} 993}
994 994
995/* Handle quota on quotactl */ 995/* Handle quota on quotactl */
996static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 996static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
997 char *path)
998{ 997{
999 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 998 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
1000 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 999 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -1013,7 +1012,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type)
1013} 1012}
1014 1013
1015static const struct quotactl_ops ocfs2_quotactl_ops = { 1014static const struct quotactl_ops ocfs2_quotactl_ops = {
1016 .quota_on = ocfs2_quota_on, 1015 .quota_on_meta = ocfs2_quota_on,
1017 .quota_off = ocfs2_quota_off, 1016 .quota_off = ocfs2_quota_off,
1018 .quota_sync = dquot_quota_sync, 1017 .quota_sync = dquot_quota_sync,
1019 .get_info = dquot_get_dqinfo, 1018 .get_info = dquot_get_dqinfo,
diff --git a/fs/open.c b/fs/open.c
index 5b6ef7e2859e..5a2c6ebc22b5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -255,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
255 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 255 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
256 return -EFBIG; 256 return -EFBIG;
257 257
258 if (!inode->i_op->fallocate) 258 if (!file->f_op->fallocate)
259 return -EOPNOTSUPP; 259 return -EOPNOTSUPP;
260 260
261 return inode->i_op->fallocate(inode, mode, offset, len); 261 return file->f_op->fallocate(file, mode, offset, len);
262} 262}
263 263
264SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) 264SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
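
With ->fallocate moved onto file_operations, any open file whose filesystem provides the hook can be preallocated through the plain syscall. A minimal user-space sketch, assuming glibc's fallocate(2) wrapper:

/* Illustrative sketch, not part of the patch. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("prealloc.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Reserve 1 MiB without growing i_size; the kernel now routes
	 * this through file->f_op->fallocate per the hunk above. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
		perror("fallocate");	/* EOPNOTSUPP if the fs lacks the hook */
	close(fd);
	return 0;
}
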
@@ -790,6 +790,8 @@ struct file *nameidata_to_filp(struct nameidata *nd)
790 790
791 /* Pick up the filp from the open intent */ 791 /* Pick up the filp from the open intent */
792 filp = nd->intent.open.file; 792 filp = nd->intent.open.file;
793 nd->intent.open.file = NULL;
794
793 /* Has the filesystem initialised the file for us? */ 795 /* Has the filesystem initialised the file for us? */
794 if (filp->f_path.dentry == NULL) { 796 if (filp->f_path.dentry == NULL) {
795 path_get(&nd->path); 797 path_get(&nd->path);
diff --git a/fs/pipe.c b/fs/pipe.c
index e2e95fb46a1e..da42f7db50de 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -441,7 +441,7 @@ redo:
441 break; 441 break;
442 } 442 }
443 if (do_wakeup) { 443 if (do_wakeup) {
444 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT); 444 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 445 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
446 } 446 }
447 pipe_wait(pipe); 447 pipe_wait(pipe);
@@ -450,7 +450,7 @@ redo:
450 450
451 /* Signal writers asynchronously that there is more room. */ 451 /* Signal writers asynchronously that there is more room. */
452 if (do_wakeup) { 452 if (do_wakeup) {
453 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT); 453 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 454 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
455 } 455 }
456 if (ret > 0) 456 if (ret > 0)
@@ -612,7 +612,7 @@ redo2:
612 break; 612 break;
613 } 613 }
614 if (do_wakeup) { 614 if (do_wakeup) {
615 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN); 615 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 616 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
617 do_wakeup = 0; 617 do_wakeup = 0;
618 } 618 }
@@ -623,7 +623,7 @@ redo2:
623out: 623out:
624 mutex_unlock(&inode->i_mutex); 624 mutex_unlock(&inode->i_mutex);
625 if (do_wakeup) { 625 if (do_wakeup) {
626 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN); 626 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 627 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
628 } 628 }
629 if (ret > 0) 629 if (ret > 0)
@@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw)
715 if (!pipe->readers && !pipe->writers) { 715 if (!pipe->readers && !pipe->writers) {
716 free_pipe_info(inode); 716 free_pipe_info(inode);
717 } else { 717 } else {
718 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT); 718 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 719 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 720 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
721 } 721 }
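
The extra bits matter because these are keyed wakeups: a waiter that registered interest in POLLRDNORM alone would never match a wakeup key carrying bare POLLIN. From user space the two are interchangeable on pipes, as in this hedged sketch:

/* Illustrative sketch, not part of the patch. */
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	struct pollfd pfd;

	if (pipe(fds) < 0 || write(fds[1], "x", 1) != 1)
		return 1;
	/* POLLRDNORM and POLLIN are equivalent for pipes; the patch
	 * above makes the kernel's keyed wakeups say so explicitly. */
	pfd.fd = fds[0];
	pfd.events = POLLRDNORM;
	if (poll(&pfd, 1, 1000) == 1 && (pfd.revents & POLLRDNORM))
		printf("pipe readable via POLLRDNORM\n");
	return 0;
}
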
@@ -1292,7 +1292,7 @@ static int __init init_pipe_fs(void)
1292static void __exit exit_pipe_fs(void) 1292static void __exit exit_pipe_fs(void)
1293{ 1293{
1294 unregister_filesystem(&pipe_fs_type); 1294 unregister_filesystem(&pipe_fs_type);
1295 mntput_long(pipe_mnt); 1295 mntput(pipe_mnt);
1296} 1296}
1297 1297
1298fs_initcall(init_pipe_fs); 1298fs_initcall(init_pipe_fs);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 39df95a0ec25..b1cf6bf4b41d 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
 
 #include <linux/errno.h>
 
+EXPORT_SYMBOL(posix_acl_init);
 EXPORT_SYMBOL(posix_acl_alloc);
 EXPORT_SYMBOL(posix_acl_clone);
 EXPORT_SYMBOL(posix_acl_valid);
@@ -32,6 +33,16 @@ EXPORT_SYMBOL(posix_acl_chmod_masq);
 EXPORT_SYMBOL(posix_acl_permission);
 
 /*
+ * Init a fresh posix_acl
+ */
+void
+posix_acl_init(struct posix_acl *acl, int count)
+{
+	atomic_set(&acl->a_refcount, 1);
+	acl->a_count = count;
+}
+
+/*
  * Allocate a new ACL with the specified number of entries.
  */
struct posix_acl *
@@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags)
 	const size_t size = sizeof(struct posix_acl) +
 	                    count * sizeof(struct posix_acl_entry);
 	struct posix_acl *acl = kmalloc(size, flags);
-	if (acl) {
-		atomic_set(&acl->a_refcount, 1);
-		acl->a_count = count;
-	}
+	if (acl)
+		posix_acl_init(acl, count);
 	return acl;
}
 
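posix_acl_init() factors the refcount/entry-count setup out of
posix_acl_alloc() and exports it, so a filesystem that obtains the ACL memory
by some other means can still initialise it consistently. A hedged kernel-side
sketch of such a caller (the helper name my_alloc_acl and the GFP flag are
illustrative, not from the patch):

static struct posix_acl *my_alloc_acl(int count)
{
	struct posix_acl *acl;

	/* same layout posix_acl_alloc() uses: header plus "count" entries */
	acl = kmalloc(sizeof(*acl) + count * sizeof(struct posix_acl_entry),
		      GFP_NOFS);
	if (acl)
		posix_acl_init(acl, count);	/* a_refcount = 1, a_count = count */
	return acl;
}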
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 6a0068841d96..15af6222f8a4 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,5 +1,5 @@
config PROC_FS
-	bool "/proc file system support" if EMBEDDED
+	bool "/proc file system support" if EXPERT
 	default y
 	help
 	  This is a virtual file system providing information about the status
@@ -40,7 +40,7 @@ config PROC_VMCORE
 	  Exports the dump image of crashed kernel in ELF format.
 
config PROC_SYSCTL
-	bool "Sysctl support (/proc/sys)" if EMBEDDED
+	bool "Sysctl support (/proc/sys)" if EXPERT
 	depends on PROC_FS
 	select SYSCTL
 	default y
@@ -61,7 +61,7 @@ config PROC_SYSCTL
config PROC_PAGE_MONITOR
 	default y
 	depends on PROC_FS && MMU
-	bool "Enable /proc page monitoring" if EMBEDDED
+	bool "Enable /proc page monitoring" if EXPERT
 	help
 	  Various /proc files exist to monitor process memory utilization:
 	  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93f1cdd5d3d7..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		goto err_task_lock;
 	}
 
-	if (oom_score_adj < task->signal->oom_score_adj &&
+	if (oom_score_adj < task->signal->oom_score_adj_min &&
 			!capable(CAP_SYS_RESOURCE)) {
 		err = -EACCES;
 		goto err_sighand;
@@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 			atomic_dec(&task->mm->oom_disable_count);
 	}
 	task->signal->oom_score_adj = oom_score_adj;
+	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
+		task->signal->oom_score_adj_min = oom_score_adj;
 	/*
 	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
 	 * always attainable.
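With the hunk above, lowering oom_score_adj below the recorded
oom_score_adj_min requires CAP_SYS_RESOURCE, and a privileged writer also
drags oom_score_adj_min down with it. An illustrative userspace writer (not
part of the patch); EACCES on the write is how the new check surfaces:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static int set_oom_score_adj(pid_t pid, int adj)
{
	char path[64], val[16];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", (int)pid);
	snprintf(val, sizeof(val), "%d", adj);	/* valid range: -1000 .. 1000 */

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0)
		ret = -1;	/* EACCES: below oom_score_adj_min, no CAP_SYS_RESOURCE */
	close(fd);
	return ret;
}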
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index eafc22ab1fdd..b701eaa482bf 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -67,7 +67,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
 	struct console *con;
 	loff_t off = 0;
 
-	acquire_console_sem();
+	console_lock();
 	for_each_console(con)
 		if (off++ == *pos)
 			break;
@@ -84,7 +84,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 
static void c_stop(struct seq_file *m, void *v)
{
-	release_console_sem();
+	console_unlock();
}
 
static const struct seq_operations consoles_op = {
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97e..ed257d141568 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_MEMORY_FAILURE
 		"HardwareCorrupted: %5lu kB\n"
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		"AnonHugePages:  %8lu kB\n"
+#endif
 		,
 		K(i.totalram),
 		K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeswap),
 		K(global_page_state(NR_FILE_DIRTY)),
 		K(global_page_state(NR_WRITEBACK)),
-		K(global_page_state(NR_ANON_PAGES)),
+		K(global_page_state(NR_ANON_PAGES)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		  + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+		  HPAGE_PMD_NR
+#endif
+		  ),
 		K(global_page_state(NR_FILE_MAPPED)),
 		K(global_page_state(NR_SHMEM)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_MEMORY_FAILURE
 		,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+		   HPAGE_PMD_NR)
+#endif
 		);
 
 	hugetlb_report_meminfo(m);
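The new AnonHugePages field reports NR_ANON_TRANSPARENT_HUGEPAGES scaled by
HPAGE_PMD_NR, and the same quantity is folded into AnonPages so THP-backed
memory stays part of the anonymous total. A small illustrative reader (not
part of the patch; parsing details are an assumption):

#include <stdio.h>

int main(void)
{
	char line[128];
	unsigned long kb;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* only present when CONFIG_TRANSPARENT_HUGEPAGE is set */
		if (sscanf(line, "AnonHugePages: %lu kB", &kb) == 1)
			printf("THP-backed anon memory: %lu kB\n", kb);
	}
	fclose(f);
	return 0;
}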
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b06c674624e6..6d8e6a9e93ab 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
 	if (PageHuge(page))
 		u |= 1 << KPF_HUGE;
 
-	u |= kpf_copy_bit(k, KPF_LOCKED,	PG_locked);
-
 	/*
-	 * Caveats on high order pages:
-	 * PG_buddy will only be set on the head page; SLUB/SLQB do the same
-	 * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+	 * Caveats on high order pages: page->_count will only be set
+	 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
+	 * SLOB won't set PG_slab at all on compound pages.
 	 */
+	if (PageBuddy(page))
+		u |= 1 << KPF_BUDDY;
+
+	u |= kpf_copy_bit(k, KPF_LOCKED,	PG_locked);
+
 	u |= kpf_copy_bit(k, KPF_SLAB,		PG_slab);
-	u |= kpf_copy_bit(k, KPF_BUDDY,		PG_buddy);
 
 	u |= kpf_copy_bit(k, KPF_ERROR,		PG_error);
 	u |= kpf_copy_bit(k, KPF_DIRTY,		PG_dirty);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c3755bd8dd3e..60b914860f81 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -418,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
 		   "Anonymous:      %8lu kB\n"
 		   "Swap:           %8lu kB\n"
 		   "KernelPageSize: %8lu kB\n"
-		   "MMUPageSize:    %8lu kB\n",
+		   "MMUPageSize:    %8lu kB\n"
+		   "Locked:         %8lu kB\n",
 		   (vma->vm_end - vma->vm_start) >> 10,
 		   mss.resident >> 10,
 		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -430,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.anonymous >> 10,
 		   mss.swap >> 10,
 		   vma_kernel_pagesize(vma) >> 10,
-		   vma_mmu_pagesize(vma) >> 10);
+		   vma_mmu_pagesize(vma) >> 10,
+		   (vma->vm_flags & VM_LOCKED) ?
+			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 84becd3e4772..a2a622e079f0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2189,8 +2189,8 @@ int dquot_resume(struct super_block *sb, int type)
}
EXPORT_SYMBOL(dquot_resume);
 
-int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
+int dquot_quota_on(struct super_block *sb, int type, int format_id,
 		struct path *path)
{
 	int error = security_quota_on(path->dentry);
 	if (error)
@@ -2204,20 +2204,6 @@ int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
 			DQUOT_LIMITS_ENABLED);
 	return error;
}
-EXPORT_SYMBOL(dquot_quota_on_path);
-
-int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
-{
-	struct path path;
-	int error;
-
-	error = kern_path(name, LOOKUP_FOLLOW, &path);
-	if (!error) {
-		error = dquot_quota_on_path(sb, type, format_id, &path);
-		path_put(&path);
-	}
-	return error;
-}
EXPORT_SYMBOL(dquot_quota_on);
 
/*
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b299961e1edb..b34bdb25490c 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -64,18 +64,15 @@ static int quota_sync_all(int type)
}
 
static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
-		         void __user *addr)
+		         struct path *path)
{
-	char *pathname;
-	int ret = -ENOSYS;
-
-	pathname = getname(addr);
-	if (IS_ERR(pathname))
-		return PTR_ERR(pathname);
-	if (sb->s_qcop->quota_on)
-		ret = sb->s_qcop->quota_on(sb, type, id, pathname);
-	putname(pathname);
-	return ret;
+	if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
+		return -ENOSYS;
+	if (sb->s_qcop->quota_on_meta)
+		return sb->s_qcop->quota_on_meta(sb, type, id);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	return sb->s_qcop->quota_on(sb, type, id, path);
}
 
static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
@@ -241,7 +238,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
 
/* Copy parameters and call proper function */
static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
-		       void __user *addr)
+		       void __user *addr, struct path *path)
{
 	int ret;
 
@@ -256,7 +253,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 
 	switch (cmd) {
 	case Q_QUOTAON:
-		return quota_quotaon(sb, type, cmd, id, addr);
+		return quota_quotaon(sb, type, cmd, id, path);
 	case Q_QUOTAOFF:
 		if (!sb->s_qcop->quota_off)
 			return -ENOSYS;
@@ -335,6 +332,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
{
 	uint cmds, type;
 	struct super_block *sb = NULL;
+	struct path path, *pathp = NULL;
 	int ret;
 
 	cmds = cmd >> SUBCMDSHIFT;
@@ -351,12 +349,27 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 		return -ENODEV;
 	}
 
+	/*
+	 * Path for quotaon has to be resolved before grabbing superblock
+	 * because that gets s_umount sem which is also possibly needed by path
+	 * resolution (think about autofs) and thus deadlocks could arise.
+	 */
+	if (cmds == Q_QUOTAON) {
+		ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path);
+		if (ret)
+			pathp = ERR_PTR(ret);
+		else
+			pathp = &path;
+	}
+
 	sb = quotactl_block(special);
 	if (IS_ERR(sb))
 		return PTR_ERR(sb);
 
-	ret = do_quotactl(sb, type, cmds, id, addr);
+	ret = do_quotactl(sb, type, cmds, id, addr, pathp);
 
 	drop_super(sb);
+	if (pathp && !IS_ERR(pathp))
+		path_put(pathp);
 	return ret;
}
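The comment in the hunk explains the deadlock: path resolution may need
s_umount (autofs), which quotactl_block() also takes, so the quota-file path
is now resolved first. The userspace side is unchanged; for reference, a
hedged sketch of the Q_QUOTAON call this services (device, mount point and
quota-file paths are examples only):

#include <sys/types.h>
#include <sys/quota.h>
#include <stdio.h>

int main(void)
{
	/* QFMT_VFS_V0 is the common vfsv0 quota format */
	if (quotactl(QCMD(Q_QUOTAON, USRQUOTA), "/dev/sda1",
		     QFMT_VFS_V0, (caddr_t)"/mnt/aquota.user") != 0) {
		perror("quotactl(Q_QUOTAON)");
		return 1;
	}
	return 0;
}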
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2575682a9ead..0aab04f46827 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -632,7 +632,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
static int reiserfs_release_dquot(struct dquot *);
static int reiserfs_mark_dquot_dirty(struct dquot *);
static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *);
+static int reiserfs_quota_on(struct super_block *, int, int, struct path *);
 
static const struct dquot_operations reiserfs_quota_operations = {
 	.write_dquot = reiserfs_write_dquot,
@@ -2048,25 +2048,21 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
 * Standard function to be called on quota_on
 */
static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *name)
+			     struct path *path)
{
 	int err;
-	struct path path;
 	struct inode *inode;
 	struct reiserfs_transaction_handle th;
 
 	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
 		return -EINVAL;
 
-	err = kern_path(name, LOOKUP_FOLLOW, &path);
-	if (err)
-		return err;
 	/* Quotafile not on the same filesystem? */
-	if (path.mnt->mnt_sb != sb) {
+	if (path->mnt->mnt_sb != sb) {
 		err = -EXDEV;
 		goto out;
 	}
-	inode = path.dentry->d_inode;
+	inode = path->dentry->d_inode;
 	/* We must not pack tails for quota files on reiserfs for quota IO to work */
 	if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
 		err = reiserfs_unpack(inode, NULL);
@@ -2082,7 +2078,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 	/* Journaling quota? */
 	if (REISERFS_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not of fs root? */
-		if (path.dentry->d_parent != sb->s_root)
+		if (path->dentry->d_parent != sb->s_root)
 			reiserfs_warning(sb, "super-6521",
 				 "Quota file not on filesystem root. "
 				 "Journalled quota will not work.");
@@ -2101,9 +2097,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = dquot_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on(sb, type, format_id, path);
out:
-	path_put(&path);
 	return err;
}
 
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index e5f63da64d04..aa68a8a31518 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -29,7 +29,6 @@ config SQUASHFS
config SQUASHFS_XATTR
 	bool "Squashfs XATTR support"
 	depends on SQUASHFS
-	default n
 	help
 	  Saying Y here includes support for extended attributes (xattrs).
 	  Xattrs are name:value pairs associated with inodes by
@@ -40,7 +39,6 @@ config SQUASHFS_XATTR
config SQUASHFS_LZO
 	bool "Include support for LZO compressed file systems"
 	depends on SQUASHFS
-	default n
 	select LZO_DECOMPRESS
 	help
 	  Saying Y here includes support for reading Squashfs file systems
@@ -53,10 +51,24 @@ config SQUASHFS_LZO
 
 	  If unsure, say N.
 
+config SQUASHFS_XZ
+	bool "Include support for XZ compressed file systems"
+	depends on SQUASHFS
+	select XZ_DEC
+	help
+	  Saying Y here includes support for reading Squashfs file systems
+	  compressed with XZ compresssion. XZ gives better compression than
+	  the default zlib compression, at the expense of greater CPU and
+	  memory overhead.
+
+	  XZ is not the standard compression used in Squashfs and so most
+	  file systems will be readable without selecting this option.
+
+	  If unsure, say N.
+
config SQUASHFS_EMBEDDED
 	bool "Additional option for memory-constrained systems"
 	depends on SQUASHFS
-	default n
 	help
 	  Saying Y here allows you to specify cache size.
 
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7672bac8d328..cecf2bea07af 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
+squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 653c030eb840..8ab48bc2fa7d 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,7 +34,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
 
@@ -64,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb,
 		*length = (unsigned char) bh->b_data[*offset] |
 			(unsigned char) bh->b_data[*offset + 1] << 8;
 		*offset += 2;
+
+		if (*offset == msblk->devblksize) {
+			put_bh(bh);
+			bh = sb_bread(sb, ++(*cur_index));
+			if (bh == NULL)
+				return NULL;
+			*offset = 0;
+		}
 	}
 
 	return bh;
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 57314bee9059..26b15ae34d6f 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -55,7 +55,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 
 /*
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 24af9ce9722f..a5940e54c4dd 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -27,7 +27,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "decompressor.h"
 #include "squashfs.h"
 
@@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
};
 
#ifndef CONFIG_SQUASHFS_LZO
-static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
+static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
 	NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
};
#endif
 
+#ifndef CONFIG_SQUASHFS_XZ
+static const struct squashfs_decompressor squashfs_xz_comp_ops = {
+	NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
+};
+#endif
+
static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
 	NULL, NULL, NULL, 0, "unknown", 0
};
 
static const struct squashfs_decompressor *decompressor[] = {
 	&squashfs_zlib_comp_ops,
-	&squashfs_lzma_unsupported_comp_ops,
-#ifdef CONFIG_SQUASHFS_LZO
 	&squashfs_lzo_comp_ops,
-#else
-	&squashfs_lzo_unsupported_comp_ops,
-#endif
+	&squashfs_xz_comp_ops,
+	&squashfs_lzma_unsupported_comp_ops,
 	&squashfs_unknown_comp_ops
};
 
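Giving the stub entries the same names as the real ones (under #ifndef) lets
the table above always contain one entry per compression id, built or not, so
selection stays a flat scan keyed on the id stored in the superblock and an
unsupported format still produces a useful name in error messages. A hedged
sketch of such a lookup (the real helper lives elsewhere in this file; this
is an illustration, not a quote of it):

static const struct squashfs_decompressor *my_lookup_decompressor(int id)
{
	int i;

	/* the "unknown" sentinel has id 0 and terminates the scan */
	for (i = 0; decompressor[i]->id; i++)
		if (id == decompressor[i]->id)
			break;

	return decompressor[i];
}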
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 7425f80783f6..3b305a70f7aa 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
 	return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
 		length, srclength, pages);
}
+
+#ifdef CONFIG_SQUASHFS_XZ
+extern const struct squashfs_decompressor squashfs_xz_comp_ops;
+#endif
+
+#ifdef CONFIG_SQUASHFS_LZO
+extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
+#endif
+
#endif
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7c90bbd6879d..7eef571443c6 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -39,7 +39,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 
 /*
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index b7f64bcd2b70..d8f32452638e 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -37,7 +37,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 
 /*
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 5d87789bf1c1..7da759e34c52 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -29,7 +29,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
 
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 5d45569d5f72..ba729d808876 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -27,11 +27,6 @@
 
#define WARNING(s, args...)	pr_warning("SQUASHFS: "s, ## args)
 
-static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
-{
-	return list_entry(inode, struct squashfs_inode_info, vfs_inode);
-}
-
/* block.c */
extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
 				int, int);
@@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
 
/* zlib_wrapper.c */
extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
-
-/* lzo_wrapper.c */
-extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index c5137fc9ab11..39533feffd6d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -238,6 +238,7 @@ struct meta_index {
#define ZLIB_COMPRESSION	1
#define LZMA_COMPRESSION	2
#define LZO_COMPRESSION		3
+#define XZ_COMPRESSION		4
 
struct squashfs_super_block {
 	__le32			s_magic;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index d3e3a37f28a1..359baefc01fc 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -45,4 +45,10 @@ struct squashfs_inode_info {
 	};
 	struct inode	vfs_inode;
};
+
+
+static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
+{
+	return list_entry(inode, struct squashfs_inode_info, vfs_inode);
+}
#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index d33be5dd6c32..05385dbe1465 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -32,7 +32,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "xattr.h"
 
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
new file mode 100644
index 000000000000..c4eb40018256
--- /dev/null
+++ b/fs/squashfs/xz_wrapper.c
@@ -0,0 +1,147 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xz_wrapper.c
+ */
+
+
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/xz.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "decompressor.h"
+
+struct squashfs_xz {
+	struct xz_dec *state;
+	struct xz_buf buf;
+};
+
+static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
+{
+	int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+
+	struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL);
+	if (stream == NULL)
+		goto failed;
+
+	stream->state = xz_dec_init(XZ_PREALLOC, block_size);
+	if (stream->state == NULL)
+		goto failed;
+
+	return stream;
+
+failed:
+	ERROR("Failed to allocate xz workspace\n");
+	kfree(stream);
+	return NULL;
+}
+
+
+static void squashfs_xz_free(void *strm)
+{
+	struct squashfs_xz *stream = strm;
+
+	if (stream) {
+		xz_dec_end(stream->state);
+		kfree(stream);
+	}
+}
+
+
+static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
+	struct buffer_head **bh, int b, int offset, int length, int srclength,
+	int pages)
+{
+	enum xz_ret xz_err;
+	int avail, total = 0, k = 0, page = 0;
+	struct squashfs_xz *stream = msblk->stream;
+
+	mutex_lock(&msblk->read_data_mutex);
+
+	xz_dec_reset(stream->state);
+	stream->buf.in_pos = 0;
+	stream->buf.in_size = 0;
+	stream->buf.out_pos = 0;
+	stream->buf.out_size = PAGE_CACHE_SIZE;
+	stream->buf.out = buffer[page++];
+
+	do {
+		if (stream->buf.in_pos == stream->buf.in_size && k < b) {
+			avail = min(length, msblk->devblksize - offset);
+			length -= avail;
+			wait_on_buffer(bh[k]);
+			if (!buffer_uptodate(bh[k]))
+				goto release_mutex;
+
+			stream->buf.in = bh[k]->b_data + offset;
+			stream->buf.in_size = avail;
+			stream->buf.in_pos = 0;
+			offset = 0;
+		}
+
+		if (stream->buf.out_pos == stream->buf.out_size
+							&& page < pages) {
+			stream->buf.out = buffer[page++];
+			stream->buf.out_pos = 0;
+			total += PAGE_CACHE_SIZE;
+		}
+
+		xz_err = xz_dec_run(stream->state, &stream->buf);
+
+		if (stream->buf.in_pos == stream->buf.in_size && k < b)
+			put_bh(bh[k++]);
+	} while (xz_err == XZ_OK);
+
+	if (xz_err != XZ_STREAM_END) {
+		ERROR("xz_dec_run error, data probably corrupt\n");
+		goto release_mutex;
+	}
+
+	if (k < b) {
+		ERROR("xz_uncompress error, input remaining\n");
+		goto release_mutex;
+	}
+
+	total += stream->buf.out_pos;
+	mutex_unlock(&msblk->read_data_mutex);
+	return total;
+
+release_mutex:
+	mutex_unlock(&msblk->read_data_mutex);
+
+	for (; k < b; k++)
+		put_bh(bh[k]);
+
+	return -EIO;
+}
+
+const struct squashfs_decompressor squashfs_xz_comp_ops = {
+	.init = squashfs_xz_init,
+	.free = squashfs_xz_free,
+	.decompress = squashfs_xz_uncompress,
+	.id = XZ_COMPRESSION,
+	.name = "xz",
+	.supported = 1
+};
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 7a603874e483..4661ae2b1cec 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -29,7 +29,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
 
@@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 	struct buffer_head **bh, int b, int offset, int length, int srclength,
 	int pages)
{
-	int zlib_err = 0, zlib_init = 0;
-	int avail, bytes, k = 0, page = 0;
+	int zlib_err, zlib_init = 0;
+	int k = 0, page = 0;
 	z_stream *stream = msblk->stream;
 
 	mutex_lock(&msblk->read_data_mutex);
@@ -75,21 +74,14 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 	stream->avail_out = 0;
 	stream->avail_in = 0;
 
-	bytes = length;
 	do {
 		if (stream->avail_in == 0 && k < b) {
-			avail = min(bytes, msblk->devblksize - offset);
-			bytes -= avail;
+			int avail = min(length, msblk->devblksize - offset);
+			length -= avail;
 			wait_on_buffer(bh[k]);
 			if (!buffer_uptodate(bh[k]))
 				goto release_mutex;
 
-			if (avail == 0) {
-				offset = 0;
-				put_bh(bh[k++]);
-				continue;
-			}
-
 			stream->next_in = bh[k]->b_data + offset;
 			stream->avail_in = avail;
 			offset = 0;
@@ -128,6 +120,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 		goto release_mutex;
 	}
 
+	if (k < b) {
+		ERROR("zlib_uncompress error, data remaining\n");
+		goto release_mutex;
+	}
+
 	length = stream->total_out;
 	mutex_unlock(&msblk->read_data_mutex);
 	return length;
diff --git a/fs/stat.c b/fs/stat.c
index 12e90e213900..d5c61cf2b703 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
 	int error = -EINVAL;
 	int lookup_flags = 0;
 
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
 		goto out;
 
 	if (!(flag & AT_SYMLINK_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
+	if (flag & AT_NO_AUTOMOUNT)
+		lookup_flags |= LOOKUP_NO_AUTOMOUNT;
 
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
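AT_NO_AUTOMOUNT lets a caller stat an automount point without triggering the
automount, which is what tools like df want when walking mount tables. A
minimal userspace use (illustrative; glibc exposes the flag via fcntl.h with
_GNU_SOURCE, and it only has effect on kernels with this support):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/stat.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2)
		return 2;
	/* stat the mountpoint itself; do not trigger the automount */
	if (fstatat(AT_FDCWD, argv[1], &st, AT_NO_AUTOMOUNT) != 0) {
		perror("fstatat");
		return 1;
	}
	printf("ino=%llu mode=%o\n",
	       (unsigned long long)st.st_ino, (unsigned int)st.st_mode);
	return 0;
}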
diff --git a/fs/super.c b/fs/super.c
index 4f6a3571a634..7e9dd4cc2c01 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -177,6 +177,11 @@ void deactivate_locked_super(struct super_block *s)
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
 		fs->kill_sb(s);
+		/*
+		 * We need to call rcu_barrier so all the delayed rcu free
+		 * inodes are flushed before we release the fs module.
+		 */
+		rcu_barrier();
 		put_filesystem(fs);
 		put_super(s);
 	} else {
@@ -1141,7 +1146,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
 	return mnt;
 
 err:
-	mntput_long(mnt);
+	mntput(mnt);
 	return ERR_PTR(err);
}
 
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index f4b67588b9d6..8c41feacbac5 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,5 +1,5 @@
config SYSFS
-	bool "sysfs file system support" if EMBEDDED
+	bool "sysfs file system support" if EXPERT
 	default y
 	help
 	  The sysfs filesystem is a virtual filesystem that the kernel uses to
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..faca44997099 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
 				   kmem.o \
 				   xfs_aops.o \
 				   xfs_buf.o \
+				   xfs_discard.o \
 				   xfs_export.o \
 				   xfs_file.o \
 				   xfs_fs_subr.o \
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 92f1f2acc6ab..ac1c7e8378dd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -896,7 +896,6 @@ xfs_buf_rele(
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
 	if (!pag) {
-		ASSERT(!bp->b_relse);
 		ASSERT(list_empty(&bp->b_lru));
 		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold))
@@ -908,11 +907,7 @@ xfs_buf_rele(
 
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-		if (bp->b_relse) {
-			atomic_inc(&bp->b_hold);
-			spin_unlock(&pag->pag_buf_lock);
-			bp->b_relse(bp);
-		} else if (!(bp->b_flags & XBF_STALE) &&
+		if (!(bp->b_flags & XBF_STALE) &&
 			   atomic_read(&bp->b_lru_ref)) {
 			xfs_buf_lru_add(bp);
 			spin_unlock(&pag->pag_buf_lock);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a76c2428faff..cbe65950e524 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -152,8 +152,6 @@ typedef struct xfs_buftarg {
 
struct xfs_buf;
typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 
#define XB_PAGES	2
 
@@ -183,7 +181,6 @@ typedef struct xfs_buf {
 	void			*b_addr;	/* virtual address of buffer */
 	struct work_struct	b_iodone_work;
 	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
-	xfs_buf_relse_t		b_relse;	/* releasing function */
 	struct completion	b_iowait;	/* queue for I/O waiters */
 	void			*b_fspriv;
 	void			*b_fspriv2;
@@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_FSPRIVATE2(bp, type)	((type)(bp)->b_fspriv2)
#define XFS_BUF_SET_FSPRIVATE2(bp, val)	((bp)->b_fspriv2 = (void*)(val))
#define XFS_BUF_SET_START(bp)		do { } while (0)
-#define XFS_BUF_SET_BRELSE_FUNC(bp, func)	((bp)->b_relse = (func))
 
#define XFS_BUF_PTR(bp)			(xfs_caddr_t)((bp)->b_addr)
#define XFS_BUF_SET_PTR(bp, val, cnt)	xfs_buf_associate_memory(bp, val, cnt)
@@ -360,8 +356,7 @@ xfs_buf_set_ref(
 
static inline void xfs_buf_relse(xfs_buf_t *bp)
{
-	if (!bp->b_relse)
-		xfs_buf_unlock(bp);
+	xfs_buf_unlock(bp);
 	xfs_buf_rele(bp);
}
 
367 362
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..05201ae719e5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+
+STATIC int
+xfs_trim_extents(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_fsblock_t		start,
+	xfs_fsblock_t		len,
+	xfs_fsblock_t		minlen,
+	__uint64_t		*blocks_trimmed)
+{
+	struct block_device	*bdev = mp->m_ddev_targp->bt_bdev;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	struct xfs_perag	*pag;
+	int			error;
+	int			i;
+
+	pag = xfs_perag_get(mp, agno);
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error || !agbp)
+		goto out_put_perag;
+
+	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+
+	/*
+	 * Force out the log. This means any transactions that might have freed
+	 * space before we took the AGF buffer lock are now on disk, and the
+	 * volatile disk cache is flushed.
+	 */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * Look up the longest btree in the AGF and start with it.
+	 */
+	error = xfs_alloc_lookup_le(cur, 0,
+			    XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
+	if (error)
+		goto out_del_cursor;
+
+	/*
+	 * Loop until we are done with all extents that are large
+	 * enough to be worth discarding.
+	 */
+	while (i) {
+		xfs_agblock_t fbno;
+		xfs_extlen_t flen;
+
+		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+		if (error)
+			goto out_del_cursor;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+		ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
+
+		/*
+		 * Too small? Give up.
+		 */
+		if (flen < minlen) {
+			trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+			goto out_del_cursor;
+		}
+
+		/*
+		 * If the extent is entirely outside of the range we are
+		 * supposed to discard skip it. Do not bother to trim
+		 * down partially overlapping ranges for now.
+		 */
+		if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
+		    XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+			trace_xfs_discard_exclude(mp, agno, fbno, flen);
+			goto next_extent;
+		}
+
+		/*
+		 * If any blocks in the range are still busy, skip the
+		 * discard and try again the next time.
+		 */
+		if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+			trace_xfs_discard_busy(mp, agno, fbno, flen);
+			goto next_extent;
+		}
+
+		trace_xfs_discard_extent(mp, agno, fbno, flen);
+		error = -blkdev_issue_discard(bdev,
+				XFS_AGB_TO_DADDR(mp, agno, fbno),
+				XFS_FSB_TO_BB(mp, flen),
+				GFP_NOFS, 0);
+		if (error)
+			goto out_del_cursor;
+		*blocks_trimmed += flen;
+
+next_extent:
+		error = xfs_btree_decrement(cur, 0, &i);
+		if (error)
+			goto out_del_cursor;
+	}
+
+out_del_cursor:
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	xfs_buf_relse(agbp);
+out_put_perag:
+	xfs_perag_put(pag);
+	return error;
+}
+
+int
+xfs_ioc_trim(
+	struct xfs_mount		*mp,
+	struct fstrim_range __user	*urange)
+{
+	struct request_queue	*q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+	unsigned int		granularity = q->limits.discard_granularity;
+	struct fstrim_range	range;
+	xfs_fsblock_t		start, len, minlen;
+	xfs_agnumber_t		start_agno, end_agno, agno;
+	__uint64_t		blocks_trimmed = 0;
+	int			error, last_error = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -XFS_ERROR(EPERM);
+	if (copy_from_user(&range, urange, sizeof(range)))
+		return -XFS_ERROR(EFAULT);
+
+	/*
+	 * Truncating down the len isn't actually quite correct, but using
+	 * XFS_B_TO_FSB would mean we trivially get overflows for values
+	 * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
+	 * used by the fstrim application.  In the end it really doesn't
+	 * matter as trimming blocks is an advisory interface.
+	 */
+	start = XFS_B_TO_FSBT(mp, range.start);
+	len = XFS_B_TO_FSBT(mp, range.len);
+	minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+
+	start_agno = XFS_FSB_TO_AGNO(mp, start);
+	if (start_agno >= mp->m_sb.sb_agcount)
+		return -XFS_ERROR(EINVAL);
+
+	end_agno = XFS_FSB_TO_AGNO(mp, start + len);
+	if (end_agno >= mp->m_sb.sb_agcount)
+		end_agno = mp->m_sb.sb_agcount - 1;
+
+	for (agno = start_agno; agno <= end_agno; agno++) {
+		error = -xfs_trim_extents(mp, agno, start, len, minlen,
+					  &blocks_trimmed);
+		if (error)
+			last_error = error;
+	}
+
+	if (last_error)
+		return last_error;
+
+	range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+	if (copy_to_user(urange, &range, sizeof(range)))
+		return -XFS_ERROR(EFAULT);
+	return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+
+struct fstrim_range;
+
+extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+
+#endif	/* XFS_DISCARD_H */
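xfs_ioc_trim() is the kernel half of the FITRIM ioctl: userspace passes a
struct fstrim_range and gets back, in range.len, the number of bytes actually
trimmed. A hedged sketch of the calling side (ULLONG_MAX as the length matches
what the fstrim tool passes, per the comment in xfs_ioc_trim above; requires
CAP_SYS_ADMIN):

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
	struct fstrim_range range = {
		.start	= 0,
		.len	= ULLONG_MAX,	/* whole filesystem */
		.minlen	= 0,		/* kernel raises this to the discard granularity */
	};
	int fd;

	if (argc < 2)
		return 2;
	fd = open(argv[1], O_RDONLY);	/* any file or dir on the filesystem */
	if (fd < 0 || ioctl(fd, FITRIM, &range) != 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}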
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..a55c1b46b219 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
@@ -262,22 +297,21 @@ xfs_file_aio_read(
262 if (XFS_FORCED_SHUTDOWN(mp)) 297 if (XFS_FORCED_SHUTDOWN(mp))
263 return -EIO; 298 return -EIO;
264 299
265 if (unlikely(ioflags & IO_ISDIRECT))
266 mutex_lock(&inode->i_mutex);
267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
268
269 if (unlikely(ioflags & IO_ISDIRECT)) { 300 if (unlikely(ioflags & IO_ISDIRECT)) {
301 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
302
270 if (inode->i_mapping->nrpages) { 303 if (inode->i_mapping->nrpages) {
271 ret = -xfs_flushinval_pages(ip, 304 ret = -xfs_flushinval_pages(ip,
272 (iocb->ki_pos & PAGE_CACHE_MASK), 305 (iocb->ki_pos & PAGE_CACHE_MASK),
273 -1, FI_REMAPF_LOCKED); 306 -1, FI_REMAPF_LOCKED);
307 if (ret) {
308 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310 }
274 } 311 }
275 mutex_unlock(&inode->i_mutex); 312 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
276 if (ret) { 313 } else
277 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 314 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
278 return ret;
279 }
280 }
281 315
282 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 316 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
283 317
@@ -285,7 +319,7 @@ xfs_file_aio_read(
285 if (ret > 0) 319 if (ret > 0)
286 XFS_STATS_ADD(xs_read_bytes, ret); 320 XFS_STATS_ADD(xs_read_bytes, ret);
287 321
288 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 322 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
289 return ret; 323 return ret;
290} 324}
291 325
@@ -309,7 +343,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 343 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 344 return -EIO;
311 345
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 346 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 347
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 349
@@ -317,10 +351,61 @@ xfs_file_splice_read(
317 if (ret > 0) 351 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 352 XFS_STATS_ADD(xs_read_bytes, ret);
319 353
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 354 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 355 return ret;
322} 356}
323 357
358STATIC void
359xfs_aio_write_isize_update(
360 struct inode *inode,
361 loff_t *ppos,
362 ssize_t bytes_written)
363{
364 struct xfs_inode *ip = XFS_I(inode);
365 xfs_fsize_t isize = i_size_read(inode);
366
367 if (bytes_written > 0)
368 XFS_STATS_ADD(xs_write_bytes, bytes_written);
369
370 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
371 *ppos > isize))
372 *ppos = isize;
373
374 if (*ppos > ip->i_size) {
375 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
376 if (*ppos > ip->i_size)
377 ip->i_size = *ppos;
378 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
379 }
380}
381
382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occured. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back.
387 */
388STATIC void
389xfs_aio_write_newsize_update(
390 struct xfs_inode *ip)
391{
392 if (ip->i_new_size) {
393 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
394 ip->i_new_size = 0;
395 if (ip->i_d.di_size > ip->i_size)
396 ip->i_d.di_size = ip->i_size;
397 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
398 }
399}
400
401/*
402 * xfs_file_splice_write() does not use xfs_rw_ilock() because
403 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
404 * couuld cause lock inversions between the aio_write path and the splice path
405 * if someone is doing concurrent splice(2) based writes and write(2) based
406 * writes to the same inode. The only real way to fix this is to re-implement
407 * the generic code here with correct locking orders.
408 */
324STATIC ssize_t 409STATIC ssize_t
325xfs_file_splice_write( 410xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 411 struct pipe_inode_info *pipe,
@@ -331,7 +416,7 @@ xfs_file_splice_write(
331{ 416{
332 struct inode *inode = outfilp->f_mapping->host; 417 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 418 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 419 xfs_fsize_t new_size;
335 int ioflags = 0; 420 int ioflags = 0;
336 ssize_t ret; 421 ssize_t ret;
337 422
@@ -355,27 +440,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 440 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 441
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 442 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360
361 isize = i_size_read(inode);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 443
372 if (ip->i_new_size) { 444 xfs_aio_write_isize_update(inode, ppos, ret);
373 xfs_ilock(ip, XFS_ILOCK_EXCL); 445 xfs_aio_write_newsize_update(ip);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 447 return ret;
381} 448}
@@ -562,247 +629,314 @@ out_lock:
562 return error; 629 return error;
563} 630}
564 631
632/*
633 * Common pre-write limit and setup checks.
634 *
635 * Returns with iolock held according to @iolock.
636 */
565STATIC ssize_t 637STATIC ssize_t
566xfs_file_aio_write( 638xfs_file_aio_write_checks(
567 struct kiocb *iocb, 639 struct file *file,
568 const struct iovec *iovp, 640 loff_t *pos,
569 unsigned long nr_segs, 641 size_t *count,
570 loff_t pos) 642 int *iolock)
571{ 643{
572 struct file *file = iocb->ki_filp; 644 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 645 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 646 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 647 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 648
584 XFS_STATS_INC(xs_write_calls); 649 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
650 if (error) {
651 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
652 *iolock = 0;
653 return error;
654 }
585 655
586 BUG_ON(iocb->ki_pos != pos); 656 new_size = *pos + *count;
657 if (new_size > ip->i_size)
658 ip->i_new_size = new_size;
587 659
588 if (unlikely(file->f_flags & O_DIRECT)) 660 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 661 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME) 662
591 ioflags |= IO_INVIS; 663 /*
664 * If the offset is beyond the size of the file, we need to zero any
665 * blocks that fall between the existing EOF and the start of this
666 * write.
667 */
668 if (*pos > ip->i_size)
669 error = -xfs_zero_eof(ip, *pos, ip->i_size);
592 670
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 672 if (error)
595 return error; 673 return error;
596 674
597 count = ocount; 675 /*
598 if (count == 0) 676 * If we're writing the file then make sure to clear the setuid and
599 return 0; 677 * setgid bits if the process is not being run by root. This keeps
600 678 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 679 */
680 return file_remove_suid(file);
602 681
603 if (XFS_FORCED_SHUTDOWN(mp)) 682}
604 return -EIO;
605 683
606relock: 684/*
607 if (ioflags & IO_ISDIRECT) { 685 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 686 *
609 need_i_mutex = 0; 687 * Lock the inode appropriately to prepare for and issue a direct IO write.
610 } else { 688 * By separating it from the buffered write path we remove all the tricky to
611 iolock = XFS_IOLOCK_EXCL; 689 * follow locking changes and looping.
612 need_i_mutex = 1; 690 *
613 mutex_lock(&inode->i_mutex); 691 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
692 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
693 * pages are flushed out.
694 *
695 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
696 * allowing them to be done in parallel with reads and other direct IO writes.
697 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
698 * needs to do sub-block zeroing and that requires serialisation against other
699 * direct IOs to the same block. In this case we need to serialise the
700 * submission of the unaligned IOs so that we don't get racing block zeroing in
701 * the dio layer. To avoid the problem with aio, we also need to wait for
702 * outstanding IOs to complete so that unwritten extent conversion is completed
703 * before we try to map the overlapping block. This is currently implemented by
704 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
705 *
706 * Returns with locks held indicated by @iolock and errors indicated by
707 * negative return values.
708 */
709STATIC ssize_t
710xfs_file_dio_aio_write(
711 struct kiocb *iocb,
712 const struct iovec *iovp,
713 unsigned long nr_segs,
714 loff_t pos,
715 size_t ocount,
716 int *iolock)
717{
718 struct file *file = iocb->ki_filp;
719 struct address_space *mapping = file->f_mapping;
720 struct inode *inode = mapping->host;
721 struct xfs_inode *ip = XFS_I(inode);
722 struct xfs_mount *mp = ip->i_mount;
723 ssize_t ret = 0;
724 size_t count = ocount;
725 int unaligned_io = 0;
726 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
727 mp->m_rtdev_targp : mp->m_ddev_targp;
728
729 *iolock = 0;
730 if ((pos & target->bt_smask) || (count & target->bt_smask))
731 return -XFS_ERROR(EINVAL);
732
733 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
734 unaligned_io = 1;
735
736 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
737 *iolock = XFS_IOLOCK_EXCL;
738 else
739 *iolock = XFS_IOLOCK_SHARED;
740 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
741
742 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
743 if (ret)
744 return ret;
745
746 if (mapping->nrpages) {
747 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
748 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
749 FI_REMAPF_LOCKED);
750 if (ret)
751 return ret;
614 } 752 }
615 753
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 754 /*
617 755 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 756 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 757 */
620 S_ISBLK(inode->i_mode)); 758 if (unaligned_io)
621 if (error) { 759 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 760 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 761 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
762 *iolock = XFS_IOLOCK_SHARED;
624 } 763 }
625 764
626 if (ioflags & IO_ISDIRECT) { 765 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 766 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 767 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 768
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 769 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 770 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 771 return ret;
634 } 772}
635 773
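The choice between the shared and exclusive iolock is the heart of the new direct IO path above. As a minimal sketch, with stand-in parameters for state the real function reads from the inode, mapping and mount (the names below are illustrative, not part of this patch):

	/*
	 * Illustrative condensation of the iolock choice made in
	 * xfs_file_dio_aio_write(); parameters are hypothetical stand-ins.
	 */
	static int pick_dio_iolock(loff_t pos, size_t count,
				   unsigned long blockmask,
				   unsigned long nrpages, loff_t isize)
	{
		int unaligned = (pos & blockmask) || ((pos + count) & blockmask);

		/*
		 * Sub-block IO needs dio-layer block zeroing, and cached pages
		 * or EOF extension need flushing/zeroing first: all of these
		 * demand the exclusive iolock. Everything else runs shared,
		 * in parallel with reads and other direct writes.
		 */
		if (unaligned || nrpages || pos > isize)
			return XFS_IOLOCK_EXCL;
		return XFS_IOLOCK_SHARED;
	}

Once the page cache is flushed, the real function demotes back to shared unless sub-block zeroing forces it to wait out all in-flight IO via xfs_ioend_wait().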
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 774STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 775xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 776 struct kiocb *iocb,
639 need_i_mutex = 1; 777 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 778 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 779 loff_t pos,
642 goto start; 780 size_t ocount,
643 } 781 int *iolock)
644 } 782{
783 struct file *file = iocb->ki_filp;
784 struct address_space *mapping = file->f_mapping;
785 struct inode *inode = mapping->host;
786 struct xfs_inode *ip = XFS_I(inode);
787 ssize_t ret;
788 int enospc = 0;
789 size_t count = ocount;
645 790
646 new_size = pos + count; 791 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 792 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 793
650 if (likely(!(ioflags & IO_INVIS))) 794 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 795 if (ret)
796 return ret;
652 797
798 /* We can write back this queue in page reclaim */
799 current->backing_dev_info = mapping->backing_dev_info;
800
801write_retry:
802 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
803 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
804 pos, &iocb->ki_pos, count, ret);
653 /* 805 /*
654 * If the offset is beyond the size of the file, we have a couple 806 * if we just got an ENOSPC, flush the inode now we aren't holding any
655 * of things to do. First, if there is already space allocated 807 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 808 */
661 809 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 810 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 811 if (ret)
664 if (error) { 812 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 813 enospc = 1;
666 goto out_unlock_internal; 814 goto write_retry;
667 }
668 } 815 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 816 current->backing_dev_info = NULL;
817 return ret;
818}
670 819
671 /* 820STATIC ssize_t
672 * If we're writing the file then make sure to clear the 821xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 822 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 823 const struct iovec *iovp,
675 * setgid binaries. 824 unsigned long nr_segs,
676 */ 825 loff_t pos)
677 error = -file_remove_suid(file); 826{
678 if (unlikely(error)) 827 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 828 struct address_space *mapping = file->f_mapping;
829 struct inode *inode = mapping->host;
830 struct xfs_inode *ip = XFS_I(inode);
831 ssize_t ret;
832 int iolock;
833 size_t ocount = 0;
680 834
681 /* We can write back this queue in page reclaim */ 835 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 836
684 if ((ioflags & IO_ISDIRECT)) { 837 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 838
694 if (need_i_mutex) { 839 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 840 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 841 return ret;
697 mutex_unlock(&inode->i_mutex);
698 842
699 iolock = XFS_IOLOCK_SHARED; 843 if (ocount == 0)
700 need_i_mutex = 0; 844 return 0;
701 }
702 845
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 846 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 847
707 /* 848 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 849 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 850
714 pos += ret; 851 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 852 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
853 ocount, &iolock);
854 else
855 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
856 ocount, &iolock);
716 857
717 ioflags &= ~IO_ISDIRECT; 858 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 859
725write_retry: 860 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 861 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
 730		 * if we just got an ENOSPC, flush the inode now that we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 862
743 current->backing_dev_info = NULL; 863 /* Handle various SYNC-type writes */
864 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
865 loff_t end = pos + ret - 1;
866 int error, error2;
744 867
745 isize = i_size_read(inode); 868 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 869 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 870 xfs_rw_ilock(ip, iolock);
748 871
749 if (iocb->ki_pos > ip->i_size) { 872 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 873 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 874 if (error)
752 ip->i_size = iocb->ki_pos; 875 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 876 else if (error2)
877 ret = error2;
754 } 878 }
755 879
756 error = -ret; 880out_unlock:
757 if (ret <= 0) 881 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 882 xfs_rw_iunlock(ip, iolock);
883 return ret;
884}
759 885
760 XFS_STATS_ADD(xs_write_bytes, ret); 886STATIC long
887xfs_file_fallocate(
888 struct file *file,
889 int mode,
890 loff_t offset,
891 loff_t len)
892{
893 struct inode *inode = file->f_path.dentry->d_inode;
894 long error;
895 loff_t new_size = 0;
896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP;
761 899
762 /* Handle various SYNC-type writes */ 900 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 901 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 902
767 xfs_iunlock(ip, iolock); 903 bf.l_whence = 0;
768 if (need_i_mutex) 904 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 905 bf.l_len = len;
770 906
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 907 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 908
778 error2 = -xfs_file_fsync(file, 909 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 910 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 911
781 error = error2; 912 /* check the new inode size is valid before allocating */
913 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
914 offset + len > i_size_read(inode)) {
915 new_size = offset + len;
916 error = inode_newsize_ok(inode, new_size);
917 if (error)
918 goto out_unlock;
782 } 919 }
783 920
784 out_unlock_internal: 921 error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
785 if (ip->i_new_size) { 922 if (error)
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 923 goto out_unlock;
787 ip->i_new_size = 0; 924
788 /* 925 /* Change file size if needed */
789 * If this was a direct or synchronous I/O that failed (such 926 if (new_size) {
790 * as ENOSPC) then part of the I/O may have been written to 927 struct iattr iattr;
 791	 * disk before the error occurred. In this case the on-disk 928
792 * file size may have been adjusted beyond the in-memory file 929 iattr.ia_valid = ATTR_SIZE;
793 * size and now needs to be truncated back. 930 iattr.ia_size = new_size;
794 */ 931 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
795 if (ip->i_d.di_size > ip->i_size)
796 ip->i_d.di_size = ip->i_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL);
798 } 932 }
799 xfs_iunlock(ip, iolock); 933
800 out_unlock_mutex: 934out_unlock:
801 if (need_i_mutex) 935 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 936 return error;
803 return -error;
804} 937}
805 938
939
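Moving preallocation from the inode operation to the new file operation leaves the user-visible entry point unchanged: the fallocate(2) system call. A minimal userspace sketch (file name hypothetical, error handling trimmed):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("testfile", O_RDWR | O_CREAT, 0644);

		if (fd < 0)
			return 1;

		/* Preallocate 1 MiB of space without changing the file size. */
		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
			perror("preallocate");

		/* Punch a 4 KiB hole; KEEP_SIZE accompanies PUNCH_HOLE. */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      0, 4096))
			perror("punch hole");

		close(fd);
		return 0;
	}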
806STATIC int 940STATIC int
807xfs_file_open( 941xfs_file_open(
808 struct inode *inode, 942 struct inode *inode,
@@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1055 .open = xfs_file_open,
922 .release = xfs_file_release, 1056 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1057 .fsync = xfs_file_fsync,
1058 .fallocate = xfs_file_fallocate,
924}; 1059};
925 1060
926const struct file_operations xfs_dir_file_operations = { 1061const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ad442d9e392e..f5e2a19e0f8e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -984,10 +985,22 @@ xfs_ioctl_setattr(
984 985
985 /* 986 /*
986 * Extent size must be a multiple of the appropriate block 987 * Extent size must be a multiple of the appropriate block
987 * size, if set at all. 988 * size, if set at all. It must also be smaller than the
989 * maximum extent size supported by the filesystem.
990 *
991 * Also, for non-realtime files, limit the extent size hint to
992 * half the size of the AGs in the filesystem so alignment
993 * doesn't result in extents larger than an AG.
988 */ 994 */
989 if (fa->fsx_extsize != 0) { 995 if (fa->fsx_extsize != 0) {
990 xfs_extlen_t size; 996 xfs_extlen_t size;
997 xfs_fsblock_t extsize_fsb;
998
999 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1000 if (extsize_fsb > MAXEXTLEN) {
1001 code = XFS_ERROR(EINVAL);
1002 goto error_return;
1003 }
991 1004
992 if (XFS_IS_REALTIME_INODE(ip) || 1005 if (XFS_IS_REALTIME_INODE(ip) ||
993 ((mask & FSX_XFLAGS) && 1006 ((mask & FSX_XFLAGS) &&
@@ -996,6 +1009,10 @@ xfs_ioctl_setattr(
996 mp->m_sb.sb_blocklog; 1009 mp->m_sb.sb_blocklog;
997 } else { 1010 } else {
998 size = mp->m_sb.sb_blocksize; 1011 size = mp->m_sb.sb_blocksize;
1012 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1013 code = XFS_ERROR(EINVAL);
1014 goto error_return;
1015 }
999 } 1016 }
1000 1017
1001 if (fa->fsx_extsize % size) { 1018 if (fa->fsx_extsize % size) {
@@ -1294,6 +1311,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1311 trace_xfs_file_ioctl(ip);
1295 1312
1296 switch (cmd) { 1313 switch (cmd) {
1314 case FITRIM:
1315 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1316 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1317 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1318 case XFS_IOC_RESVSP:
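FITRIM is the generic batched-discard ioctl; xfs_ioc_trim() walks the free-space data and issues discards for free extents in the given range. The userspace side, sketched (mount point path illustrative):

	#include <fcntl.h>
	#include <linux/fs.h>		/* FITRIM, struct fstrim_range */
	#include <stdio.h>
	#include <sys/ioctl.h>

	int main(void)
	{
		struct fstrim_range range = {
			.start	= 0,
			.len	= (__u64)-1,	/* whole filesystem */
			.minlen	= 0,
		};
		int fd = open("/mnt/xfs", O_RDONLY);

		if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
			perror("FITRIM");
			return 1;
		}
		/* the kernel writes the number of trimmed bytes back */
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
		return 0;
	}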
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index da54403633b6..bd5727852fd6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -505,61 +504,6 @@ xfs_vn_setattr(
505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 504 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
506} 505}
507 506
508STATIC long
509xfs_vn_fallocate(
510 struct inode *inode,
511 int mode,
512 loff_t offset,
513 loff_t len)
514{
515 long error;
516 loff_t new_size = 0;
517 xfs_flock64_t bf;
518 xfs_inode_t *ip = XFS_I(inode);
519 int cmd = XFS_IOC_RESVSP;
520
521 /* preallocation on directories not yet supported */
522 error = -ENODEV;
523 if (S_ISDIR(inode->i_mode))
524 goto out_error;
525
526 bf.l_whence = 0;
527 bf.l_start = offset;
528 bf.l_len = len;
529
530 xfs_ilock(ip, XFS_IOLOCK_EXCL);
531
532 if (mode & FALLOC_FL_PUNCH_HOLE)
533 cmd = XFS_IOC_UNRESVSP;
534
535 /* check the new inode size is valid before allocating */
536 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
537 offset + len > i_size_read(inode)) {
538 new_size = offset + len;
539 error = inode_newsize_ok(inode, new_size);
540 if (error)
541 goto out_unlock;
542 }
543
544 error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
545 if (error)
546 goto out_unlock;
547
548 /* Change file size if needed */
549 if (new_size) {
550 struct iattr iattr;
551
552 iattr.ia_valid = ATTR_SIZE;
553 iattr.ia_size = new_size;
554 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
555 }
556
557out_unlock:
558 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
559out_error:
560 return error;
561}
562
563#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 507#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
564 508
565/* 509/*
@@ -653,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
653 .getxattr = generic_getxattr, 597 .getxattr = generic_getxattr,
654 .removexattr = generic_removexattr, 598 .removexattr = generic_removexattr,
655 .listxattr = xfs_vn_listxattr, 599 .listxattr = xfs_vn_listxattr,
656 .fallocate = xfs_vn_fallocate,
657 .fiemap = xfs_vn_fiemap, 600 .fiemap = xfs_vn_fiemap,
658}; 601};
659 602
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bd07f7339366..9731898083ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1414,7 +1414,7 @@ xfs_fs_freeze(
1414 1414
1415 xfs_save_resvblks(mp); 1415 xfs_save_resvblks(mp);
1416 xfs_quiesce_attr(mp); 1416 xfs_quiesce_attr(mp);
1417 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1417 return -xfs_fs_log_dummy(mp);
1418} 1418}
1419 1419
1420STATIC int 1420STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a02480de9759..e22f0057d21f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -362,7 +362,7 @@ xfs_quiesce_data(
362 362
363 /* mark the log as covered if needed */ 363 /* mark the log as covered if needed */
364 if (xfs_log_need_covered(mp)) 364 if (xfs_log_need_covered(mp))
365 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 365 error2 = xfs_fs_log_dummy(mp);
366 366
367 /* flush data-only devices */ 367 /* flush data-only devices */
368 if (mp->m_rtdev_targp) 368 if (mp->m_rtdev_targp)
@@ -503,13 +503,14 @@ xfs_sync_worker(
503 int error; 503 int error;
504 504
505 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 505 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
506 xfs_log_force(mp, 0);
507 xfs_reclaim_inodes(mp, 0);
508 /* dgc: errors ignored here */ 506 /* dgc: errors ignored here */
509 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
510 if (mp->m_super->s_frozen == SB_UNFROZEN && 507 if (mp->m_super->s_frozen == SB_UNFROZEN &&
511 xfs_log_need_covered(mp)) 508 xfs_log_need_covered(mp))
512 error = xfs_fs_log_dummy(mp, 0); 509 error = xfs_fs_log_dummy(mp);
510 else
511 xfs_log_force(mp, 0);
512 xfs_reclaim_inodes(mp, 0);
513 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
513 } 514 }
514 mp->m_sync_seq++; 515 mp->m_sync_seq++;
515 wake_up(&mp->m_wait_single_sync_task); 516 wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee3cee097e7e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
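With the custom handler wired up, every write to the knob passes through xfs_panic_mask_proc_handler(): proc_dointvec_minmax() stores and range-checks the value before the DEBUG-only tags are ORed back in. A small userspace sketch of poking it (equivalent to sysctl -w fs.xfs.panic_mask=0):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/proc/sys/fs/xfs/panic_mask", O_WRONLY);

		if (fd < 0 || write(fd, "0\n", 2) != 2)
			perror("panic_mask");
		close(fd);
		return 0;
	}

On DEBUG builds the stored mask will still contain XFS_PTAG_SHUTDOWN_CORRUPT and XFS_PTAG_LOGRES no matter what was written.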
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 647af2a2e7aa..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1759,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1759DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1759DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1760DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1760DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1761 1761
1762DECLARE_EVENT_CLASS(xfs_discard_class,
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1764 xfs_agblock_t agbno, xfs_extlen_t len),
1765 TP_ARGS(mp, agno, agbno, len),
1766 TP_STRUCT__entry(
1767 __field(dev_t, dev)
1768 __field(xfs_agnumber_t, agno)
1769 __field(xfs_agblock_t, agbno)
1770 __field(xfs_extlen_t, len)
1771 ),
1772 TP_fast_assign(
1773 __entry->dev = mp->m_super->s_dev;
1774 __entry->agno = agno;
1775 __entry->agbno = agbno;
1776 __entry->len = len;
1777 ),
1778 TP_printk("dev %d:%d agno %u agbno %u len %u\n",
1779 MAJOR(__entry->dev), MINOR(__entry->dev),
1780 __entry->agno,
1781 __entry->agbno,
1782 __entry->len)
1783)
1784
1785#define DEFINE_DISCARD_EVENT(name) \
1786DEFINE_EVENT(xfs_discard_class, name, \
1787 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1788 xfs_agblock_t agbno, xfs_extlen_t len), \
1789 TP_ARGS(mp, agno, agbno, len))
1790DEFINE_DISCARD_EVENT(xfs_discard_extent);
1791DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1792DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1793DEFINE_DISCARD_EVENT(xfs_discard_busy);
1794
1762#endif /* _TRACE_XFS_H */ 1795#endif /* _TRACE_XFS_H */
1763 1796
1764#undef TRACE_INCLUDE_PATH 1797#undef TRACE_INCLUDE_PATH
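DEFINE_EVENT() generates a trace_<name>() function for each event; the discard code calls these like ordinary functions, and the events are toggled through tracefs. A minimal sketch (call-site arguments mirror TP_PROTO above; tracefs path per the usual layout):

	/* in the trim path, once an extent's fate is decided: */
	trace_xfs_discard_extent(mp, agno, agbno, len);

	/* userspace:
	 *   echo 1 > /sys/kernel/debug/tracing/events/xfs/xfs_discard_extent/enable
	 *   cat /sys/kernel/debug/tracing/trace_pipe
	 */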
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f8e854b4fde8..206a2815ced6 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1863,12 +1863,14 @@ xfs_qm_dqreclaim_one(void)
1863 xfs_dquot_t *dqpout; 1863 xfs_dquot_t *dqpout;
1864 xfs_dquot_t *dqp; 1864 xfs_dquot_t *dqp;
1865 int restarts; 1865 int restarts;
1866 int startagain;
1866 1867
1867 restarts = 0; 1868 restarts = 0;
1868 dqpout = NULL; 1869 dqpout = NULL;
1869 1870
1870 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ 1871 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1871startagain: 1872again:
1873 startagain = 0;
1872 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1874 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1873 1875
1874 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { 1876 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1885,13 +1887,10 @@ startagain:
1885 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 1887 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1886 1888
1887 trace_xfs_dqreclaim_want(dqp); 1889 trace_xfs_dqreclaim_want(dqp);
1888
1889 xfs_dqunlock(dqp);
1890 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1891 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1892 return NULL;
1893 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1890 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1894 goto startagain; 1891 restarts++;
1892 startagain = 1;
1893 goto dqunlock;
1895 } 1894 }
1896 1895
1897 /* 1896 /*
@@ -1906,23 +1905,20 @@ startagain:
1906 ASSERT(list_empty(&dqp->q_mplist)); 1905 ASSERT(list_empty(&dqp->q_mplist));
1907 list_del_init(&dqp->q_freelist); 1906 list_del_init(&dqp->q_freelist);
1908 xfs_Gqm->qm_dqfrlist_cnt--; 1907 xfs_Gqm->qm_dqfrlist_cnt--;
1909 xfs_dqunlock(dqp);
1910 dqpout = dqp; 1908 dqpout = dqp;
1911 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1909 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1912 break; 1910 goto dqunlock;
1913 } 1911 }
1914 1912
1915 ASSERT(dqp->q_hash); 1913 ASSERT(dqp->q_hash);
1916 ASSERT(!list_empty(&dqp->q_mplist)); 1914 ASSERT(!list_empty(&dqp->q_mplist));
1917 1915
1918 /* 1916 /*
1919 * Try to grab the flush lock. If this dquot is in the process of 1917 * Try to grab the flush lock. If this dquot is in the process
1920 * getting flushed to disk, we don't want to reclaim it. 1918 * of getting flushed to disk, we don't want to reclaim it.
1921 */ 1919 */
1922 if (!xfs_dqflock_nowait(dqp)) { 1920 if (!xfs_dqflock_nowait(dqp))
1923 xfs_dqunlock(dqp); 1921 goto dqunlock;
1924 continue;
1925 }
1926 1922
1927 /* 1923 /*
1928 * We have the flush lock so we know that this is not in the 1924 * We have the flush lock so we know that this is not in the
@@ -1944,8 +1940,7 @@ startagain:
1944 xfs_fs_cmn_err(CE_WARN, mp, 1940 xfs_fs_cmn_err(CE_WARN, mp,
1945 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1941 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
1946 } 1942 }
1947 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 1943 goto dqunlock;
1948 continue;
1949 } 1944 }
1950 1945
1951 /* 1946 /*
@@ -1967,13 +1962,8 @@ startagain:
1967 */ 1962 */
1968 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { 1963 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
1969 restarts++; 1964 restarts++;
1970 mutex_unlock(&dqp->q_hash->qh_lock); 1965 startagain = 1;
1971 xfs_dqfunlock(dqp); 1966 goto qhunlock;
1972 xfs_dqunlock(dqp);
1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1974 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
1975 return NULL;
1976 goto startagain;
1977 } 1967 }
1978 1968
1979 ASSERT(dqp->q_nrefs == 0); 1969 ASSERT(dqp->q_nrefs == 0);
@@ -1986,14 +1976,20 @@ startagain:
1986 xfs_Gqm->qm_dqfrlist_cnt--; 1976 xfs_Gqm->qm_dqfrlist_cnt--;
1987 dqpout = dqp; 1977 dqpout = dqp;
1988 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); 1978 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1979qhunlock:
1989 mutex_unlock(&dqp->q_hash->qh_lock); 1980 mutex_unlock(&dqp->q_hash->qh_lock);
1990dqfunlock: 1981dqfunlock:
1991 xfs_dqfunlock(dqp); 1982 xfs_dqfunlock(dqp);
1983dqunlock:
1992 xfs_dqunlock(dqp); 1984 xfs_dqunlock(dqp);
1993 if (dqpout) 1985 if (dqpout)
1994 break; 1986 break;
1995 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1987 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1996 return NULL; 1988 break;
1989 if (startagain) {
1990 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1991 goto again;
1992 }
1997 } 1993 }
1998 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1994 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1999 return dqpout; 1995 return dqpout;
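The reshuffle above replaces several open-coded unlock sequences with one fall-through ladder (qhunlock -> dqfunlock -> dqunlock), and startagain defers the restart until the freelist lock can be dropped safely. The shape of the idiom, with hypothetical types and lock helpers:

	/* Sketch only: every exit path funnels through one unlock point. */
	static int reclaim_one(struct item *it)
	{
		int ret = 0;

		lock_item(it);
		if (!trylock_hash(it)) {
			ret = -EAGAIN;		/* caller restarts the scan */
			goto out_item;		/* skip the hash unlock */
		}
		/* ... reclaim work under both locks ... */
		unlock_hash(it);
	out_item:
		unlock_item(it);
		return ret;
	}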
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 975aa10e1a47..0df88897ef84 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -25,86 +25,78 @@
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_error.h" 26#include "xfs_error.h"
27 27
28static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock);
30
31/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
32#define XFS_MAX_ERR_LEVEL 7
33#define XFS_ERR_MASK ((1 << 3) - 1)
34static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
35 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
36 KERN_ERR, KERN_WARNING, KERN_NOTICE,
37 KERN_INFO, KERN_DEBUG};
38
39void 28void
40cmn_err(register int level, char *fmt, ...) 29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
41{ 33{
42 char *fp = fmt; 34 struct va_format vaf;
43 int len; 35 va_list args;
44 ulong flags; 36
45 va_list ap; 37 va_start(args, fmt);
46 38 vaf.fmt = fmt;
47 level &= XFS_ERR_MASK; 39 vaf.va = &args;
48 if (level > XFS_MAX_ERR_LEVEL) 40
49 level = XFS_MAX_ERR_LEVEL; 41 printk("%s%pV", lvl, &vaf);
50 spin_lock_irqsave(&xfs_err_lock,flags); 42 va_end(args);
51 va_start(ap, fmt); 43
52 if (*fmt == '!') fp++; 44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
53 len = vsnprintf(message, sizeof(message), fp, ap);
54 if (len >= sizeof(message))
55 len = sizeof(message) - 1;
56 if (message[len-1] == '\n')
57 message[len-1] = 0;
58 printk("%s%s\n", err_level[level], message);
59 va_end(ap);
60 spin_unlock_irqrestore(&xfs_err_lock,flags);
61 BUG_ON(level == CE_PANIC);
62} 45}
63 46
64void 47void
65xfs_fs_vcmn_err( 48xfs_fs_cmn_err(
66 int level, 49 const char *lvl,
67 struct xfs_mount *mp, 50 struct xfs_mount *mp,
68 char *fmt, 51 const char *fmt,
69 va_list ap) 52 ...)
70{ 53{
71 unsigned long flags; 54 struct va_format vaf;
72 int len = 0; 55 va_list args;
73 56
74 level &= XFS_ERR_MASK; 57 va_start(args, fmt);
75 if (level > XFS_MAX_ERR_LEVEL) 58 vaf.fmt = fmt;
76 level = XFS_MAX_ERR_LEVEL; 59 vaf.va = &args;
77 60
78 spin_lock_irqsave(&xfs_err_lock,flags); 61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
79 63
80 if (mp) { 64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); 65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
82 79
83 /* 80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
84 * Skip the printk if we can't print anything useful 81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
85 * due to an over-long device name. 82 do_panic = 1;
86 */
87 if (len >= sizeof(message))
88 goto out;
89 } 83 }
90 84
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); 85 va_start(args, fmt);
92 if (len >= sizeof(message)) 86 vaf.fmt = fmt;
93 len = sizeof(message) - 1; 87 vaf.va = &args;
94 if (message[len-1] == '\n')
95 message[len-1] = 0;
96 88
97 printk("%s%s\n", err_level[level], message); 89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
98 out: 90 va_end(args);
99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100 91
101 BUG_ON(level == CE_PANIC); 92 BUG_ON(do_panic);
102} 93}
103 94
104void 95void
105assfail(char *expr, char *file, int line) 96assfail(char *expr, char *file, int line)
106{ 97{
107 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); 98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
108 BUG(); 100 BUG();
109} 101}
110 102
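The rewrite drops the static buffer and spinlock in favour of the kernel's %pV printk extension, which consumes a va_list through struct va_format in a single call. The pattern in isolation, as a sketch (function name hypothetical):

	#include <linux/kernel.h>

	static void my_warn(const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* one atomic printk: no shared buffer, no locking needed */
		printk(KERN_WARNING "myfs: %pV", &vaf);
		va_end(args);
	}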
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index d2d20462fd4f..05699f67d475 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,15 +20,22 @@
20 20
21#include <stdarg.h> 21#include <stdarg.h>
22 22
23#define CE_DEBUG 7 /* debug */ 23struct xfs_mount;
24#define CE_CONT 6 /* continuation */ 24
25#define CE_NOTE 5 /* notice */ 25#define CE_DEBUG KERN_DEBUG
26#define CE_WARN 4 /* warning */ 26#define CE_CONT KERN_INFO
27#define CE_ALERT 1 /* alert */ 27#define CE_NOTE KERN_NOTICE
28#define CE_PANIC 0 /* panic */ 28#define CE_WARN KERN_WARNING
29 29#define CE_ALERT KERN_ALERT
30extern void cmn_err(int, char *, ...) 30#define CE_PANIC KERN_EMERG
31 __attribute__ ((format (printf, 2, 3))); 31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
32extern void assfail(char *expr, char *f, int l); 39extern void assfail(char *expr, char *f, int l);
33 40
34#define ASSERT_ALWAYS(expr) \ 41#define ASSERT_ALWAYS(expr) \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index fa8723f5870a..f3227984a9bf 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44static int
45xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
46 xfs_agblock_t bno, xfs_extlen_t len);
47
48/* 44/*
49 * Prototypes for per-ag allocation routines 45 * Prototypes for per-ag allocation routines
50 */ 46 */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
94 * Lookup the first record less than or equal to [bno, len] 90 * Lookup the first record less than or equal to [bno, len]
95 * in the btree given by cur. 91 * in the btree given by cur.
96 */ 92 */
97STATIC int /* error */ 93int /* error */
98xfs_alloc_lookup_le( 94xfs_alloc_lookup_le(
99 struct xfs_btree_cur *cur, /* btree cursor */ 95 struct xfs_btree_cur *cur, /* btree cursor */
100 xfs_agblock_t bno, /* starting block of extent */ 96 xfs_agblock_t bno, /* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
127/* 123/*
128 * Get the data from the pointed-to record. 124 * Get the data from the pointed-to record.
129 */ 125 */
130STATIC int /* error */ 126int /* error */
131xfs_alloc_get_rec( 127xfs_alloc_get_rec(
132 struct xfs_btree_cur *cur, /* btree cursor */ 128 struct xfs_btree_cur *cur, /* btree cursor */
133 xfs_agblock_t *bno, /* output: starting block of extent */ 129 xfs_agblock_t *bno, /* output: starting block of extent */
@@ -2615,7 +2611,7 @@ restart:
2615 * will require a synchronous transaction, but it can still be 2611 * will require a synchronous transaction, but it can still be
2616 * used to distinguish between a partial or exact match. 2612 * used to distinguish between a partial or exact match.
2617 */ 2613 */
2618static int 2614int
2619xfs_alloc_busy_search( 2615xfs_alloc_busy_search(
2620 struct xfs_mount *mp, 2616 struct xfs_mount *mp,
2621 xfs_agnumber_t agno, 2617 xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..d0b3bc72005b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
19#define __XFS_ALLOC_H__ 19#define __XFS_ALLOC_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_btree_cur;
22struct xfs_mount; 23struct xfs_mount;
23struct xfs_perag; 24struct xfs_perag;
24struct xfs_trans; 25struct xfs_trans;
@@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
74#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) 75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
75 76
76/* 77/*
78 * When deciding how much space to allocate out of an AG, we limit the
 79 * allocation maximum size to the size of the AG. However, we cannot use all the
80 * blocks in the AG - some are permanently used by metadata. These
81 * blocks are generally:
82 * - the AG superblock, AGF, AGI and AGFL
83 * - the AGF (bno and cnt) and AGI btree root blocks
84 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
85 *
86 * The AG headers are sector sized, so the amount of space they take up is
87 * dependent on filesystem geometry. The others are all single blocks.
88 */
89#define XFS_ALLOC_AG_MAX_USABLE(mp) \
90 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
91
92
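A worked example makes the set-aside concrete; the geometry below is hypothetical, not something this patch assumes:

	/*
	 * 512-byte sectors, 4 KiB blocks, sb_agblocks = 1048576:
	 * the four sector-sized headers occupy 4 * 512 = 2048 bytes,
	 * which rounds up to one filesystem block, and the seven
	 * single-block structures are subtracted directly.
	 */
	unsigned int agblocks   = 1048576;
	unsigned int hdr_blocks = 1;	/* XFS_BB_TO_FSB(XFS_FSS_TO_BB(mp, 4)) */
	unsigned int max_usable = agblocks - hdr_blocks - 7;	/* 1048568 */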
93/*
77 * Argument structure for xfs_alloc routines. 94 * Argument structure for xfs_alloc routines.
78 * This is turned into a structure to avoid having 20 arguments passed 95 * This is turned into a structure to avoid having 20 arguments passed
79 * down several levels of the stack. 96 * down several levels of the stack.
@@ -118,16 +135,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
118 struct xfs_perag *pag); 135 struct xfs_perag *pag);
119 136
120#ifdef __KERNEL__ 137#ifdef __KERNEL__
121
122void 138void
123xfs_alloc_busy_insert(xfs_trans_t *tp, 139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
124 xfs_agnumber_t agno, 140 xfs_agblock_t bno, xfs_extlen_t len);
125 xfs_agblock_t bno,
126 xfs_extlen_t len);
127 141
128void 142void
129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
130 144
145int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
147 xfs_agblock_t bno, xfs_extlen_t len);
131#endif /* __KERNEL__ */ 148#endif /* __KERNEL__ */
132 149
133/* 150/*
@@ -205,4 +222,18 @@ xfs_free_extent(
205 xfs_fsblock_t bno, /* starting block number of extent */ 222 xfs_fsblock_t bno, /* starting block number of extent */
206 xfs_extlen_t len); /* length of extent */ 223 xfs_extlen_t len); /* length of extent */
207 224
225int /* error */
226xfs_alloc_lookup_le(
227 struct xfs_btree_cur *cur, /* btree cursor */
228 xfs_agblock_t bno, /* starting block of extent */
229 xfs_extlen_t len, /* length of extent */
230 int *stat); /* success/failure */
231
232int /* error */
233xfs_alloc_get_rec(
234 struct xfs_btree_cur *cur, /* btree cursor */
235 xfs_agblock_t *bno, /* output: starting block of extent */
236 xfs_extlen_t *len, /* output: length of extent */
237 int *stat); /* output: success/failure */
238
208#endif /* __XFS_ALLOC_H__ */ 239#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 4111cd3966c7..dc3afd7739ff 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
1038 * Filling in the middle part of a previous delayed allocation. 1038 * Filling in the middle part of a previous delayed allocation.
1039 * Contiguity is impossible here. 1039 * Contiguity is impossible here.
1040 * This case is avoided almost all the time. 1040 * This case is avoided almost all the time.
1041 *
1042 * We start with a delayed allocation:
1043 *
1044 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
1045 * PREV @ idx
1046 *
1047 * and we are allocating:
1048 * +rrrrrrrrrrrrrrrrr+
1049 * new
1050 *
1051 * and we set it up for insertion as:
1052 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
1053 * new
1054 * PREV @ idx LEFT RIGHT
1055 * inserted at idx + 1
1041 */ 1056 */
1042 temp = new->br_startoff - PREV.br_startoff; 1057 temp = new->br_startoff - PREV.br_startoff;
1043 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1044 xfs_bmbt_set_blockcount(ep, temp);
1045 r[0] = *new;
1046 r[1].br_state = PREV.br_state;
1047 r[1].br_startblock = 0;
1048 r[1].br_startoff = new_endoff;
1049 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1050 r[1].br_blockcount = temp2; 1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1051 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state;
1063 RIGHT.br_startblock = nullstartblock(
1064 (int)xfs_bmap_worst_indlen(ip, temp2));
1065 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
1052 ip->i_df.if_lastex = idx + 1; 1069 ip->i_df.if_lastex = idx + 1;
1053 ip->i_d.di_nextents++; 1070 ip->i_d.di_nextents++;
1054 if (cur == NULL) 1071 if (cur == NULL)
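The new diagram describes a three-way split; as a minimal sketch with a hypothetical extent type (the real code operates on the incore extent records and recomputes a worst-case indirect reservation for the delayed remainder):

	struct ext { long long off, len, startblock; };

	/* Split PREV around the newly allocated range, as diagrammed above. */
	static void split_middle(struct ext *prev, const struct ext *new,
				 struct ext *left, struct ext *right)
	{
		long long new_end = new->off + new->len;

		right->off = new_end;			/* delayed remainder */
		right->len = prev->off + prev->len - new_end;
		right->startblock = -1;	/* stand-in for nullstartblock(indlen) */
		*left = *new;				/* the real allocation */
		prev->len = new->off - prev->off;	/* truncate PREV in place */
	}

The key change in the hunk is exactly that remainder: where the old code inserted it with br_startblock = 0, it now keeps a delayed nullstartblock() value carrying the recomputed indirect-block reservation.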
@@ -2430,7 +2447,7 @@ xfs_bmap_btalloc_nullfb(
2430 startag = ag = 0; 2447 startag = ag = 0;
2431 2448
2432 pag = xfs_perag_get(mp, ag); 2449 pag = xfs_perag_get(mp, ag);
2433 while (*blen < ap->alen) { 2450 while (*blen < args->maxlen) {
2434 if (!pag->pagf_init) { 2451 if (!pag->pagf_init) {
2435 error = xfs_alloc_pagf_init(mp, args->tp, ag, 2452 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2436 XFS_ALLOC_FLAG_TRYLOCK); 2453 XFS_ALLOC_FLAG_TRYLOCK);
@@ -2452,7 +2469,7 @@ xfs_bmap_btalloc_nullfb(
2452 notinit = 1; 2469 notinit = 1;
2453 2470
2454 if (xfs_inode_is_filestream(ap->ip)) { 2471 if (xfs_inode_is_filestream(ap->ip)) {
2455 if (*blen >= ap->alen) 2472 if (*blen >= args->maxlen)
2456 break; 2473 break;
2457 2474
2458 if (ap->userdata) { 2475 if (ap->userdata) {
@@ -2498,14 +2515,14 @@ xfs_bmap_btalloc_nullfb(
2498 * If the best seen length is less than the request 2515 * If the best seen length is less than the request
2499 * length, use the best as the minimum. 2516 * length, use the best as the minimum.
2500 */ 2517 */
2501 else if (*blen < ap->alen) 2518 else if (*blen < args->maxlen)
2502 args->minlen = *blen; 2519 args->minlen = *blen;
2503 /* 2520 /*
2504 * Otherwise we've seen an extent as big as alen, 2521 * Otherwise we've seen an extent as big as maxlen,
2505 * use that as the minimum. 2522 * use that as the minimum.
2506 */ 2523 */
2507 else 2524 else
2508 args->minlen = ap->alen; 2525 args->minlen = args->maxlen;
2509 2526
2510 /* 2527 /*
2511 * set the failure fallback case to look in the selected 2528 * set the failure fallback case to look in the selected
@@ -2573,7 +2590,9 @@ xfs_bmap_btalloc(
2573 args.tp = ap->tp; 2590 args.tp = ap->tp;
2574 args.mp = mp; 2591 args.mp = mp;
2575 args.fsbno = ap->rval; 2592 args.fsbno = ap->rval;
2576 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); 2593
2594 /* Trim the allocation back to the maximum an AG can fit. */
2595 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
2577 args.firstblock = ap->firstblock; 2596 args.firstblock = ap->firstblock;
2578 blen = 0; 2597 blen = 0;
2579 if (nullfb) { 2598 if (nullfb) {
@@ -2621,7 +2640,7 @@ xfs_bmap_btalloc(
2621 /* 2640 /*
2622 * Adjust for alignment 2641 * Adjust for alignment
2623 */ 2642 */
2624 if (blen > args.alignment && blen <= ap->alen) 2643 if (blen > args.alignment && blen <= args.maxlen)
2625 args.minlen = blen - args.alignment; 2644 args.minlen = blen - args.alignment;
2626 args.minalignslop = 0; 2645 args.minalignslop = 0;
2627 } else { 2646 } else {
@@ -2640,7 +2659,7 @@ xfs_bmap_btalloc(
2640 * of minlen+alignment+slop doesn't go up 2659 * of minlen+alignment+slop doesn't go up
2641 * between the calls. 2660 * between the calls.
2642 */ 2661 */
2643 if (blen > mp->m_dalign && blen <= ap->alen) 2662 if (blen > mp->m_dalign && blen <= args.maxlen)
2644 nextminlen = blen - mp->m_dalign; 2663 nextminlen = blen - mp->m_dalign;
2645 else 2664 else
2646 nextminlen = args.minlen; 2665 nextminlen = args.minlen;
@@ -4485,6 +4504,16 @@ xfs_bmapi(
4485 /* Figure out the extent size, adjust alen */ 4504 /* Figure out the extent size, adjust alen */
4486 extsz = xfs_get_extsz_hint(ip); 4505 extsz = xfs_get_extsz_hint(ip);
4487 if (extsz) { 4506 if (extsz) {
4507 /*
4508 * make sure we don't exceed a single
4509 * extent length when we align the
 4510 * extent by reducing the length we are
 4511 * going to allocate by the maximum
 4512 * amount extent size alignment may
4513 * require.
4514 */
4515 alen = XFS_FILBLKS_MIN(len,
4516 MAXEXTLEN - (2 * extsz - 1));
4488 error = xfs_bmap_extsize_align(mp, 4517 error = xfs_bmap_extsize_align(mp,
4489 &got, &prev, extsz, 4518 &got, &prev, extsz,
4490 rt, eof, 4519 rt, eof,
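The headroom in the clamp checks out with concrete numbers (illustrative values; MAXEXTLEN is the 21-bit on-disk extent-length limit, (1 << 21) - 1 = 2097151 blocks):

	/*
	 * extsz = 16 blocks:
	 *   alen            <= 2097151 - (2 * 16 - 1) = 2097120
	 *   worst alignment  = up to 15 blocks at each end, +30 total
	 *   aligned maximum  = 2097120 + 30 = 2097150 < MAXEXTLEN
	 * so even maximal front-and-back alignment cannot overflow the
	 * on-disk extent length field.
	 */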
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index ed2b65f3f8b9..6f8c21ce0d6d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,7 +141,6 @@ xfs_buf_item_log_check(
141#define xfs_buf_item_log_check(x) 141#define xfs_buf_item_log_check(x)
142#endif 142#endif
143 143
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp);
145STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); 144STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
146 145
147/* 146/*
@@ -428,13 +427,15 @@ xfs_buf_item_unpin(
428 427
429 if (remove) { 428 if (remove) {
430 /* 429 /*
431 * We have to remove the log item from the transaction 430 * If we are in a transaction context, we have to
432 * as we are about to release our reference to the 431 * remove the log item from the transaction as we are
433 * buffer. If we don't, the unlock that occurs later 432 * about to release our reference to the buffer. If we
434 * in xfs_trans_uncommit() will ry to reference the 433 * don't, the unlock that occurs later in
434 * xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on. 435 * buffer which we no longer have a hold on.
436 */ 436 */
437 xfs_trans_del_item(lip); 437 if (lip->li_desc)
438 xfs_trans_del_item(lip);
438 439
439 /* 440 /*
440 * Since the transaction no longer refers to the buffer, 441 * Since the transaction no longer refers to the buffer,
@@ -959,128 +960,76 @@ xfs_buf_do_callbacks(
959 */ 960 */
960void 961void
961xfs_buf_iodone_callbacks( 962xfs_buf_iodone_callbacks(
962 xfs_buf_t *bp) 963 struct xfs_buf *bp)
963{ 964{
964 xfs_log_item_t *lip; 965 struct xfs_log_item *lip = bp->b_fspriv;
965 static ulong lasttime; 966 struct xfs_mount *mp = lip->li_mountp;
966 static xfs_buftarg_t *lasttarg; 967 static ulong lasttime;
967 xfs_mount_t *mp; 968 static xfs_buftarg_t *lasttarg;
968 969
969 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 970 if (likely(!XFS_BUF_GETERROR(bp)))
970 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 971 goto do_callbacks;
971 972
972 if (XFS_BUF_GETERROR(bp) != 0) { 973 /*
973 /* 974 * If we've already decided to shutdown the filesystem because of
974 * If we've already decided to shutdown the filesystem 975 * I/O errors, there's no point in giving this a retry.
975 * because of IO errors, there's no point in giving this 976 */
976 * a retry. 977 if (XFS_FORCED_SHUTDOWN(mp)) {
977 */ 978 XFS_BUF_SUPER_STALE(bp);
978 mp = lip->li_mountp; 979 trace_xfs_buf_item_iodone(bp, _RET_IP_);
979 if (XFS_FORCED_SHUTDOWN(mp)) { 980 goto do_callbacks;
980 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 981 }
981 XFS_BUF_SUPER_STALE(bp);
982 trace_xfs_buf_item_iodone(bp, _RET_IP_);
983 xfs_buf_do_callbacks(bp);
984 XFS_BUF_SET_FSPRIVATE(bp, NULL);
985 XFS_BUF_CLR_IODONE_FUNC(bp);
986 xfs_buf_ioend(bp, 0);
987 return;
988 }
989 982
990 if ((XFS_BUF_TARGET(bp) != lasttarg) || 983 if (XFS_BUF_TARGET(bp) != lasttarg ||
991 (time_after(jiffies, (lasttime + 5*HZ)))) { 984 time_after(jiffies, (lasttime + 5*HZ))) {
992 lasttime = jiffies; 985 lasttime = jiffies;
993 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 986 cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
994 " block 0x%llx in %s", 987 " block 0x%llx in %s",
995 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 988 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
996 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 989 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
997 } 990 }
998 lasttarg = XFS_BUF_TARGET(bp); 991 lasttarg = XFS_BUF_TARGET(bp);
999 992
1000 if (XFS_BUF_ISASYNC(bp)) { 993 /*
 1001		/* 994 * If the write was asynchronous then no one will be looking for the
 1002		 * If the write was asynchronous then no one will be 995 * error. Clear the error state and write the buffer out again.
1003 * looking for the error. Clear the error state 996 *
1004 * and write the buffer out again delayed write. 997 * During sync or umount we'll write all pending buffers again
1005 * 998 * synchronous, which will catch these errors if they keep hanging
1006 * XXXsup This is OK, so long as we catch these 999 * around.
1007 * before we start the umount; we don't want these 1000 */
1008 * DELWRI metadata bufs to be hanging around. 1001 if (XFS_BUF_ISASYNC(bp)) {
1009 */ 1002 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
1010 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1003
1011 1004 if (!XFS_BUF_ISSTALE(bp)) {
1012 if (!(XFS_BUF_ISSTALE(bp))) { 1005 XFS_BUF_DELAYWRITE(bp);
1013 XFS_BUF_DELAYWRITE(bp);
1014 XFS_BUF_DONE(bp);
1015 XFS_BUF_SET_START(bp);
1016 }
1017 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1018 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1019 xfs_buf_relse(bp);
1020 } else {
1021 /*
1022 * If the write of the buffer was not asynchronous,
1023 * then we want to make sure to return the error
1024 * to the caller of bwrite(). Because of this we
1025 * cannot clear the B_ERROR state at this point.
1026 * Instead we install a callback function that
1027 * will be called when the buffer is released, and
1028 * that routine will clear the error state and
1029 * set the buffer to be written out again after
1030 * some delay.
1031 */
1032 /* We actually overwrite the existing b-relse
1033 function at times, but we're gonna be shutting down
1034 anyway. */
1035 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1036 XFS_BUF_DONE(bp); 1006 XFS_BUF_DONE(bp);
1037 XFS_BUF_FINISH_IOWAIT(bp); 1007 XFS_BUF_SET_START(bp);
1038 } 1008 }
1009 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1010 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1011 xfs_buf_relse(bp);
1039 return; 1012 return;
1040 } 1013 }
1041 1014
1042 xfs_buf_do_callbacks(bp); 1015 /*
1043 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1016 * If the write of the buffer was synchronous, we want to make
1044 XFS_BUF_CLR_IODONE_FUNC(bp); 1017 * sure to return the error to the caller of xfs_bwrite().
1045 xfs_buf_ioend(bp, 0); 1018 */
1046}
1047
1048/*
1049 * This is a callback routine attached to a buffer which gets an error
1050 * when being written out synchronously.
1051 */
1052STATIC void
1053xfs_buf_error_relse(
1054 xfs_buf_t *bp)
1055{
1056 xfs_log_item_t *lip;
1057 xfs_mount_t *mp;
1058
1059 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1060 mp = (xfs_mount_t *)lip->li_mountp;
1061 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1062
1063 XFS_BUF_STALE(bp); 1019 XFS_BUF_STALE(bp);
1064 XFS_BUF_DONE(bp); 1020 XFS_BUF_DONE(bp);
1065 XFS_BUF_UNDELAYWRITE(bp); 1021 XFS_BUF_UNDELAYWRITE(bp);
1066 XFS_BUF_ERROR(bp,0);
1067 1022
1068 trace_xfs_buf_error_relse(bp, _RET_IP_); 1023 trace_xfs_buf_error_relse(bp, _RET_IP_);
1024 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1069 1025
1070 if (! XFS_FORCED_SHUTDOWN(mp)) 1026do_callbacks:
1071 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1072 /*
1073 * We have to unpin the pinned buffers so do the
1074 * callbacks.
1075 */
1076 xfs_buf_do_callbacks(bp); 1027 xfs_buf_do_callbacks(bp);
1077 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1028 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1078 XFS_BUF_CLR_IODONE_FUNC(bp); 1029 XFS_BUF_CLR_IODONE_FUNC(bp);
1079 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1030 xfs_buf_ioend(bp, 0);
1080 xfs_buf_relse(bp);
1081} 1031}
1082 1032
1083
1084/* 1033/*
1085 * This is the iodone() function for buffers which have been 1034 * This is the iodone() function for buffers which have been
1086 * logged. It is called when they are eventually flushed out. 1035 * logged. It is called when they are eventually flushed out.
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c78cc6a3d87c..4c7db74a05f7 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
152} 152}
153#endif /* DEBUG */ 153#endif /* DEBUG */
154 154
155
156void
157xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
158{
159 va_list ap;
160
161 va_start(ap, fmt);
162 xfs_fs_vcmn_err(level, mp, fmt, ap);
163 va_end(ap);
164}
165
166void
167xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
168{
169 va_list ap;
170
171#ifdef DEBUG
172 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
173#endif
174
175 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
176 && (level & CE_ALERT)) {
177 level &= ~CE_ALERT;
178 level |= CE_PANIC;
179 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
180 }
181 va_start(ap, fmt);
182 xfs_fs_vcmn_err(level, mp, fmt, ap);
183 va_end(ap);
184}
185
186void 155void
187xfs_error_report( 156xfs_error_report(
188 const char *tag, 157 const char *tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index f338847f80b8..10dce5475f02 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
137 (rf)))) 137 (rf))))
138 138
139extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
140extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
141#else 141#else
142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
143#define xfs_errortag_add(tag, mp) (ENOSYS) 143#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
162 162
163struct xfs_mount; 163struct xfs_mount;
164 164
165extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
166 char *fmt, va_list ap)
167 __attribute__ ((format (printf, 3, 0)));
168extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
169 char *fmt, ...)
170 __attribute__ ((format (printf, 4, 5)));
171extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
172 __attribute__ ((format (printf, 3, 4)));
173
174extern void xfs_hex_dump(void *p, int length); 165extern void xfs_hex_dump(void *p, int length);
175 166
176#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ 167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
177 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) 168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
178 169
179#define xfs_fs_mount_cmn_err(f, fmt, args...) \ 170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
180 ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) 171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
181 175
182#endif /* __XFS_ERROR_H__ */ 176#endif /* __XFS_ERROR_H__ */
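xfs_fs_mount_cmn_err() trades its expression form for the standard do { } while (0) wrapper so the multi-statement body behaves as a single statement. A stock illustration (cond and the else branch are hypothetical):

	#define LOG_IF(quiet, fmt, ...)					\
		do {							\
			if (!(quiet))					\
				cmn_err(CE_WARN, fmt, ##__VA_ARGS__);	\
		} while (0)

		if (cond)
			LOG_IF(quiet, "mount warning");	/* no dangling else */
		else
			handle_other_case();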
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 75f2ef60e579..d22e62623437 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -138,7 +138,8 @@ xfs_efi_item_unpin(
138 138
139 if (remove) { 139 if (remove) {
140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
141 xfs_trans_del_item(lip); 141 if (lip->li_desc)
142 xfs_trans_del_item(lip);
142 xfs_efi_item_free(efip); 143 xfs_efi_item_free(efip);
143 return; 144 return;
144 } 145 }
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f56d30e8040c..cec89dd5d7d2 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -612,12 +612,13 @@ out:
612 * 612 *
613 * We cannot use an inode here for this - that will push dirty state back up 613 * We cannot use an inode here for this - that will push dirty state back up
614 * into the VFS and then periodic inode flushing will prevent log covering from 614 * into the VFS and then periodic inode flushing will prevent log covering from
615 * making progress. Hence we log a field in the superblock instead. 615 * making progress. Hence we log a field in the superblock instead and use a
616 * synchronous transaction to ensure the superblock is immediately unpinned
617 * and can be written back.
616 */ 618 */
617int 619int
618xfs_fs_log_dummy( 620xfs_fs_log_dummy(
619 xfs_mount_t *mp, 621 xfs_mount_t *mp)
620 int flags)
621{ 622{
622 xfs_trans_t *tp; 623 xfs_trans_t *tp;
623 int error; 624 int error;
@@ -632,8 +633,7 @@ xfs_fs_log_dummy(
632 633
633 /* log the UUID because it is an unchanging field */ 634 /* log the UUID because it is an unchanging field */
634 xfs_mod_sb(tp, XFS_SB_UUID); 635 xfs_mod_sb(tp, XFS_SB_UUID);
635 if (flags & SYNC_WAIT) 636 xfs_trans_set_sync(tp);
636 xfs_trans_set_sync(tp);
637 return xfs_trans_commit(tp, 0); 637 return xfs_trans_commit(tp, 0);
638} 638}
639 639
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 55582bd66659..8a0f044750c3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -337,7 +337,12 @@ xfs_iomap_prealloc_size(
337 int shift = 0; 337 int shift = 0;
338 int64_t freesp; 338 int64_t freesp;
339 339
340 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size); 340 /*
341 * rounddown_pow_of_two() returns an undefined result
342 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
343 * ensure we always pass in a non-zero value.
344 */
345 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
341 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, 346 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
342 rounddown_pow_of_two(alloc_blocks)); 347 rounddown_pow_of_two(alloc_blocks));
343 348
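rounddown_pow_of_two() is built on ilog2(), whose result is undefined for 0, and XFS_B_TO_FSB() yields exactly 0 for an empty file, hence the "+ 1". Worked values:

	/*
	 * i_size = 0 blocks: 0 + 1 = 1  ->  rounddown_pow_of_two(1) = 1
	 * i_size = 3 blocks: 3 + 1 = 4  ->  rounddown_pow_of_two(4) = 4 (was 2)
	 * i_size = 5 blocks: 5 + 1 = 6  ->  rounddown_pow_of_two(6) = 4
	 * i_size = 8 blocks: 8 + 1 = 9  ->  rounddown_pow_of_two(9) = 8
	 * only sizes one below a power of two round up a step, which is
	 * harmless for a preallocation heuristic.
	 */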
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0bf24b11d0c4..ae6fef1ff563 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -377,7 +377,7 @@ xfs_log_mount(
377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
378 else { 378 else {
379 cmn_err(CE_NOTE, 379 cmn_err(CE_NOTE,
380 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
381 mp->m_fsname); 381 mp->m_fsname);
382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
383 } 383 }
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d9..3bd3291ef8d2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
 
 xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
 
-int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+void	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 				struct xfs_log_vec *log_vector,
 				xfs_lsn_t *commit_lsn, int flags);
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9dc8125d04e5..9ca59be08977 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -543,7 +543,7 @@ xlog_cil_push(
 
 	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
 	if (error)
-		goto out_abort;
+		goto out_abort_free_ticket;
 
 	/*
 	 * now that we've written the checkpoint into the log, strictly
@@ -569,8 +569,9 @@ restart:
 	}
 	spin_unlock(&cil->xc_cil_lock);
 
+	/* xfs_log_done always frees the ticket on error. */
 	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
-	if (error || commit_lsn == -1)
+	if (commit_lsn == -1)
 		goto out_abort;
 
 	/* attach all the transactions w/ busy extents to iclog */
@@ -600,6 +601,8 @@ out_free_ticket:
 	kmem_free(new_ctx);
 	return 0;
 
+out_abort_free_ticket:
+	xfs_log_ticket_put(tic);
 out_abort:
 	xlog_cil_committed(ctx, XFS_LI_ABORTED);
 	return XFS_ERROR(EIO);
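The three hunks above split the abort path in two: a failure before xfs_log_done() still owns the log ticket and must drop it, while a failure after it must not, because xfs_log_done() frees the ticket even on error. A toy model of that ownership rule with invented names (ticket_get/ticket_put/log_done are not kernel APIs):

#include <stdlib.h>

struct ticket { int unused; };

static struct ticket *ticket_get(void)
{
	return calloc(1, sizeof(struct ticket));
}

static void ticket_put(struct ticket *t)
{
	free(t);
}

/* Consumes the ticket whether or not it reports failure. */
static long log_done(struct ticket *t, int fail)
{
	ticket_put(t);
	return fail ? -1 : 0;
}

static int push(int fail_write, int fail_done)
{
	struct ticket *tic = ticket_get();

	if (!tic)
		return -1;
	if (fail_write)		/* failed before log_done() ran */
		goto out_abort_free_ticket;
	if (log_done(tic, fail_done) == -1)
		goto out_abort;	/* ticket is already gone */
	return 0;

out_abort_free_ticket:
	ticket_put(tic);	/* released exactly once */
out_abort:
	return -1;
}

int main(void)
{
	push(0, 0);	/* success path */
	push(1, 0);	/* early failure: the new label frees the ticket */
	push(0, 1);	/* late failure: log_done() already freed it */
	return 0;
}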
@@ -622,7 +625,7 @@ out_abort:
  * background commit, returns without it held once background commits are
  * allowed again.
  */
-int
+void
 xfs_log_commit_cil(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
@@ -637,11 +640,6 @@ xfs_log_commit_cil(
 	if (flags & XFS_TRANS_RELEASE_LOG_RES)
 		log_flags = XFS_LOG_REL_PERM_RESERV;
 
-	if (XLOG_FORCED_SHUTDOWN(log)) {
-		xlog_cil_free_logvec(log_vector);
-		return XFS_ERROR(EIO);
-	}
-
 	/*
 	 * do all the hard work of formatting items (including memory
 	 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -701,7 +699,6 @@ xfs_log_commit_cil(
 	 */
 	if (push)
 		xlog_cil_push(log, 0);
-	return 0;
 }
 
 /*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 204d8e5fa7fa..aa0ebb776903 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3800,7 +3800,7 @@ xlog_recover_finish(
 		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
 	} else {
 		cmn_err(CE_DEBUG,
-			"!Ending clean XFS mount for filesystem: %s\n",
+			"Ending clean XFS mount for filesystem: %s\n",
 			log->l_mp->m_fsname);
 	}
 	return 0;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f80a067a4658..76922793f64f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
 	if (blkdelta)
 		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
 out:
-	ASSERT(error = 0);
+	ASSERT(error == 0);
 	return;
 }
 
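The one-character fix above is the classic assignment-in-assertion bug: "error = 0" assigns rather than compares, so it clobbers error and evaluates to 0 unconditionally. A userspace illustration, with assert() standing in for XFS's ASSERT():

#include <assert.h>
#include <stdio.h>

int main(void)
{
	int error = 0;

	/*
	 * Buggy form, do not write: assert(error = 5) assigns 5 and always
	 * passes; assert(error = 0) assigns 0 and always fires. gcc and
	 * clang both warn about it with -Wparentheses.
	 */
	assert(error == 0);	/* fixed form: an actual comparison */
	printf("error = %d\n", error);
	return 0;
}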
@@ -1446,6 +1446,14 @@ xfs_log_item_batch_insert(
  * Bulk operation version of xfs_trans_committed that takes a log vector of
  * items to insert into the AIL. This uses bulk AIL insertion techniques to
  * minimise lock traffic.
+ *
+ * If we are called with the aborted flag set, it is because a log write during
+ * a CIL checkpoint commit has failed. In this case, all the items in the
+ * checkpoint have already gone through IOP_COMMITTED and IOP_UNLOCK, which
+ * means that checkpoint commit abort handling is treated exactly the same
+ * as an iclog write error even though we haven't started any IO yet. Hence in
+ * this case all we need to do is IOP_COMMITTED processing, followed by an
+ * IOP_UNPIN(aborted) call.
  */
 void
 xfs_trans_committed_bulk(
@@ -1472,6 +1480,16 @@ xfs_trans_committed_bulk(
 		if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
 			continue;
 
+		/*
+		 * if we are aborting the operation, no point in inserting the
+		 * object into the AIL as we are in a shutdown situation.
+		 */
+		if (aborted) {
+			ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
+			IOP_UNPIN(lip, 1);
+			continue;
+		}
+
 		if (item_lsn != commit_lsn) {
 
 			/*
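A compressed sketch of the branch the hunk above introduces, with invented stand-ins for the log item, the AIL and IOP_UNPIN(); this is not the kernel's code. On abort every item is unpinned with the abort flag and never queued for writeback:

struct log_item { int pinned; };

static void item_unpin(struct log_item *lip, int abort)
{
	lip->pinned = 0;
	(void)abort;	/* an aborted unpin also discards the change */
}

static void ail_insert(struct log_item *lip)
{
	(void)lip;	/* normal path would queue for writeback here */
}

static void committed_bulk(struct log_item **items, int n, int aborted)
{
	for (int i = 0; i < n; i++) {
		if (aborted) {		/* shutdown: skip the AIL */
			item_unpin(items[i], 1);
			continue;
		}
		ail_insert(items[i]);
		item_unpin(items[i], 0);
	}
}

int main(void)
{
	struct log_item a = { 1 }, b = { 1 };
	struct log_item *items[] = { &a, &b };

	committed_bulk(items, 2, 1);	/* aborted commit */
	return 0;
}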
@@ -1503,20 +1521,24 @@ xfs_trans_committed_bulk(
 }
 
 /*
- * Called from the trans_commit code when we notice that
- * the filesystem is in the middle of a forced shutdown.
+ * Called from the trans_commit code when we notice that the filesystem is in
+ * the middle of a forced shutdown.
+ *
+ * When we are called here, we have already pinned all the items in the
+ * transaction. However, neither IOP_COMMITTING nor IOP_UNLOCK has been called
+ * so we can simply walk the items in the transaction, unpin them with an abort
+ * flag and then free the items. Note that unpinning the items can result in
+ * them being freed immediately, so we need to use a safe list traversal method
+ * here.
  */
 STATIC void
 xfs_trans_uncommit(
 	struct xfs_trans	*tp,
 	uint			flags)
 {
-	struct xfs_log_item_desc *lidp;
+	struct xfs_log_item_desc *lidp, *n;
 
-	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-		/*
-		 * Unpin all but those that aren't dirty.
-		 */
+	list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
 		if (lidp->lid_flags & XFS_LID_DIRTY)
 			IOP_UNPIN(lidp->lid_item, 1);
 	}
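The switch to list_for_each_entry_safe() matters because IOP_UNPIN(..., 1) may free the item it is passed; the "_safe" variant caches the next pointer before the loop body runs. A minimal userspace model of the same hazard, with a hand-rolled list and invented names:

#include <stdlib.h>

struct item {
	int		dirty;
	struct item	*next;
};

/* Models IOP_UNPIN(lidp->lid_item, 1): may free the item. */
static void unpin_abort(struct item *it)
{
	free(it);
}

static void uncommit(struct item *head)
{
	struct item *it = head, *n;

	while (it) {
		n = it->next;	/* cache next before 'it' can vanish */
		if (it->dirty)
			unpin_abort(it);
		it = n;
	}
}

int main(void)
{
	struct item *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct item *it = malloc(sizeof(*it));

		if (!it)
			return 1;
		it->dirty = 1;
		it->next = head;
		head = it;
	}
	uncommit(head);
	return 0;
}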
@@ -1733,7 +1755,6 @@ xfs_trans_commit_cil(
 	int			flags)
 {
 	struct xfs_log_vec	*log_vector;
-	int			error;
 
 	/*
 	 * Get each log item to allocate a vector structure for
@@ -1744,9 +1765,7 @@ xfs_trans_commit_cil(
 	if (!log_vector)
 		return ENOMEM;
 
-	error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
-	if (error)
-		return error;
+	xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
 
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 	xfs_trans_free(tp);