aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2010-09-10 01:27:33 -0400
committerDavid S. Miller <davem@davemloft.net>2010-09-10 01:27:33 -0400
commite548833df83c3554229eff0672900bfe958b45fd (patch)
tree85efc4a76dc356593d6d394776aeb845dc580fb6 /fs
parentcbd9da7be869f676afc204e1a664163778c770bd (diff)
parent053d8f6622701f849fda2ca2c9ae596c13599ba9 (diff)
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
Conflicts: net/mac80211/main.c
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c3
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/binfmt_script.c3
-rw-r--r--fs/buffer.c69
-rw-r--r--fs/ceph/addr.c12
-rw-r--r--fs/ceph/auth_x.c15
-rw-r--r--fs/ceph/caps.c32
-rw-r--r--fs/ceph/debugfs.c4
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/inode.c5
-rw-r--r--fs/ceph/locks.c14
-rw-r--r--fs/ceph/mds_client.c101
-rw-r--r--fs/ceph/mds_client.h3
-rw-r--r--fs/ceph/osd_client.c2
-rw-r--r--fs/ceph/snap.c89
-rw-r--r--fs/ceph/super.h11
-rw-r--r--fs/ceph/xattr.c1
-rw-r--r--fs/cifs/Kconfig2
-rw-r--r--fs/cifs/asn1.c6
-rw-r--r--fs/cifs/cifs_unicode.h18
-rw-r--r--fs/cifs/cifs_uniupr.h16
-rw-r--r--fs/cifs/cifsencrypt.c475
-rw-r--r--fs/cifs/cifsglob.h25
-rw-r--r--fs/cifs/cifspdu.h7
-rw-r--r--fs/cifs/cifsproto.h12
-rw-r--r--fs/cifs/cifssmb.c13
-rw-r--r--fs/cifs/connect.c17
-rw-r--r--fs/cifs/dir.c157
-rw-r--r--fs/cifs/file.c3
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/cifs/ntlmssp.h13
-rw-r--r--fs/cifs/sess.c132
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/dcache.c71
-rw-r--r--fs/ecryptfs/crypto.c3
-rw-r--r--fs/ecryptfs/inode.c31
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/kthread.c2
-rw-r--r--fs/ecryptfs/messaging.c2
-rw-r--r--fs/ecryptfs/miscdev.c2
-rw-r--r--fs/exec.c25
-rw-r--r--fs/fat/misc.c4
-rw-r--r--fs/file_table.c124
-rw-r--r--fs/fs_struct.c32
-rw-r--r--fs/generic_acl.c1
-rw-r--r--fs/hostfs/hostfs_kern.c4
-rw-r--r--fs/internal.h7
-rw-r--r--fs/jbd/checkpoint.c4
-rw-r--r--fs/jbd/commit.c49
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd/revoke.c2
-rw-r--r--fs/jbd2/checkpoint.c4
-rw-r--r--fs/jbd2/commit.c39
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/jbd2/revoke.c2
-rw-r--r--fs/mbcache.c30
-rw-r--r--fs/namei.c119
-rw-r--r--fs/namespace.c200
-rw-r--r--fs/nfs/Kconfig1
-rw-r--r--fs/nfs/dir.c9
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/nfs4proc.c11
-rw-r--r--fs/nfs/super.c7
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/nfs4state.c26
-rw-r--r--fs/nfsd/state.h14
-rw-r--r--fs/nfsd/vfs.c14
-rw-r--r--fs/nilfs2/super.c32
-rw-r--r--fs/nilfs2/the_nilfs.c9
-rw-r--r--fs/notify/fanotify/fanotify.c3
-rw-r--r--fs/notify/fanotify/fanotify_user.c29
-rw-r--r--fs/notify/fsnotify.c68
-rw-r--r--fs/open.c4
-rw-r--r--fs/pnode.c11
-rw-r--r--fs/reiserfs/inode.c1
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/super.c18
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/ufs/balloc.c24
-rw-r--r--fs/ufs/ialloc.c18
-rw-r--r--fs/ufs/truncate.c18
-rw-r--r--fs/ufs/util.c20
-rw-r--r--fs/ufs/util.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c13
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c7
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c42
-rw-r--r--fs/xfs/xfs_bmap.c14
-rw-r--r--fs/xfs/xfs_fs.h4
-rw-r--r--fs/xfs/xfs_fsops.c31
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_ialloc.c16
-rw-r--r--fs/xfs/xfs_inode.c49
-rw-r--r--fs/xfs/xfs_log.c7
-rw-r--r--fs/xfs/xfs_log_cil.c263
-rw-r--r--fs/xfs/xfs_log_priv.h13
-rw-r--r--fs/xfs/xfs_trans.c5
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/xfs/xfs_vnodeops.c13
103 files changed, 1806 insertions, 1080 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 358563689064..6406f896bf95 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -242,7 +242,8 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
242 } 242 }
243 kfree(wnames); 243 kfree(wnames);
244fid_out: 244fid_out:
245 v9fs_fid_add(dentry, fid); 245 if (!IS_ERR(fid))
246 v9fs_fid_add(dentry, fid);
246err_out: 247err_out:
247 up_read(&v9ses->rename_sem); 248 up_read(&v9ses->rename_sem);
248 return fid; 249 return fid;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 9e60fd201716..a7528b913936 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -108,7 +108,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
108 Node *fmt; 108 Node *fmt;
109 struct file * interp_file = NULL; 109 struct file * interp_file = NULL;
110 char iname[BINPRM_BUF_SIZE]; 110 char iname[BINPRM_BUF_SIZE];
111 char *iname_addr = iname; 111 const char *iname_addr = iname;
112 int retval; 112 int retval;
113 int fd_binary = -1; 113 int fd_binary = -1;
114 114
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index aca9d55afb22..396a9884591f 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -16,7 +16,8 @@
16 16
17static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) 17static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
18{ 18{
19 char *cp, *i_name, *i_arg; 19 const char *i_arg, *i_name;
20 char *cp;
20 struct file *file; 21 struct file *file;
21 char interp[BINPRM_BUF_SIZE]; 22 char interp[BINPRM_BUF_SIZE];
22 int retval; 23 int retval;
diff --git a/fs/buffer.c b/fs/buffer.c
index 50efa339e051..3e7dca279d1c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -770,11 +770,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
770 spin_unlock(lock); 770 spin_unlock(lock);
771 /* 771 /*
772 * Ensure any pending I/O completes so that 772 * Ensure any pending I/O completes so that
773 * ll_rw_block() actually writes the current 773 * write_dirty_buffer() actually writes the
774 * contents - it is a noop if I/O is still in 774 * current contents - it is a noop if I/O is
775 * flight on potentially older contents. 775 * still in flight on potentially older
776 * contents.
776 */ 777 */
777 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); 778 write_dirty_buffer(bh, WRITE_SYNC_PLUG);
778 779
779 /* 780 /*
780 * Kick off IO for the previous mapping. Note 781 * Kick off IO for the previous mapping. Note
@@ -2912,13 +2913,6 @@ int submit_bh(int rw, struct buffer_head * bh)
2912 BUG_ON(buffer_unwritten(bh)); 2913 BUG_ON(buffer_unwritten(bh));
2913 2914
2914 /* 2915 /*
2915 * Mask in barrier bit for a write (could be either a WRITE or a
2916 * WRITE_SYNC
2917 */
2918 if (buffer_ordered(bh) && (rw & WRITE))
2919 rw |= WRITE_BARRIER;
2920
2921 /*
2922 * Only clear out a write error when rewriting 2916 * Only clear out a write error when rewriting
2923 */ 2917 */
2924 if (test_set_buffer_req(bh) && (rw & WRITE)) 2918 if (test_set_buffer_req(bh) && (rw & WRITE))
@@ -2956,22 +2950,21 @@ EXPORT_SYMBOL(submit_bh);
2956 2950
2957/** 2951/**
2958 * ll_rw_block: low-level access to block devices (DEPRECATED) 2952 * ll_rw_block: low-level access to block devices (DEPRECATED)
2959 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) 2953 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2960 * @nr: number of &struct buffer_heads in the array 2954 * @nr: number of &struct buffer_heads in the array
2961 * @bhs: array of pointers to &struct buffer_head 2955 * @bhs: array of pointers to &struct buffer_head
2962 * 2956 *
2963 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and 2957 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2964 * requests an I/O operation on them, either a %READ or a %WRITE. The third 2958 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2965 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers 2959 * %READA option is described in the documentation for generic_make_request()
2966 * are sent to disk. The fourth %READA option is described in the documentation 2960 * which ll_rw_block() calls.
2967 * for generic_make_request() which ll_rw_block() calls.
2968 * 2961 *
2969 * This function drops any buffer that it cannot get a lock on (with the 2962 * This function drops any buffer that it cannot get a lock on (with the
2970 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be 2963 * BH_Lock state bit), any buffer that appears to be clean when doing a write
2971 * clean when doing a write request, and any buffer that appears to be 2964 * request, and any buffer that appears to be up-to-date when doing read
2972 * up-to-date when doing read request. Further it marks as clean buffers that 2965 * request. Further it marks as clean buffers that are processed for
2973 * are processed for writing (the buffer cache won't assume that they are 2966 * writing (the buffer cache won't assume that they are actually clean
2974 * actually clean until the buffer gets unlocked). 2967 * until the buffer gets unlocked).
2975 * 2968 *
2976 * ll_rw_block sets b_end_io to simple completion handler that marks 2969 * ll_rw_block sets b_end_io to simple completion handler that marks
2977 * the buffer up-to-date (if approriate), unlocks the buffer and wakes 2970 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
@@ -2987,20 +2980,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2987 for (i = 0; i < nr; i++) { 2980 for (i = 0; i < nr; i++) {
2988 struct buffer_head *bh = bhs[i]; 2981 struct buffer_head *bh = bhs[i];
2989 2982
2990 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG) 2983 if (!trylock_buffer(bh))
2991 lock_buffer(bh);
2992 else if (!trylock_buffer(bh))
2993 continue; 2984 continue;
2994 2985 if (rw == WRITE) {
2995 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
2996 rw == SWRITE_SYNC_PLUG) {
2997 if (test_clear_buffer_dirty(bh)) { 2986 if (test_clear_buffer_dirty(bh)) {
2998 bh->b_end_io = end_buffer_write_sync; 2987 bh->b_end_io = end_buffer_write_sync;
2999 get_bh(bh); 2988 get_bh(bh);
3000 if (rw == SWRITE_SYNC) 2989 submit_bh(WRITE, bh);
3001 submit_bh(WRITE_SYNC, bh);
3002 else
3003 submit_bh(WRITE, bh);
3004 continue; 2990 continue;
3005 } 2991 }
3006 } else { 2992 } else {
@@ -3016,12 +3002,25 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3016} 3002}
3017EXPORT_SYMBOL(ll_rw_block); 3003EXPORT_SYMBOL(ll_rw_block);
3018 3004
3005void write_dirty_buffer(struct buffer_head *bh, int rw)
3006{
3007 lock_buffer(bh);
3008 if (!test_clear_buffer_dirty(bh)) {
3009 unlock_buffer(bh);
3010 return;
3011 }
3012 bh->b_end_io = end_buffer_write_sync;
3013 get_bh(bh);
3014 submit_bh(rw, bh);
3015}
3016EXPORT_SYMBOL(write_dirty_buffer);
3017
3019/* 3018/*
3020 * For a data-integrity writeout, we need to wait upon any in-progress I/O 3019 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3021 * and then start new I/O and then wait upon it. The caller must have a ref on 3020 * and then start new I/O and then wait upon it. The caller must have a ref on
3022 * the buffer_head. 3021 * the buffer_head.
3023 */ 3022 */
3024int sync_dirty_buffer(struct buffer_head *bh) 3023int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3025{ 3024{
3026 int ret = 0; 3025 int ret = 0;
3027 3026
@@ -3030,7 +3029,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
3030 if (test_clear_buffer_dirty(bh)) { 3029 if (test_clear_buffer_dirty(bh)) {
3031 get_bh(bh); 3030 get_bh(bh);
3032 bh->b_end_io = end_buffer_write_sync; 3031 bh->b_end_io = end_buffer_write_sync;
3033 ret = submit_bh(WRITE_SYNC, bh); 3032 ret = submit_bh(rw, bh);
3034 wait_on_buffer(bh); 3033 wait_on_buffer(bh);
3035 if (buffer_eopnotsupp(bh)) { 3034 if (buffer_eopnotsupp(bh)) {
3036 clear_buffer_eopnotsupp(bh); 3035 clear_buffer_eopnotsupp(bh);
@@ -3043,6 +3042,12 @@ int sync_dirty_buffer(struct buffer_head *bh)
3043 } 3042 }
3044 return ret; 3043 return ret;
3045} 3044}
3045EXPORT_SYMBOL(__sync_dirty_buffer);
3046
3047int sync_dirty_buffer(struct buffer_head *bh)
3048{
3049 return __sync_dirty_buffer(bh, WRITE_SYNC);
3050}
3046EXPORT_SYMBOL(sync_dirty_buffer); 3051EXPORT_SYMBOL(sync_dirty_buffer);
3047 3052
3048/* 3053/*
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5598a0d02295..4cfce1ee31fa 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page)
87 87
88 /* dirty the head */ 88 /* dirty the head */
89 spin_lock(&inode->i_lock); 89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0) 90 if (ci->i_head_snapc == NULL)
91 ci->i_head_snapc = ceph_get_snap_context(snapc); 91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head; 92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0) 93 if (ci->i_wrbuffer_ref == 0)
@@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page)
105 spin_lock_irq(&mapping->tree_lock); 105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */ 106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page)); 107 WARN_ON_ONCE(!PageUptodate(page));
108 108 account_page_dirtied(page, page->mapping);
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree, 109 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY); 110 page_index(page), PAGECACHE_TAG_DIRTY);
117 111
@@ -352,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
352 break; 346 break;
353 } 347 }
354 } 348 }
355 if (!snapc && ci->i_head_snapc) { 349 if (!snapc && ci->i_wrbuffer_ref_head) {
356 snapc = ceph_get_snap_context(ci->i_head_snapc); 350 snapc = ceph_get_snap_context(ci->i_head_snapc);
357 dout(" head snapc %p has %d dirty pages\n", 351 dout(" head snapc %p has %d dirty pages\n",
358 snapc, ci->i_wrbuffer_ref_head); 352 snapc, ci->i_wrbuffer_ref_head);
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 582e0b2caf8a..a2d002cbdec2 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -376,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
376 376
377 th = get_ticket_handler(ac, service); 377 th = get_ticket_handler(ac, service);
378 378
379 if (!th) { 379 if (IS_ERR(th)) {
380 *pneed |= service; 380 *pneed |= service;
381 continue; 381 continue;
382 } 382 }
@@ -399,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
399 struct ceph_x_ticket_handler *th = 399 struct ceph_x_ticket_handler *th =
400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); 400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
401 401
402 if (IS_ERR(th))
403 return PTR_ERR(th);
404
402 ceph_x_validate_tickets(ac, &need); 405 ceph_x_validate_tickets(ac, &need);
403 406
404 dout("build_request want %x have %x need %x\n", 407 dout("build_request want %x have %x need %x\n",
@@ -450,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
450 return -ERANGE; 453 return -ERANGE;
451 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); 454 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
452 455
453 BUG_ON(!th);
454 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); 456 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
455 if (ret) 457 if (ret)
456 return ret; 458 return ret;
@@ -505,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
505 507
506 case CEPHX_GET_PRINCIPAL_SESSION_KEY: 508 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
507 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); 509 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
508 BUG_ON(!th); 510 if (IS_ERR(th))
511 return PTR_ERR(th);
509 ret = ceph_x_proc_ticket_reply(ac, &th->session_key, 512 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
510 buf + sizeof(*head), end); 513 buf + sizeof(*head), end);
511 break; 514 break;
@@ -563,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
563 void *end = p + sizeof(au->reply_buf); 566 void *end = p + sizeof(au->reply_buf);
564 567
565 th = get_ticket_handler(ac, au->service); 568 th = get_ticket_handler(ac, au->service);
566 if (!th) 569 if (IS_ERR(th))
567 return -EIO; /* hrm! */ 570 return PTR_ERR(th);
568 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); 571 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
569 if (ret < 0) 572 if (ret < 0)
570 return ret; 573 return ret;
@@ -626,7 +629,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
626 struct ceph_x_ticket_handler *th; 629 struct ceph_x_ticket_handler *th;
627 630
628 th = get_ticket_handler(ac, peer_type); 631 th = get_ticket_handler(ac, peer_type);
629 if (th && !IS_ERR(th)) 632 if (!IS_ERR(th))
630 remove_ticket_handler(ac, th); 633 remove_ticket_handler(ac, th);
631} 634}
632 635
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7bf182b03973..a2069b6680ae 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1082,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1082 gid_t gid; 1082 gid_t gid;
1083 struct ceph_mds_session *session; 1083 struct ceph_mds_session *session;
1084 u64 xattr_version = 0; 1084 u64 xattr_version = 0;
1085 struct ceph_buffer *xattr_blob = NULL;
1085 int delayed = 0; 1086 int delayed = 0;
1086 u64 flush_tid = 0; 1087 u64 flush_tid = 0;
1087 int i; 1088 int i;
@@ -1142,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1142 for (i = 0; i < CEPH_CAP_BITS; i++) 1143 for (i = 0; i < CEPH_CAP_BITS; i++)
1143 if (flushing & (1 << i)) 1144 if (flushing & (1 << i))
1144 ci->i_cap_flush_tid[i] = flush_tid; 1145 ci->i_cap_flush_tid[i] = flush_tid;
1146
1147 follows = ci->i_head_snapc->seq;
1148 } else {
1149 follows = 0;
1145 } 1150 }
1146 1151
1147 keep = cap->implemented; 1152 keep = cap->implemented;
@@ -1155,14 +1160,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1155 mtime = inode->i_mtime; 1160 mtime = inode->i_mtime;
1156 atime = inode->i_atime; 1161 atime = inode->i_atime;
1157 time_warp_seq = ci->i_time_warp_seq; 1162 time_warp_seq = ci->i_time_warp_seq;
1158 follows = ci->i_snap_realm->cached_context->seq;
1159 uid = inode->i_uid; 1163 uid = inode->i_uid;
1160 gid = inode->i_gid; 1164 gid = inode->i_gid;
1161 mode = inode->i_mode; 1165 mode = inode->i_mode;
1162 1166
1163 if (dropping & CEPH_CAP_XATTR_EXCL) { 1167 if (flushing & CEPH_CAP_XATTR_EXCL) {
1164 __ceph_build_xattrs_blob(ci); 1168 __ceph_build_xattrs_blob(ci);
1165 xattr_version = ci->i_xattrs.version + 1; 1169 xattr_blob = ci->i_xattrs.blob;
1170 xattr_version = ci->i_xattrs.version;
1166 } 1171 }
1167 1172
1168 spin_unlock(&inode->i_lock); 1173 spin_unlock(&inode->i_lock);
@@ -1170,9 +1175,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1170 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1175 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1171 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, 1176 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1172 size, max_size, &mtime, &atime, time_warp_seq, 1177 size, max_size, &mtime, &atime, time_warp_seq,
1173 uid, gid, mode, 1178 uid, gid, mode, xattr_version, xattr_blob,
1174 xattr_version,
1175 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1176 follows); 1179 follows);
1177 if (ret < 0) { 1180 if (ret < 0) {
1178 dout("error sending cap msg, must requeue %p\n", inode); 1181 dout("error sending cap msg, must requeue %p\n", inode);
@@ -1282,7 +1285,7 @@ retry:
1282 &capsnap->mtime, &capsnap->atime, 1285 &capsnap->mtime, &capsnap->atime,
1283 capsnap->time_warp_seq, 1286 capsnap->time_warp_seq,
1284 capsnap->uid, capsnap->gid, capsnap->mode, 1287 capsnap->uid, capsnap->gid, capsnap->mode,
1285 0, NULL, 1288 capsnap->xattr_version, capsnap->xattr_blob,
1286 capsnap->follows); 1289 capsnap->follows);
1287 1290
1288 next_follows = capsnap->follows + 1; 1291 next_follows = capsnap->follows + 1;
@@ -1332,7 +1335,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1332 ceph_cap_string(was | mask)); 1335 ceph_cap_string(was | mask));
1333 ci->i_dirty_caps |= mask; 1336 ci->i_dirty_caps |= mask;
1334 if (was == 0) { 1337 if (was == 0) {
1335 dout(" inode %p now dirty\n", &ci->vfs_inode); 1338 if (!ci->i_head_snapc)
1339 ci->i_head_snapc = ceph_get_snap_context(
1340 ci->i_snap_realm->cached_context);
1341 dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
1342 ci->i_head_snapc);
1336 BUG_ON(!list_empty(&ci->i_dirty_item)); 1343 BUG_ON(!list_empty(&ci->i_dirty_item));
1337 spin_lock(&mdsc->cap_dirty_lock); 1344 spin_lock(&mdsc->cap_dirty_lock);
1338 list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1345 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -2190,7 +2197,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2190 2197
2191 if (ci->i_head_snapc == snapc) { 2198 if (ci->i_head_snapc == snapc) {
2192 ci->i_wrbuffer_ref_head -= nr; 2199 ci->i_wrbuffer_ref_head -= nr;
2193 if (!ci->i_wrbuffer_ref_head) { 2200 if (ci->i_wrbuffer_ref_head == 0 &&
2201 ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
2202 BUG_ON(!ci->i_head_snapc);
2194 ceph_put_snap_context(ci->i_head_snapc); 2203 ceph_put_snap_context(ci->i_head_snapc);
2195 ci->i_head_snapc = NULL; 2204 ci->i_head_snapc = NULL;
2196 } 2205 }
@@ -2483,6 +2492,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2483 dout(" inode %p now clean\n", inode); 2492 dout(" inode %p now clean\n", inode);
2484 BUG_ON(!list_empty(&ci->i_dirty_item)); 2493 BUG_ON(!list_empty(&ci->i_dirty_item));
2485 drop = 1; 2494 drop = 1;
2495 if (ci->i_wrbuffer_ref_head == 0) {
2496 BUG_ON(!ci->i_head_snapc);
2497 ceph_put_snap_context(ci->i_head_snapc);
2498 ci->i_head_snapc = NULL;
2499 }
2486 } else { 2500 } else {
2487 BUG_ON(list_empty(&ci->i_dirty_item)); 2501 BUG_ON(list_empty(&ci->i_dirty_item));
2488 } 2502 }
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 360c4f22718d..6fd8b20a8611 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -171,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p)
171 } else if (req->r_dentry) { 171 } else if (req->r_dentry) {
172 path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 172 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
173 &pathbase, 0); 173 &pathbase, 0);
174 if (IS_ERR(path))
175 path = NULL;
174 spin_lock(&req->r_dentry->d_lock); 176 spin_lock(&req->r_dentry->d_lock);
175 seq_printf(s, " #%llx/%.*s (%s)", 177 seq_printf(s, " #%llx/%.*s (%s)",
176 ceph_ino(req->r_dentry->d_parent->d_inode), 178 ceph_ino(req->r_dentry->d_parent->d_inode),
@@ -187,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p)
187 if (req->r_old_dentry) { 189 if (req->r_old_dentry) {
188 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, 190 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
189 &pathbase, 0); 191 &pathbase, 0);
192 if (IS_ERR(path))
193 path = NULL;
190 spin_lock(&req->r_old_dentry->d_lock); 194 spin_lock(&req->r_old_dentry->d_lock);
191 seq_printf(s, " #%llx/%.*s (%s)", 195 seq_printf(s, " #%llx/%.*s (%s)",
192 ceph_ino(req->r_old_dentry->d_parent->d_inode), 196 ceph_ino(req->r_old_dentry->d_parent->d_inode),
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 67bbb41d5526..6e4f43ff23ec 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -46,7 +46,7 @@ int ceph_init_dentry(struct dentry *dentry)
46 else 46 else
47 dentry->d_op = &ceph_snap_dentry_ops; 47 dentry->d_op = &ceph_snap_dentry_ops;
48 48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); 49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
50 if (!di) 50 if (!di)
51 return -ENOMEM; /* oh well */ 51 return -ENOMEM; /* oh well */
52 52
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5d893d31e399..e7cca414da03 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -677,6 +677,7 @@ static int fill_inode(struct inode *inode,
677 if (ci->i_files == 0 && ci->i_subdirs == 0 && 677 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
678 ceph_snap(inode) == CEPH_NOSNAP && 678 ceph_snap(inode) == CEPH_NOSNAP &&
679 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && 679 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
680 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
680 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 681 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
681 dout(" marking %p complete (empty)\n", inode); 682 dout(" marking %p complete (empty)\n", inode);
682 ci->i_ceph_flags |= CEPH_I_COMPLETE; 683 ci->i_ceph_flags |= CEPH_I_COMPLETE;
@@ -1229,11 +1230,11 @@ retry_lookup:
1229 in = dn->d_inode; 1230 in = dn->d_inode;
1230 } else { 1231 } else {
1231 in = ceph_get_inode(parent->d_sb, vino); 1232 in = ceph_get_inode(parent->d_sb, vino);
1232 if (in == NULL) { 1233 if (IS_ERR(in)) {
1233 dout("new_inode badness\n"); 1234 dout("new_inode badness\n");
1234 d_delete(dn); 1235 d_delete(dn);
1235 dput(dn); 1236 dput(dn);
1236 err = -ENOMEM; 1237 err = PTR_ERR(in);
1237 goto out; 1238 goto out;
1238 } 1239 }
1239 dn = splice_dentry(dn, in, NULL); 1240 dn = splice_dentry(dn, in, NULL);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae85af06454f..ff4e753aae92 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -82,7 +82,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
82 length = fl->fl_end - fl->fl_start + 1; 82 length = fl->fl_end - fl->fl_start + 1;
83 83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid, (u64)fl->fl_nspid, 85 (u64)fl->fl_pid,
86 (u64)(unsigned long)fl->fl_nspid,
86 lock_cmd, fl->fl_start, 87 lock_cmd, fl->fl_start,
87 length, wait); 88 length, wait);
88 if (!err) { 89 if (!err) {
@@ -92,7 +93,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
92 /* undo! This should only happen if the kernel detects 93 /* undo! This should only happen if the kernel detects
93 * local deadlock. */ 94 * local deadlock. */
94 ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 95 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
95 (u64)fl->fl_pid, (u64)fl->fl_nspid, 96 (u64)fl->fl_pid,
97 (u64)(unsigned long)fl->fl_nspid,
96 CEPH_LOCK_UNLOCK, fl->fl_start, 98 CEPH_LOCK_UNLOCK, fl->fl_start,
97 length, 0); 99 length, 0);
98 dout("got %d on posix_lock_file, undid lock", err); 100 dout("got %d on posix_lock_file, undid lock", err);
@@ -132,7 +134,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
132 length = fl->fl_end - fl->fl_start + 1; 134 length = fl->fl_end - fl->fl_start + 1;
133 135
134 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, 136 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
135 file, (u64)fl->fl_pid, (u64)fl->fl_nspid, 137 file, (u64)fl->fl_pid,
138 (u64)(unsigned long)fl->fl_nspid,
136 lock_cmd, fl->fl_start, 139 lock_cmd, fl->fl_start,
137 length, wait); 140 length, wait);
138 if (!err) { 141 if (!err) {
@@ -141,7 +144,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
141 ceph_lock_message(CEPH_LOCK_FLOCK, 144 ceph_lock_message(CEPH_LOCK_FLOCK,
142 CEPH_MDS_OP_SETFILELOCK, 145 CEPH_MDS_OP_SETFILELOCK,
143 file, (u64)fl->fl_pid, 146 file, (u64)fl->fl_pid,
144 (u64)fl->fl_nspid, 147 (u64)(unsigned long)fl->fl_nspid,
145 CEPH_LOCK_UNLOCK, fl->fl_start, 148 CEPH_LOCK_UNLOCK, fl->fl_start,
146 length, 0); 149 length, 0);
147 dout("got %d on flock_lock_file_wait, undid lock", err); 150 dout("got %d on flock_lock_file_wait, undid lock", err);
@@ -235,7 +238,8 @@ int lock_to_ceph_filelock(struct file_lock *lock,
235 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); 238 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
236 cephlock->client = cpu_to_le64(0); 239 cephlock->client = cpu_to_le64(0);
237 cephlock->pid = cpu_to_le64(lock->fl_pid); 240 cephlock->pid = cpu_to_le64(lock->fl_pid);
238 cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid); 241 cephlock->pid_namespace =
242 cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
239 243
240 switch (lock->fl_type) { 244 switch (lock->fl_type) {
241 case F_RDLCK: 245 case F_RDLCK:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a75ddbf9fe37..f091b1351786 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -560,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
560 * 560 *
561 * Called under mdsc->mutex. 561 * Called under mdsc->mutex.
562 */ 562 */
563struct dentry *get_nonsnap_parent(struct dentry *dentry)
564{
565 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
566 dentry = dentry->d_parent;
567 return dentry;
568}
569
563static int __choose_mds(struct ceph_mds_client *mdsc, 570static int __choose_mds(struct ceph_mds_client *mdsc,
564 struct ceph_mds_request *req) 571 struct ceph_mds_request *req)
565{ 572{
@@ -590,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
590 if (req->r_inode) { 597 if (req->r_inode) {
591 inode = req->r_inode; 598 inode = req->r_inode;
592 } else if (req->r_dentry) { 599 } else if (req->r_dentry) {
593 if (req->r_dentry->d_inode) { 600 struct inode *dir = req->r_dentry->d_parent->d_inode;
601
602 if (dir->i_sb != mdsc->client->sb) {
603 /* not this fs! */
604 inode = req->r_dentry->d_inode;
605 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
606 /* direct snapped/virtual snapdir requests
607 * based on parent dir inode */
608 struct dentry *dn =
609 get_nonsnap_parent(req->r_dentry->d_parent);
610 inode = dn->d_inode;
611 dout("__choose_mds using nonsnap parent %p\n", inode);
612 } else if (req->r_dentry->d_inode) {
613 /* dentry target */
594 inode = req->r_dentry->d_inode; 614 inode = req->r_dentry->d_inode;
595 } else { 615 } else {
596 inode = req->r_dentry->d_parent->d_inode; 616 /* dir + name */
617 inode = dir;
597 hash = req->r_dentry->d_name.hash; 618 hash = req->r_dentry->d_name.hash;
598 is_hash = true; 619 is_hash = true;
599 } 620 }
600 } 621 }
622
601 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, 623 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
602 (int)hash, mode); 624 (int)hash, mode);
603 if (!inode) 625 if (!inode)
@@ -2208,7 +2230,7 @@ static void handle_session(struct ceph_mds_session *session,
2208 pr_info("mds%d reconnect denied\n", session->s_mds); 2230 pr_info("mds%d reconnect denied\n", session->s_mds);
2209 remove_session_caps(session); 2231 remove_session_caps(session);
2210 wake = 1; /* for good measure */ 2232 wake = 1; /* for good measure */
2211 complete_all(&mdsc->session_close_waiters); 2233 wake_up_all(&mdsc->session_close_wq);
2212 kick_requests(mdsc, mds); 2234 kick_requests(mdsc, mds);
2213 break; 2235 break;
2214 2236
@@ -2302,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2302 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); 2324 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2303 if (IS_ERR(path)) { 2325 if (IS_ERR(path)) {
2304 err = PTR_ERR(path); 2326 err = PTR_ERR(path);
2305 BUG_ON(err); 2327 goto out_dput;
2306 } 2328 }
2307 } else { 2329 } else {
2308 path = NULL; 2330 path = NULL;
@@ -2310,7 +2332,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2310 } 2332 }
2311 err = ceph_pagelist_encode_string(pagelist, path, pathlen); 2333 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2312 if (err) 2334 if (err)
2313 goto out; 2335 goto out_free;
2314 2336
2315 spin_lock(&inode->i_lock); 2337 spin_lock(&inode->i_lock);
2316 cap->seq = 0; /* reset cap seq */ 2338 cap->seq = 0; /* reset cap seq */
@@ -2354,8 +2376,9 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2354 unlock_kernel(); 2376 unlock_kernel();
2355 } 2377 }
2356 2378
2357out: 2379out_free:
2358 kfree(path); 2380 kfree(path);
2381out_dput:
2359 dput(dentry); 2382 dput(dentry);
2360 return err; 2383 return err;
2361} 2384}
@@ -2876,7 +2899,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2876 return -ENOMEM; 2899 return -ENOMEM;
2877 2900
2878 init_completion(&mdsc->safe_umount_waiters); 2901 init_completion(&mdsc->safe_umount_waiters);
2879 init_completion(&mdsc->session_close_waiters); 2902 init_waitqueue_head(&mdsc->session_close_wq);
2880 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2903 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2881 mdsc->sessions = NULL; 2904 mdsc->sessions = NULL;
2882 mdsc->max_sessions = 0; 2905 mdsc->max_sessions = 0;
@@ -3021,6 +3044,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3021 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3044 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
3022} 3045}
3023 3046
3047/*
3048 * true if all sessions are closed, or we force unmount
3049 */
3050bool done_closing_sessions(struct ceph_mds_client *mdsc)
3051{
3052 int i, n = 0;
3053
3054 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
3055 return true;
3056
3057 mutex_lock(&mdsc->mutex);
3058 for (i = 0; i < mdsc->max_sessions; i++)
3059 if (mdsc->sessions[i])
3060 n++;
3061 mutex_unlock(&mdsc->mutex);
3062 return n == 0;
3063}
3024 3064
3025/* 3065/*
3026 * called after sb is ro. 3066 * called after sb is ro.
@@ -3029,45 +3069,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3029{ 3069{
3030 struct ceph_mds_session *session; 3070 struct ceph_mds_session *session;
3031 int i; 3071 int i;
3032 int n;
3033 struct ceph_client *client = mdsc->client; 3072 struct ceph_client *client = mdsc->client;
3034 unsigned long started, timeout = client->mount_args->mount_timeout * HZ; 3073 unsigned long timeout = client->mount_args->mount_timeout * HZ;
3035 3074
3036 dout("close_sessions\n"); 3075 dout("close_sessions\n");
3037 3076
3038 mutex_lock(&mdsc->mutex);
3039
3040 /* close sessions */ 3077 /* close sessions */
3041 started = jiffies; 3078 mutex_lock(&mdsc->mutex);
3042 while (time_before(jiffies, started + timeout)) { 3079 for (i = 0; i < mdsc->max_sessions; i++) {
3043 dout("closing sessions\n"); 3080 session = __ceph_lookup_mds_session(mdsc, i);
3044 n = 0; 3081 if (!session)
3045 for (i = 0; i < mdsc->max_sessions; i++) { 3082 continue;
3046 session = __ceph_lookup_mds_session(mdsc, i);
3047 if (!session)
3048 continue;
3049 mutex_unlock(&mdsc->mutex);
3050 mutex_lock(&session->s_mutex);
3051 __close_session(mdsc, session);
3052 mutex_unlock(&session->s_mutex);
3053 ceph_put_mds_session(session);
3054 mutex_lock(&mdsc->mutex);
3055 n++;
3056 }
3057 if (n == 0)
3058 break;
3059
3060 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
3061 break;
3062
3063 dout("waiting for sessions to close\n");
3064 mutex_unlock(&mdsc->mutex); 3083 mutex_unlock(&mdsc->mutex);
3065 wait_for_completion_timeout(&mdsc->session_close_waiters, 3084 mutex_lock(&session->s_mutex);
3066 timeout); 3085 __close_session(mdsc, session);
3086 mutex_unlock(&session->s_mutex);
3087 ceph_put_mds_session(session);
3067 mutex_lock(&mdsc->mutex); 3088 mutex_lock(&mdsc->mutex);
3068 } 3089 }
3090 mutex_unlock(&mdsc->mutex);
3091
3092 dout("waiting for sessions to close\n");
3093 wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
3094 timeout);
3069 3095
3070 /* tear down remaining sessions */ 3096 /* tear down remaining sessions */
3097 mutex_lock(&mdsc->mutex);
3071 for (i = 0; i < mdsc->max_sessions; i++) { 3098 for (i = 0; i < mdsc->max_sessions; i++) {
3072 if (mdsc->sessions[i]) { 3099 if (mdsc->sessions[i]) {
3073 session = get_session(mdsc->sessions[i]); 3100 session = get_session(mdsc->sessions[i]);
@@ -3080,9 +3107,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3080 mutex_lock(&mdsc->mutex); 3107 mutex_lock(&mdsc->mutex);
3081 } 3108 }
3082 } 3109 }
3083
3084 WARN_ON(!list_empty(&mdsc->cap_delay_list)); 3110 WARN_ON(!list_empty(&mdsc->cap_delay_list));
3085
3086 mutex_unlock(&mdsc->mutex); 3111 mutex_unlock(&mdsc->mutex);
3087 3112
3088 ceph_cleanup_empty_realms(mdsc); 3113 ceph_cleanup_empty_realms(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ab7e89f5e344..c98267ce6d2a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -234,7 +234,8 @@ struct ceph_mds_client {
234 struct mutex mutex; /* all nested structures */ 234 struct mutex mutex; /* all nested structures */
235 235
236 struct ceph_mdsmap *mdsmap; 236 struct ceph_mdsmap *mdsmap;
237 struct completion safe_umount_waiters, session_close_waiters; 237 struct completion safe_umount_waiters;
238 wait_queue_head_t session_close_wq;
238 struct list_head waiting_for_map; 239 struct list_head waiting_for_map;
239 240
240 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 241 struct ceph_mds_session **sessions; /* NULL for mds if no session */
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index bed6391e52c7..dfced1dacbcd 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -661,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc,
661 reqhead->reassert_version = req->r_reassert_version; 661 reqhead->reassert_version = req->r_reassert_version;
662 662
663 req->r_stamp = jiffies; 663 req->r_stamp = jiffies;
664 list_move_tail(&osdc->req_lru, &req->r_req_lru_item); 664 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
665 665
666 ceph_msg_get(req->r_request); /* send consumes a ref */ 666 ceph_msg_get(req->r_request); /* send consumes a ref */
667 ceph_con_send(&req->r_osd->o_con, req->r_request); 667 ceph_con_send(&req->r_osd->o_con, req->r_request);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index c0b26b6badba..4868b9dcac5a 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -435,7 +435,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435{ 435{
436 struct inode *inode = &ci->vfs_inode; 436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap; 437 struct ceph_cap_snap *capsnap;
438 int used; 438 int used, dirty;
439 439
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); 440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) { 441 if (!capsnap) {
@@ -445,6 +445,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
445 445
446 spin_lock(&inode->i_lock); 446 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci); 447 used = __ceph_caps_used(ci);
448 dirty = __ceph_caps_dirty(ci);
448 if (__ceph_have_pending_cap_snap(ci)) { 449 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps, 450 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any 451 as no new writes are allowed to start when pending, so any
@@ -452,11 +453,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
452 cap_snap. lucky us. */ 453 cap_snap. lucky us. */
453 dout("queue_cap_snap %p already pending\n", inode); 454 dout("queue_cap_snap %p already pending\n", inode);
454 kfree(capsnap); 455 kfree(capsnap);
455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { 456 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
457 (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
458 CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc; 459 struct ceph_snap_context *snapc = ci->i_head_snapc;
457 460
461 dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
462 capsnap, snapc);
458 igrab(inode); 463 igrab(inode);
459 464
460 atomic_set(&capsnap->nref, 1); 465 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci; 466 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item); 467 INIT_LIST_HEAD(&capsnap->ci_item);
@@ -464,15 +469,21 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
464 469
465 capsnap->follows = snapc->seq - 1; 470 capsnap->follows = snapc->seq - 1;
466 capsnap->issued = __ceph_caps_issued(ci, NULL); 471 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci); 472 capsnap->dirty = dirty;
468 473
469 capsnap->mode = inode->i_mode; 474 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid; 475 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid; 476 capsnap->gid = inode->i_gid;
472 477
473 /* fixme? */ 478 if (dirty & CEPH_CAP_XATTR_EXCL) {
474 capsnap->xattr_blob = NULL; 479 __ceph_build_xattrs_blob(ci);
475 capsnap->xattr_len = 0; 480 capsnap->xattr_blob =
481 ceph_buffer_get(ci->i_xattrs.blob);
482 capsnap->xattr_version = ci->i_xattrs.version;
483 } else {
484 capsnap->xattr_blob = NULL;
485 capsnap->xattr_version = 0;
486 }
476 487
477 /* dirty page count moved from _head to this cap_snap; 488 /* dirty page count moved from _head to this cap_snap;
478 all subsequent writes page dirties occur _after_ this 489 all subsequent writes page dirties occur _after_ this
@@ -480,7 +491,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 491 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0; 492 ci->i_wrbuffer_ref_head = 0;
482 capsnap->context = snapc; 493 capsnap->context = snapc;
483 ci->i_head_snapc = NULL; 494 ci->i_head_snapc =
495 ceph_get_snap_context(ci->i_snap_realm->cached_context);
496 dout(" new snapc is %p\n", ci->i_head_snapc);
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 497 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485 498
486 if (used & CEPH_CAP_FILE_WR) { 499 if (used & CEPH_CAP_FILE_WR) {
@@ -539,6 +552,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
539 return 1; /* caller may want to ceph_flush_snaps */ 552 return 1; /* caller may want to ceph_flush_snaps */
540} 553}
541 554
555/*
556 * Queue cap_snaps for snap writeback for this realm and its children.
557 * Called under snap_rwsem, so realm topology won't change.
558 */
559static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
560{
561 struct ceph_inode_info *ci;
562 struct inode *lastinode = NULL;
563 struct ceph_snap_realm *child;
564
565 dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
566
567 spin_lock(&realm->inodes_with_caps_lock);
568 list_for_each_entry(ci, &realm->inodes_with_caps,
569 i_snap_realm_item) {
570 struct inode *inode = igrab(&ci->vfs_inode);
571 if (!inode)
572 continue;
573 spin_unlock(&realm->inodes_with_caps_lock);
574 if (lastinode)
575 iput(lastinode);
576 lastinode = inode;
577 ceph_queue_cap_snap(ci);
578 spin_lock(&realm->inodes_with_caps_lock);
579 }
580 spin_unlock(&realm->inodes_with_caps_lock);
581 if (lastinode)
582 iput(lastinode);
583
584 dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
585 list_for_each_entry(child, &realm->children, child_item)
586 queue_realm_cap_snaps(child);
587
588 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
589}
542 590
543/* 591/*
544 * Parse and apply a snapblob "snap trace" from the MDS. This specifies 592 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
@@ -589,29 +637,8 @@ more:
589 * 637 *
590 * ...unless it's a snap deletion! 638 * ...unless it's a snap deletion!
591 */ 639 */
592 if (!deletion) { 640 if (!deletion)
593 struct ceph_inode_info *ci; 641 queue_realm_cap_snaps(realm);
594 struct inode *lastinode = NULL;
595
596 spin_lock(&realm->inodes_with_caps_lock);
597 list_for_each_entry(ci, &realm->inodes_with_caps,
598 i_snap_realm_item) {
599 struct inode *inode = igrab(&ci->vfs_inode);
600 if (!inode)
601 continue;
602 spin_unlock(&realm->inodes_with_caps_lock);
603 if (lastinode)
604 iput(lastinode);
605 lastinode = inode;
606 ceph_queue_cap_snap(ci);
607 spin_lock(&realm->inodes_with_caps_lock);
608 }
609 spin_unlock(&realm->inodes_with_caps_lock);
610 if (lastinode)
611 iput(lastinode);
612 dout("update_snap_trace cap_snaps queued\n");
613 }
614
615 } else { 642 } else {
616 dout("update_snap_trace %llx %p seq %lld unchanged\n", 643 dout("update_snap_trace %llx %p seq %lld unchanged\n",
617 realm->ino, realm, realm->seq); 644 realm->ino, realm, realm->seq);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2482d696f0de..c33897ae5725 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -216,8 +216,7 @@ struct ceph_cap_snap {
216 uid_t uid; 216 uid_t uid;
217 gid_t gid; 217 gid_t gid;
218 218
219 void *xattr_blob; 219 struct ceph_buffer *xattr_blob;
220 int xattr_len;
221 u64 xattr_version; 220 u64 xattr_version;
222 221
223 u64 size; 222 u64 size;
@@ -229,8 +228,11 @@ struct ceph_cap_snap {
229 228
230static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) 229static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
231{ 230{
232 if (atomic_dec_and_test(&capsnap->nref)) 231 if (atomic_dec_and_test(&capsnap->nref)) {
232 if (capsnap->xattr_blob)
233 ceph_buffer_put(capsnap->xattr_blob);
233 kfree(capsnap); 234 kfree(capsnap);
235 }
234} 236}
235 237
236/* 238/*
@@ -342,7 +344,8 @@ struct ceph_inode_info {
342 unsigned i_cap_exporting_issued; 344 unsigned i_cap_exporting_issued;
343 struct ceph_cap_reservation i_cap_migration_resv; 345 struct ceph_cap_reservation i_cap_migration_resv;
344 struct list_head i_cap_snaps; /* snapped state pending flush to mds */ 346 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
345 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */ 347 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
348 dirty|flushing caps */
346 unsigned i_snap_caps; /* cap bits for snapped files */ 349 unsigned i_snap_caps; /* cap bits for snapped files */
347 350
348 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 351 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 097a2654c00f..9578af610b73 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -485,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
485 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; 485 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
486 ci->i_xattrs.prealloc_blob = NULL; 486 ci->i_xattrs.prealloc_blob = NULL;
487 ci->i_xattrs.dirty = false; 487 ci->i_xattrs.dirty = false;
488 ci->i_xattrs.version++;
488 } 489 }
489} 490}
490 491
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 917b7d449bb2..0da1debd499d 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,8 @@ config CIFS
2 tristate "CIFS support (advanced network filesystem, SMBFS successor)" 2 tristate "CIFS support (advanced network filesystem, SMBFS successor)"
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select CRYPTO_MD5
6 select CRYPTO_ARC4
5 help 7 help
6 This is the client VFS module for the Common Internet File System 8 This is the client VFS module for the Common Internet File System
7 (CIFS) protocol which is the successor to the Server Message Block 9 (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index cfd1ce34e0bc..21f0fbd86989 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -597,13 +597,13 @@ decode_negTokenInit(unsigned char *security_blob, int length,
597 if (compare_oid(oid, oidlen, MSKRB5_OID, 597 if (compare_oid(oid, oidlen, MSKRB5_OID,
598 MSKRB5_OID_LEN)) 598 MSKRB5_OID_LEN))
599 server->sec_mskerberos = true; 599 server->sec_mskerberos = true;
600 else if (compare_oid(oid, oidlen, KRB5U2U_OID, 600 if (compare_oid(oid, oidlen, KRB5U2U_OID,
601 KRB5U2U_OID_LEN)) 601 KRB5U2U_OID_LEN))
602 server->sec_kerberosu2u = true; 602 server->sec_kerberosu2u = true;
603 else if (compare_oid(oid, oidlen, KRB5_OID, 603 if (compare_oid(oid, oidlen, KRB5_OID,
604 KRB5_OID_LEN)) 604 KRB5_OID_LEN))
605 server->sec_kerberos = true; 605 server->sec_kerberos = true;
606 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 606 if (compare_oid(oid, oidlen, NTLMSSP_OID,
607 NTLMSSP_OID_LEN)) 607 NTLMSSP_OID_LEN))
608 server->sec_ntlmssp = true; 608 server->sec_ntlmssp = true;
609 609
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 650638275a6f..7fe6b52df507 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -30,6 +30,8 @@
30 * This is a compressed table of upper and lower case conversion. 30 * This is a compressed table of upper and lower case conversion.
31 * 31 *
32 */ 32 */
33#ifndef _CIFS_UNICODE_H
34#define _CIFS_UNICODE_H
33 35
34#include <asm/byteorder.h> 36#include <asm/byteorder.h>
35#include <linux/types.h> 37#include <linux/types.h>
@@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[];
67#endif /* UNIUPR_NOUPPER */ 69#endif /* UNIUPR_NOUPPER */
68 70
69#ifndef UNIUPR_NOLOWER 71#ifndef UNIUPR_NOLOWER
70extern signed char UniLowerTable[512]; 72extern signed char CifsUniLowerTable[512];
71extern struct UniCaseRange UniLowerRange[]; 73extern const struct UniCaseRange CifsUniLowerRange[];
72#endif /* UNIUPR_NOLOWER */ 74#endif /* UNIUPR_NOLOWER */
73 75
74#ifdef __KERNEL__ 76#ifdef __KERNEL__
@@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin)
337 * UniTolower: Convert a unicode character to lower case 339 * UniTolower: Convert a unicode character to lower case
338 */ 340 */
339static inline wchar_t 341static inline wchar_t
340UniTolower(wchar_t uc) 342UniTolower(register wchar_t uc)
341{ 343{
342 register struct UniCaseRange *rp; 344 register const struct UniCaseRange *rp;
343 345
344 if (uc < sizeof(UniLowerTable)) { 346 if (uc < sizeof(CifsUniLowerTable)) {
345 /* Latin characters */ 347 /* Latin characters */
346 return uc + UniLowerTable[uc]; /* Use base tables */ 348 return uc + CifsUniLowerTable[uc]; /* Use base tables */
347 } else { 349 } else {
348 rp = UniLowerRange; /* Use range tables */ 350 rp = CifsUniLowerRange; /* Use range tables */
349 while (rp->start) { 351 while (rp->start) {
350 if (uc < rp->start) /* Before start of range */ 352 if (uc < rp->start) /* Before start of range */
351 return uc; /* Uppercase = input */ 353 return uc; /* Uppercase = input */
@@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin)
374} 376}
375 377
376#endif 378#endif
379
380#endif /* _CIFS_UNICODE_H */
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 18a9d978e519..0ac7c5a8633a 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = {
140/* 140/*
141 * Latin lower case 141 * Latin lower case
142 */ 142 */
143static signed char CifsUniLowerTable[512] = { 143signed char CifsUniLowerTable[512] = {
144 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ 144 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ 145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ 146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
@@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = {
242/* 242/*
243 * Lower Case Range 243 * Lower Case Range
244 */ 244 */
245static const struct UniCaseRange CifsUniLowerRange[] = { 245const struct UniCaseRange CifsUniLowerRange[] = {
246 0x0380, 0x03ab, UniCaseRangeL0380, 246 {0x0380, 0x03ab, UniCaseRangeL0380},
247 0x0400, 0x042f, UniCaseRangeL0400, 247 {0x0400, 0x042f, UniCaseRangeL0400},
248 0x0490, 0x04cb, UniCaseRangeL0490, 248 {0x0490, 0x04cb, UniCaseRangeL0490},
249 0x1e00, 0x1ff7, UniCaseRangeL1e00, 249 {0x1e00, 0x1ff7, UniCaseRangeL1e00},
250 0xff20, 0xff3a, UniCaseRangeLff20, 250 {0xff20, 0xff3a, UniCaseRangeLff20},
251 0, 0, 0 251 {0}
252}; 252};
253#endif 253#endif
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 847628dfdc44..709f2296bdb4 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
27#include "md5.h" 27#include "md5.h"
28#include "cifs_unicode.h" 28#include "cifs_unicode.h"
29#include "cifsproto.h" 29#include "cifsproto.h"
30#include "ntlmssp.h"
30#include <linux/ctype.h> 31#include <linux/ctype.h>
31#include <linux/random.h> 32#include <linux/random.h>
32 33
@@ -42,21 +43,43 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
42 unsigned char *p24); 43 unsigned char *p24);
43 44
44static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 45static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
45 const struct mac_key *key, char *signature) 46 struct TCP_Server_Info *server, char *signature)
46{ 47{
47 struct MD5Context context; 48 int rc;
48 49
49 if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) 50 if (cifs_pdu == NULL || server == NULL || signature == NULL)
50 return -EINVAL; 51 return -EINVAL;
51 52
52 cifs_MD5_init(&context); 53 if (!server->ntlmssp.sdescmd5) {
53 cifs_MD5_update(&context, (char *)&key->data, key->len); 54 cERROR(1,
54 cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); 55 "cifs_calculate_signature: can't generate signature\n");
56 return -1;
57 }
55 58
56 cifs_MD5_final(signature, &context); 59 rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash);
57 return 0; 60 if (rc) {
61 cERROR(1, "cifs_calculate_signature: oould not init md5\n");
62 return rc;
63 }
64
65 if (server->secType == RawNTLMSSP)
66 crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
67 server->session_key.data.ntlmv2.key,
68 CIFS_NTLMV2_SESSKEY_SIZE);
69 else
70 crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
71 (char *)&server->session_key.data,
72 server->session_key.len);
73
74 crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
75 cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
76
77 rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature);
78
79 return rc;
58} 80}
59 81
82
60int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, 83int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
61 __u32 *pexpected_response_sequence_number) 84 __u32 *pexpected_response_sequence_number)
62{ 85{
@@ -78,8 +101,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
78 server->sequence_number++; 101 server->sequence_number++;
79 spin_unlock(&GlobalMid_Lock); 102 spin_unlock(&GlobalMid_Lock);
80 103
81 rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key, 104 rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
82 smb_signature);
83 if (rc) 105 if (rc)
84 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 106 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
85 else 107 else
@@ -89,21 +111,39 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
89} 111}
90 112
91static int cifs_calc_signature2(const struct kvec *iov, int n_vec, 113static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
92 const struct mac_key *key, char *signature) 114 struct TCP_Server_Info *server, char *signature)
93{ 115{
94 struct MD5Context context;
95 int i; 116 int i;
117 int rc;
96 118
97 if ((iov == NULL) || (signature == NULL) || (key == NULL)) 119 if (iov == NULL || server == NULL || signature == NULL)
98 return -EINVAL; 120 return -EINVAL;
99 121
100 cifs_MD5_init(&context); 122 if (!server->ntlmssp.sdescmd5) {
101 cifs_MD5_update(&context, (char *)&key->data, key->len); 123 cERROR(1, "cifs_calc_signature2: can't generate signature\n");
124 return -1;
125 }
126
127 rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash);
128 if (rc) {
129 cERROR(1, "cifs_calc_signature2: oould not init md5\n");
130 return rc;
131 }
132
133 if (server->secType == RawNTLMSSP)
134 crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
135 server->session_key.data.ntlmv2.key,
136 CIFS_NTLMV2_SESSKEY_SIZE);
137 else
138 crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
139 (char *)&server->session_key.data,
140 server->session_key.len);
141
102 for (i = 0; i < n_vec; i++) { 142 for (i = 0; i < n_vec; i++) {
103 if (iov[i].iov_len == 0) 143 if (iov[i].iov_len == 0)
104 continue; 144 continue;
105 if (iov[i].iov_base == NULL) { 145 if (iov[i].iov_base == NULL) {
106 cERROR(1, "null iovec entry"); 146 cERROR(1, "cifs_calc_signature2: null iovec entry");
107 return -EIO; 147 return -EIO;
108 } 148 }
109 /* The first entry includes a length field (which does not get 149 /* The first entry includes a length field (which does not get
@@ -111,18 +151,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
111 if (i == 0) { 151 if (i == 0) {
112 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ 152 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
113 break; /* nothing to sign or corrupt header */ 153 break; /* nothing to sign or corrupt header */
114 cifs_MD5_update(&context, iov[0].iov_base+4, 154 crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
115 iov[0].iov_len-4); 155 iov[i].iov_base + 4, iov[i].iov_len - 4);
116 } else 156 } else
117 cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len); 157 crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
158 iov[i].iov_base, iov[i].iov_len);
118 } 159 }
119 160
120 cifs_MD5_final(signature, &context); 161 rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature);
121 162
122 return 0; 163 return rc;
123} 164}
124 165
125
126int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, 166int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
127 __u32 *pexpected_response_sequence_number) 167 __u32 *pexpected_response_sequence_number)
128{ 168{
@@ -145,8 +185,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
145 server->sequence_number++; 185 server->sequence_number++;
146 spin_unlock(&GlobalMid_Lock); 186 spin_unlock(&GlobalMid_Lock);
147 187
148 rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key, 188 rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
149 smb_signature);
150 if (rc) 189 if (rc)
151 memset(cifs_pdu->Signature.SecuritySignature, 0, 8); 190 memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
152 else 191 else
@@ -156,14 +195,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
156} 195}
157 196
158int cifs_verify_signature(struct smb_hdr *cifs_pdu, 197int cifs_verify_signature(struct smb_hdr *cifs_pdu,
159 const struct mac_key *mac_key, 198 struct TCP_Server_Info *server,
160 __u32 expected_sequence_number) 199 __u32 expected_sequence_number)
161{ 200{
162 unsigned int rc; 201 int rc;
163 char server_response_sig[8]; 202 char server_response_sig[8];
164 char what_we_think_sig_should_be[20]; 203 char what_we_think_sig_should_be[20];
165 204
166 if ((cifs_pdu == NULL) || (mac_key == NULL)) 205 if (cifs_pdu == NULL || server == NULL)
167 return -EINVAL; 206 return -EINVAL;
168 207
169 if (cifs_pdu->Command == SMB_COM_NEGOTIATE) 208 if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +231,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
192 cpu_to_le32(expected_sequence_number); 231 cpu_to_le32(expected_sequence_number);
193 cifs_pdu->Signature.Sequence.Reserved = 0; 232 cifs_pdu->Signature.Sequence.Reserved = 0;
194 233
195 rc = cifs_calculate_signature(cifs_pdu, mac_key, 234 rc = cifs_calculate_signature(cifs_pdu, server,
196 what_we_think_sig_should_be); 235 what_we_think_sig_should_be);
197 236
198 if (rc) 237 if (rc)
@@ -209,7 +248,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
209} 248}
210 249
211/* We fill in key by putting in 40 byte array which was allocated by caller */ 250/* We fill in key by putting in 40 byte array which was allocated by caller */
212int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 251int cifs_calculate_session_key(struct session_key *key, const char *rn,
213 const char *password) 252 const char *password)
214{ 253{
215 char temp_key[16]; 254 char temp_key[16];
@@ -223,63 +262,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
223 return 0; 262 return 0;
224} 263}
225 264
226int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
227 const struct nls_table *nls_info)
228{
229 char temp_hash[16];
230 struct HMACMD5Context ctx;
231 char *ucase_buf;
232 __le16 *unicode_buf;
233 unsigned int i, user_name_len, dom_name_len;
234
235 if (ses == NULL)
236 return -EINVAL;
237
238 E_md4hash(ses->password, temp_hash);
239
240 hmac_md5_init_limK_to_64(temp_hash, 16, &ctx);
241 user_name_len = strlen(ses->userName);
242 if (user_name_len > MAX_USERNAME_SIZE)
243 return -EINVAL;
244 if (ses->domainName == NULL)
245 return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
246 dom_name_len = strlen(ses->domainName);
247 if (dom_name_len > MAX_USERNAME_SIZE)
248 return -EINVAL;
249
250 ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL);
251 if (ucase_buf == NULL)
252 return -ENOMEM;
253 unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL);
254 if (unicode_buf == NULL) {
255 kfree(ucase_buf);
256 return -ENOMEM;
257 }
258
259 for (i = 0; i < user_name_len; i++)
260 ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]];
261 ucase_buf[i] = 0;
262 user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
263 MAX_USERNAME_SIZE*2, nls_info);
264 unicode_buf[user_name_len] = 0;
265 user_name_len++;
266
267 for (i = 0; i < dom_name_len; i++)
268 ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]];
269 ucase_buf[i] = 0;
270 dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf,
271 MAX_USERNAME_SIZE*2, nls_info);
272
273 unicode_buf[user_name_len + dom_name_len] = 0;
274 hmac_md5_update((const unsigned char *) unicode_buf,
275 (user_name_len+dom_name_len)*2, &ctx);
276
277 hmac_md5_final(ses->server->ntlmv2_hash, &ctx);
278 kfree(ucase_buf);
279 kfree(unicode_buf);
280 return 0;
281}
282
283#ifdef CONFIG_CIFS_WEAK_PW_HASH 265#ifdef CONFIG_CIFS_WEAK_PW_HASH
284void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, 266void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
285 char *lnm_session_key) 267 char *lnm_session_key)
@@ -324,38 +306,52 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
324{ 306{
325 int rc = 0; 307 int rc = 0;
326 int len; 308 int len;
327 char nt_hash[16]; 309 char nt_hash[CIFS_NTHASH_SIZE];
328 struct HMACMD5Context *pctxt;
329 wchar_t *user; 310 wchar_t *user;
330 wchar_t *domain; 311 wchar_t *domain;
312 wchar_t *server;
331 313
332 pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL); 314 if (!ses->server->ntlmssp.sdeschmacmd5) {
333 315 cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
334 if (pctxt == NULL) 316 return -1;
335 return -ENOMEM; 317 }
336 318
337 /* calculate md4 hash of password */ 319 /* calculate md4 hash of password */
338 E_md4hash(ses->password, nt_hash); 320 E_md4hash(ses->password, nt_hash);
339 321
340 /* convert Domainname to unicode and uppercase */ 322 crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, nt_hash,
341 hmac_md5_init_limK_to_64(nt_hash, 16, pctxt); 323 CIFS_NTHASH_SIZE);
324
325 rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash);
326 if (rc) {
327 cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
328 return rc;
329 }
342 330
343 /* convert ses->userName to unicode and uppercase */ 331 /* convert ses->userName to unicode and uppercase */
344 len = strlen(ses->userName); 332 len = strlen(ses->userName);
345 user = kmalloc(2 + (len * 2), GFP_KERNEL); 333 user = kmalloc(2 + (len * 2), GFP_KERNEL);
346 if (user == NULL) 334 if (user == NULL) {
335 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
336 rc = -ENOMEM;
347 goto calc_exit_2; 337 goto calc_exit_2;
338 }
348 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); 339 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
349 UniStrupr(user); 340 UniStrupr(user);
350 hmac_md5_update((char *)user, 2*len, pctxt); 341
342 crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
343 (char *)user, 2 * len);
351 344
352 /* convert ses->domainName to unicode and uppercase */ 345 /* convert ses->domainName to unicode and uppercase */
353 if (ses->domainName) { 346 if (ses->domainName) {
354 len = strlen(ses->domainName); 347 len = strlen(ses->domainName);
355 348
356 domain = kmalloc(2 + (len * 2), GFP_KERNEL); 349 domain = kmalloc(2 + (len * 2), GFP_KERNEL);
357 if (domain == NULL) 350 if (domain == NULL) {
351 cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
352 rc = -ENOMEM;
358 goto calc_exit_1; 353 goto calc_exit_1;
354 }
359 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, 355 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
360 nls_cp); 356 nls_cp);
361 /* the following line was removed since it didn't work well 357 /* the following line was removed since it didn't work well
@@ -363,65 +359,292 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
363 Maybe converting the domain name earlier makes sense */ 359 Maybe converting the domain name earlier makes sense */
364 /* UniStrupr(domain); */ 360 /* UniStrupr(domain); */
365 361
366 hmac_md5_update((char *)domain, 2*len, pctxt); 362 crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
363 (char *)domain, 2 * len);
367 364
368 kfree(domain); 365 kfree(domain);
366 } else if (ses->serverName) {
367 len = strlen(ses->serverName);
368
369 server = kmalloc(2 + (len * 2), GFP_KERNEL);
370 if (server == NULL) {
371 cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
372 rc = -ENOMEM;
373 goto calc_exit_1;
374 }
375 len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
376 nls_cp);
377 /* the following line was removed since it didn't work well
378 with lower cased domain name that passed as an option.
379 Maybe converting the domain name earlier makes sense */
380 /* UniStrupr(domain); */
381
382 crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
383 (char *)server, 2 * len);
384
385 kfree(server);
369 } 386 }
387
388 rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash,
389 ses->server->ntlmv2_hash);
390
370calc_exit_1: 391calc_exit_1:
371 kfree(user); 392 kfree(user);
372calc_exit_2: 393calc_exit_2:
373 /* BB FIXME what about bytes 24 through 40 of the signing key? 394 /* BB FIXME what about bytes 24 through 40 of the signing key?
374 compare with the NTLM example */ 395 compare with the NTLM example */
375 hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
376 396
377 kfree(pctxt);
378 return rc; 397 return rc;
379} 398}
380 399
381void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, 400static int
382 const struct nls_table *nls_cp) 401find_domain_name(struct cifsSesInfo *ses)
402{
403 int rc = 0;
404 unsigned int attrsize;
405 unsigned int type;
406 unsigned char *blobptr;
407 struct ntlmssp2_name *attrptr;
408
409 if (ses->server->tiblob) {
410 blobptr = ses->server->tiblob;
411 attrptr = (struct ntlmssp2_name *) blobptr;
412
413 while ((type = attrptr->type) != 0) {
414 blobptr += 2; /* advance attr type */
415 attrsize = attrptr->length;
416 blobptr += 2; /* advance attr size */
417 if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
418 if (!ses->domainName) {
419 ses->domainName =
420 kmalloc(attrptr->length + 1,
421 GFP_KERNEL);
422 if (!ses->domainName)
423 return -ENOMEM;
424 cifs_from_ucs2(ses->domainName,
425 (__le16 *)blobptr,
426 attrptr->length,
427 attrptr->length,
428 load_nls_default(), false);
429 }
430 }
431 blobptr += attrsize; /* advance attr value */
432 attrptr = (struct ntlmssp2_name *) blobptr;
433 }
434 } else {
435 ses->server->tilen = 2 * sizeof(struct ntlmssp2_name);
436 ses->server->tiblob = kmalloc(ses->server->tilen, GFP_KERNEL);
437 if (!ses->server->tiblob) {
438 ses->server->tilen = 0;
439 cERROR(1, "Challenge target info allocation failure");
440 return -ENOMEM;
441 }
442 memset(ses->server->tiblob, 0x0, ses->server->tilen);
443 attrptr = (struct ntlmssp2_name *) ses->server->tiblob;
444 attrptr->type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
445 }
446
447 return rc;
448}
449
450static int
451CalcNTLMv2_response(const struct TCP_Server_Info *server,
452 char *v2_session_response)
383{ 453{
384 int rc; 454 int rc;
455
456 if (!server->ntlmssp.sdeschmacmd5) {
457 cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
458 return -1;
459 }
460
461 crypto_shash_setkey(server->ntlmssp.hmacmd5, server->ntlmv2_hash,
462 CIFS_HMAC_MD5_HASH_SIZE);
463
464 rc = crypto_shash_init(&server->ntlmssp.sdeschmacmd5->shash);
465 if (rc) {
466 cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
467 return rc;
468 }
469
470 memcpy(v2_session_response + CIFS_SERVER_CHALLENGE_SIZE,
471 server->cryptKey, CIFS_SERVER_CHALLENGE_SIZE);
472 crypto_shash_update(&server->ntlmssp.sdeschmacmd5->shash,
473 v2_session_response + CIFS_SERVER_CHALLENGE_SIZE,
474 sizeof(struct ntlmv2_resp) - CIFS_SERVER_CHALLENGE_SIZE);
475
476 if (server->tilen)
477 crypto_shash_update(&server->ntlmssp.sdeschmacmd5->shash,
478 server->tiblob, server->tilen);
479
480 rc = crypto_shash_final(&server->ntlmssp.sdeschmacmd5->shash,
481 v2_session_response);
482
483 return rc;
484}
485
486int
487setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
488 const struct nls_table *nls_cp)
489{
490 int rc = 0;
385 struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf; 491 struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf;
386 struct HMACMD5Context context;
387 492
388 buf->blob_signature = cpu_to_le32(0x00000101); 493 buf->blob_signature = cpu_to_le32(0x00000101);
389 buf->reserved = 0; 494 buf->reserved = 0;
390 buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); 495 buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
391 get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); 496 get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
392 buf->reserved2 = 0; 497 buf->reserved2 = 0;
393 buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE); 498
394 buf->names[0].length = 0; 499 if (!ses->domainName) {
395 buf->names[1].type = 0; 500 rc = find_domain_name(ses);
396 buf->names[1].length = 0; 501 if (rc) {
502 cERROR(1, "could not get domain/server name rc %d", rc);
503 return rc;
504 }
505 }
397 506
398 /* calculate buf->ntlmv2_hash */ 507 /* calculate buf->ntlmv2_hash */
399 rc = calc_ntlmv2_hash(ses, nls_cp); 508 rc = calc_ntlmv2_hash(ses, nls_cp);
400 if (rc) 509 if (rc) {
401 cERROR(1, "could not get v2 hash rc %d", rc); 510 cERROR(1, "could not get v2 hash rc %d", rc);
402 CalcNTLMv2_response(ses, resp_buf); 511 return rc;
512 }
513 rc = CalcNTLMv2_response(ses->server, resp_buf);
514 if (rc) {
515 cERROR(1, "could not get v2 hash rc %d", rc);
516 return rc;
517 }
403 518
404 /* now calculate the MAC key for NTLMv2 */ 519 if (!ses->server->ntlmssp.sdeschmacmd5) {
405 hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); 520 cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
406 hmac_md5_update(resp_buf, 16, &context); 521 return -1;
407 hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context); 522 }
408 523
409 memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf, 524 crypto_shash_setkey(ses->server->ntlmssp.hmacmd5,
410 sizeof(struct ntlmv2_resp)); 525 ses->server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
411 ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp); 526
527 rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash);
528 if (rc) {
529 cERROR(1, "setup_ntlmv2_rsp: could not init hmacmd5\n");
530 return rc;
531 }
532
533 crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
534 resp_buf, CIFS_HMAC_MD5_HASH_SIZE);
535
536 rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash,
537 ses->server->session_key.data.ntlmv2.key);
538
539 memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf,
540 sizeof(struct ntlmv2_resp));
541 ses->server->session_key.len = 16 + sizeof(struct ntlmv2_resp);
542
543 return rc;
412} 544}
413 545
414void CalcNTLMv2_response(const struct cifsSesInfo *ses, 546int
415 char *v2_session_response) 547calc_seckey(struct TCP_Server_Info *server)
416{ 548{
417 struct HMACMD5Context context; 549 int rc;
418 /* rest of v2 struct already generated */ 550 unsigned char sec_key[CIFS_NTLMV2_SESSKEY_SIZE];
419 memcpy(v2_session_response + 8, ses->server->cryptKey, 8); 551 struct crypto_blkcipher *tfm_arc4;
420 hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); 552 struct scatterlist sgin, sgout;
553 struct blkcipher_desc desc;
554
555 get_random_bytes(sec_key, CIFS_NTLMV2_SESSKEY_SIZE);
556
557 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)",
558 0, CRYPTO_ALG_ASYNC);
559 if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
560 cERROR(1, "could not allocate " "master crypto API arc4\n");
561 return 1;
562 }
563
564 desc.tfm = tfm_arc4;
565
566 crypto_blkcipher_setkey(tfm_arc4,
567 server->session_key.data.ntlmv2.key, CIFS_CPHTXT_SIZE);
568 sg_init_one(&sgin, sec_key, CIFS_CPHTXT_SIZE);
569 sg_init_one(&sgout, server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
570 rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
421 571
422 hmac_md5_update(v2_session_response+8, 572 if (!rc)
423 sizeof(struct ntlmv2_resp) - 8, &context); 573 memcpy(server->session_key.data.ntlmv2.key,
574 sec_key, CIFS_NTLMV2_SESSKEY_SIZE);
575
576 crypto_free_blkcipher(tfm_arc4);
577
578 return 0;
579}
424 580
425 hmac_md5_final(v2_session_response, &context); 581void
426/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */ 582cifs_crypto_shash_release(struct TCP_Server_Info *server)
583{
584 if (server->ntlmssp.md5)
585 crypto_free_shash(server->ntlmssp.md5);
586
587 if (server->ntlmssp.hmacmd5)
588 crypto_free_shash(server->ntlmssp.hmacmd5);
589
590 kfree(server->ntlmssp.sdeschmacmd5);
591
592 kfree(server->ntlmssp.sdescmd5);
593}
594
595int
596cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
597{
598 int rc;
599 unsigned int size;
600
601 server->ntlmssp.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
602 if (!server->ntlmssp.hmacmd5 ||
603 IS_ERR(server->ntlmssp.hmacmd5)) {
604 cERROR(1, "could not allocate crypto hmacmd5\n");
605 return 1;
606 }
607
608 server->ntlmssp.md5 = crypto_alloc_shash("md5", 0, 0);
609 if (!server->ntlmssp.md5 || IS_ERR(server->ntlmssp.md5)) {
610 cERROR(1, "could not allocate crypto md5\n");
611 rc = 1;
612 goto cifs_crypto_shash_allocate_ret1;
613 }
614
615 size = sizeof(struct shash_desc) +
616 crypto_shash_descsize(server->ntlmssp.hmacmd5);
617 server->ntlmssp.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
618 if (!server->ntlmssp.sdeschmacmd5) {
619 cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
620 rc = -ENOMEM;
621 goto cifs_crypto_shash_allocate_ret2;
622 }
623 server->ntlmssp.sdeschmacmd5->shash.tfm = server->ntlmssp.hmacmd5;
624 server->ntlmssp.sdeschmacmd5->shash.flags = 0x0;
625
626
627 size = sizeof(struct shash_desc) +
628 crypto_shash_descsize(server->ntlmssp.md5);
629 server->ntlmssp.sdescmd5 = kmalloc(size, GFP_KERNEL);
630 if (!server->ntlmssp.sdescmd5) {
631 cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
632 rc = -ENOMEM;
633 goto cifs_crypto_shash_allocate_ret3;
634 }
635 server->ntlmssp.sdescmd5->shash.tfm = server->ntlmssp.md5;
636 server->ntlmssp.sdescmd5->shash.flags = 0x0;
637
638 return 0;
639
640cifs_crypto_shash_allocate_ret3:
641 kfree(server->ntlmssp.sdeschmacmd5);
642
643cifs_crypto_shash_allocate_ret2:
644 crypto_free_shash(server->ntlmssp.md5);
645
646cifs_crypto_shash_allocate_ret1:
647 crypto_free_shash(server->ntlmssp.hmacmd5);
648
649 return rc;
427} 650}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0cdfb8c32ac6..c9d0cfc086eb 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -25,6 +25,9 @@
25#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include "cifs_fs_sb.h" 26#include "cifs_fs_sb.h"
27#include "cifsacl.h" 27#include "cifsacl.h"
28#include <crypto/internal/hash.h>
29#include <linux/scatterlist.h>
30
28/* 31/*
29 * The sizes of various internal tables and strings 32 * The sizes of various internal tables and strings
30 */ 33 */
@@ -97,7 +100,7 @@ enum protocolEnum {
97 /* Netbios frames protocol not supported at this time */ 100 /* Netbios frames protocol not supported at this time */
98}; 101};
99 102
100struct mac_key { 103struct session_key {
101 unsigned int len; 104 unsigned int len;
102 union { 105 union {
103 char ntlm[CIFS_SESS_KEY_SIZE + 16]; 106 char ntlm[CIFS_SESS_KEY_SIZE + 16];
@@ -120,6 +123,21 @@ struct cifs_cred {
120 struct cifs_ace *aces; 123 struct cifs_ace *aces;
121}; 124};
122 125
126struct sdesc {
127 struct shash_desc shash;
128 char ctx[];
129};
130
131struct ntlmssp_auth {
132 __u32 client_flags;
133 __u32 server_flags;
134 unsigned char ciphertext[CIFS_CPHTXT_SIZE];
135 struct crypto_shash *hmacmd5;
136 struct crypto_shash *md5;
137 struct sdesc *sdeschmacmd5;
138 struct sdesc *sdescmd5;
139};
140
123/* 141/*
124 ***************************************************************** 142 *****************************************************************
125 * Except the CIFS PDUs themselves all the 143 * Except the CIFS PDUs themselves all the
@@ -182,11 +200,14 @@ struct TCP_Server_Info {
182 /* 16th byte of RFC1001 workstation name is always null */ 200 /* 16th byte of RFC1001 workstation name is always null */
183 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 201 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
184 __u32 sequence_number; /* needed for CIFS PDU signature */ 202 __u32 sequence_number; /* needed for CIFS PDU signature */
185 struct mac_key mac_signing_key; 203 struct session_key session_key;
186 char ntlmv2_hash[16]; 204 char ntlmv2_hash[16];
187 unsigned long lstrp; /* when we got last response from this server */ 205 unsigned long lstrp; /* when we got last response from this server */
188 u16 dialect; /* dialect index that server chose */ 206 u16 dialect; /* dialect index that server chose */
189 /* extended security flavors that server supports */ 207 /* extended security flavors that server supports */
208 unsigned int tilen; /* length of the target info blob */
209 unsigned char *tiblob; /* target info blob in challenge response */
210 struct ntlmssp_auth ntlmssp; /* various keys, ciphers, flags */
190 bool sec_kerberos; /* supports plain Kerberos */ 211 bool sec_kerberos; /* supports plain Kerberos */
191 bool sec_mskerberos; /* supports legacy MS Kerberos */ 212 bool sec_mskerberos; /* supports legacy MS Kerberos */
192 bool sec_kerberosu2u; /* supports U2U Kerberos */ 213 bool sec_kerberosu2u; /* supports U2U Kerberos */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db11..320e0fd0ba7b 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -134,6 +134,12 @@
134 * Size of the session key (crypto key encrypted with the password 134 * Size of the session key (crypto key encrypted with the password
135 */ 135 */
136#define CIFS_SESS_KEY_SIZE (24) 136#define CIFS_SESS_KEY_SIZE (24)
137#define CIFS_CLIENT_CHALLENGE_SIZE (8)
138#define CIFS_SERVER_CHALLENGE_SIZE (8)
139#define CIFS_HMAC_MD5_HASH_SIZE (16)
140#define CIFS_CPHTXT_SIZE (16)
141#define CIFS_NTLMV2_SESSKEY_SIZE (16)
142#define CIFS_NTHASH_SIZE (16)
137 143
138/* 144/*
139 * Maximum user name length 145 * Maximum user name length
@@ -663,7 +669,6 @@ struct ntlmv2_resp {
663 __le64 time; 669 __le64 time;
664 __u64 client_chal; /* random */ 670 __u64 client_chal; /* random */
665 __u32 reserved2; 671 __u32 reserved2;
666 struct ntlmssp2_name names[2];
667 /* array of name entries could follow ending in minimum 4 byte struct */ 672 /* array of name entries could follow ending in minimum 4 byte struct */
668} __attribute__((packed)); 673} __attribute__((packed));
669 674
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1f5450814087..1378d9133844 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -361,15 +361,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
361extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, 361extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
362 __u32 *); 362 __u32 *);
363extern int cifs_verify_signature(struct smb_hdr *, 363extern int cifs_verify_signature(struct smb_hdr *,
364 const struct mac_key *mac_key, 364 struct TCP_Server_Info *server,
365 __u32 expected_sequence_number); 365 __u32 expected_sequence_number);
366extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, 366extern int cifs_calculate_session_key(struct session_key *key, const char *rn,
367 const char *pass); 367 const char *pass);
368extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, 368extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
369 const struct nls_table *);
370extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
371extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
372 const struct nls_table *); 369 const struct nls_table *);
370extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
371extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
372extern int calc_seckey(struct TCP_Server_Info *);
373#ifdef CONFIG_CIFS_WEAK_PW_HASH 373#ifdef CONFIG_CIFS_WEAK_PW_HASH
374extern void calc_lanman_hash(const char *password, const char *cryptkey, 374extern void calc_lanman_hash(const char *password, const char *cryptkey,
375 bool encrypt, char *lnm_session_key); 375 bool encrypt, char *lnm_session_key);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd37..4bda920d1f75 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -604,11 +604,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
604 else 604 else
605 rc = -EINVAL; 605 rc = -EINVAL;
606 606
607 if (server->sec_kerberos || server->sec_mskerberos) 607 if (server->secType == Kerberos) {
608 server->secType = Kerberos; 608 if (!server->sec_kerberos &&
609 else if (server->sec_ntlmssp) 609 !server->sec_mskerberos)
610 server->secType = RawNTLMSSP; 610 rc = -EOPNOTSUPP;
611 else 611 } else if (server->secType == RawNTLMSSP) {
612 if (!server->sec_ntlmssp)
613 rc = -EOPNOTSUPP;
614 } else
612 rc = -EOPNOTSUPP; 615 rc = -EOPNOTSUPP;
613 } 616 }
614 } else 617 } else
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 95c2ea67edfb..ec0ea4a43bdb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1673,7 +1673,9 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1673 MAX_USERNAME_SIZE)) 1673 MAX_USERNAME_SIZE))
1674 continue; 1674 continue;
1675 if (strlen(vol->username) != 0 && 1675 if (strlen(vol->username) != 0 &&
1676 strncmp(ses->password, vol->password, 1676 ses->password != NULL &&
1677 strncmp(ses->password,
1678 vol->password ? vol->password : "",
1677 MAX_PASSWORD_SIZE)) 1679 MAX_PASSWORD_SIZE))
1678 continue; 1680 continue;
1679 } 1681 }
@@ -1706,6 +1708,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1706 CIFSSMBLogoff(xid, ses); 1708 CIFSSMBLogoff(xid, ses);
1707 _FreeXid(xid); 1709 _FreeXid(xid);
1708 } 1710 }
1711 cifs_crypto_shash_release(server);
1709 sesInfoFree(ses); 1712 sesInfoFree(ses);
1710 cifs_put_tcp_session(server); 1713 cifs_put_tcp_session(server);
1711} 1714}
@@ -1785,13 +1788,23 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1785 ses->linux_uid = volume_info->linux_uid; 1788 ses->linux_uid = volume_info->linux_uid;
1786 ses->overrideSecFlg = volume_info->secFlg; 1789 ses->overrideSecFlg = volume_info->secFlg;
1787 1790
1791 rc = cifs_crypto_shash_allocate(server);
1792 if (rc) {
1793 cERROR(1, "could not setup hash structures rc %d", rc);
1794 goto get_ses_fail;
1795 }
1796 server->tilen = 0;
1797 server->tiblob = NULL;
1798
1788 mutex_lock(&ses->session_mutex); 1799 mutex_lock(&ses->session_mutex);
1789 rc = cifs_negotiate_protocol(xid, ses); 1800 rc = cifs_negotiate_protocol(xid, ses);
1790 if (!rc) 1801 if (!rc)
1791 rc = cifs_setup_session(xid, ses, volume_info->local_nls); 1802 rc = cifs_setup_session(xid, ses, volume_info->local_nls);
1792 mutex_unlock(&ses->session_mutex); 1803 mutex_unlock(&ses->session_mutex);
1793 if (rc) 1804 if (rc) {
1805 cifs_crypto_shash_release(ses->server);
1794 goto get_ses_fail; 1806 goto get_ses_fail;
1807 }
1795 1808
1796 /* success, put it on the list */ 1809 /* success, put it on the list */
1797 write_lock(&cifs_tcp_ses_lock); 1810 write_lock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 578d88c5b46e..f9ed0751cc12 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -305,8 +305,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
305 full_path = build_path_from_dentry(direntry); 305 full_path = build_path_from_dentry(direntry);
306 if (full_path == NULL) { 306 if (full_path == NULL) {
307 rc = -ENOMEM; 307 rc = -ENOMEM;
308 FreeXid(xid); 308 goto cifs_create_out;
309 return rc;
310 } 309 }
311 310
312 if (oplockEnabled) 311 if (oplockEnabled)
@@ -365,9 +364,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
365 364
366 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 365 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
367 if (buf == NULL) { 366 if (buf == NULL) {
368 kfree(full_path); 367 rc = -ENOMEM;
369 FreeXid(xid); 368 goto cifs_create_out;
370 return -ENOMEM;
371 } 369 }
372 370
373 /* 371 /*
@@ -496,6 +494,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
496 struct cifsTconInfo *pTcon; 494 struct cifsTconInfo *pTcon;
497 char *full_path = NULL; 495 char *full_path = NULL;
498 struct inode *newinode = NULL; 496 struct inode *newinode = NULL;
497 int oplock = 0;
498 u16 fileHandle;
499 FILE_ALL_INFO *buf = NULL;
500 unsigned int bytes_written;
501 struct win_dev *pdev;
499 502
500 if (!old_valid_dev(device_number)) 503 if (!old_valid_dev(device_number))
501 return -EINVAL; 504 return -EINVAL;
@@ -506,9 +509,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
506 pTcon = cifs_sb->tcon; 509 pTcon = cifs_sb->tcon;
507 510
508 full_path = build_path_from_dentry(direntry); 511 full_path = build_path_from_dentry(direntry);
509 if (full_path == NULL) 512 if (full_path == NULL) {
510 rc = -ENOMEM; 513 rc = -ENOMEM;
511 else if (pTcon->unix_ext) { 514 goto mknod_out;
515 }
516
517 if (pTcon->unix_ext) {
512 struct cifs_unix_set_info_args args = { 518 struct cifs_unix_set_info_args args = {
513 .mode = mode & ~current_umask(), 519 .mode = mode & ~current_umask(),
514 .ctime = NO_CHANGE_64, 520 .ctime = NO_CHANGE_64,
@@ -527,87 +533,78 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
527 cifs_sb->local_nls, 533 cifs_sb->local_nls,
528 cifs_sb->mnt_cifs_flags & 534 cifs_sb->mnt_cifs_flags &
529 CIFS_MOUNT_MAP_SPECIAL_CHR); 535 CIFS_MOUNT_MAP_SPECIAL_CHR);
536 if (rc)
537 goto mknod_out;
530 538
531 if (!rc) { 539 rc = cifs_get_inode_info_unix(&newinode, full_path,
532 rc = cifs_get_inode_info_unix(&newinode, full_path,
533 inode->i_sb, xid); 540 inode->i_sb, xid);
534 if (pTcon->nocase) 541 if (pTcon->nocase)
535 direntry->d_op = &cifs_ci_dentry_ops; 542 direntry->d_op = &cifs_ci_dentry_ops;
536 else 543 else
537 direntry->d_op = &cifs_dentry_ops; 544 direntry->d_op = &cifs_dentry_ops;
538 if (rc == 0)
539 d_instantiate(direntry, newinode);
540 }
541 } else {
542 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
543 int oplock = 0;
544 u16 fileHandle;
545 FILE_ALL_INFO *buf;
546 545
547 cFYI(1, "sfu compat create special file"); 546 if (rc == 0)
547 d_instantiate(direntry, newinode);
548 goto mknod_out;
549 }
548 550
549 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 551 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
550 if (buf == NULL) { 552 goto mknod_out;
551 kfree(full_path);
552 rc = -ENOMEM;
553 FreeXid(xid);
554 return rc;
555 }
556 553
557 rc = CIFSSMBOpen(xid, pTcon, full_path, 554
558 FILE_CREATE, /* fail if exists */ 555 cFYI(1, "sfu compat create special file");
559 GENERIC_WRITE /* BB would 556
560 WRITE_OWNER | WRITE_DAC be better? */, 557 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
561 /* Create a file and set the 558 if (buf == NULL) {
562 file attribute to SYSTEM */ 559 kfree(full_path);
563 CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, 560 rc = -ENOMEM;
564 &fileHandle, &oplock, buf, 561 FreeXid(xid);
565 cifs_sb->local_nls, 562 return rc;
566 cifs_sb->mnt_cifs_flags &
567 CIFS_MOUNT_MAP_SPECIAL_CHR);
568
569 /* BB FIXME - add handling for backlevel servers
570 which need legacy open and check for all
571 calls to SMBOpen for fallback to SMBLeagcyOpen */
572 if (!rc) {
573 /* BB Do not bother to decode buf since no
574 local inode yet to put timestamps in,
575 but we can reuse it safely */
576 unsigned int bytes_written;
577 struct win_dev *pdev;
578 pdev = (struct win_dev *)buf;
579 if (S_ISCHR(mode)) {
580 memcpy(pdev->type, "IntxCHR", 8);
581 pdev->major =
582 cpu_to_le64(MAJOR(device_number));
583 pdev->minor =
584 cpu_to_le64(MINOR(device_number));
585 rc = CIFSSMBWrite(xid, pTcon,
586 fileHandle,
587 sizeof(struct win_dev),
588 0, &bytes_written, (char *)pdev,
589 NULL, 0);
590 } else if (S_ISBLK(mode)) {
591 memcpy(pdev->type, "IntxBLK", 8);
592 pdev->major =
593 cpu_to_le64(MAJOR(device_number));
594 pdev->minor =
595 cpu_to_le64(MINOR(device_number));
596 rc = CIFSSMBWrite(xid, pTcon,
597 fileHandle,
598 sizeof(struct win_dev),
599 0, &bytes_written, (char *)pdev,
600 NULL, 0);
601 } /* else if(S_ISFIFO */
602 CIFSSMBClose(xid, pTcon, fileHandle);
603 d_drop(direntry);
604 }
605 kfree(buf);
606 /* add code here to set EAs */
607 }
608 } 563 }
609 564
565 /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
566 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
567 GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
568 &fileHandle, &oplock, buf, cifs_sb->local_nls,
569 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
570 if (rc)
571 goto mknod_out;
572
573 /* BB Do not bother to decode buf since no local inode yet to put
574 * timestamps in, but we can reuse it safely */
575
576 pdev = (struct win_dev *)buf;
577 if (S_ISCHR(mode)) {
578 memcpy(pdev->type, "IntxCHR", 8);
579 pdev->major =
580 cpu_to_le64(MAJOR(device_number));
581 pdev->minor =
582 cpu_to_le64(MINOR(device_number));
583 rc = CIFSSMBWrite(xid, pTcon,
584 fileHandle,
585 sizeof(struct win_dev),
586 0, &bytes_written, (char *)pdev,
587 NULL, 0);
588 } else if (S_ISBLK(mode)) {
589 memcpy(pdev->type, "IntxBLK", 8);
590 pdev->major =
591 cpu_to_le64(MAJOR(device_number));
592 pdev->minor =
593 cpu_to_le64(MINOR(device_number));
594 rc = CIFSSMBWrite(xid, pTcon,
595 fileHandle,
596 sizeof(struct win_dev),
597 0, &bytes_written, (char *)pdev,
598 NULL, 0);
599 } /* else if (S_ISFIFO) */
600 CIFSSMBClose(xid, pTcon, fileHandle);
601 d_drop(direntry);
602
603 /* FIXME: add code here to set EAs */
604
605mknod_out:
610 kfree(full_path); 606 kfree(full_path);
607 kfree(buf);
611 FreeXid(xid); 608 FreeXid(xid);
612 return rc; 609 return rc;
613} 610}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index db11fdef0e92..de748c652d11 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -242,8 +242,7 @@ int cifs_open(struct inode *inode, struct file *file)
242 full_path = build_path_from_dentry(file->f_path.dentry); 242 full_path = build_path_from_dentry(file->f_path.dentry);
243 if (full_path == NULL) { 243 if (full_path == NULL) {
244 rc = -ENOMEM; 244 rc = -ENOMEM;
245 FreeXid(xid); 245 goto out;
246 return rc;
247 } 246 }
248 247
249 cFYI(1, "inode = 0x%p file flags are 0x%x for %s", 248 cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4bc47e5b5f29..86a164f08a74 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -834,7 +834,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
834 xid, NULL); 834 xid, NULL);
835 835
836 if (!inode) 836 if (!inode)
837 return ERR_PTR(-ENOMEM); 837 return ERR_PTR(rc);
838 838
839#ifdef CONFIG_CIFS_FSCACHE 839#ifdef CONFIG_CIFS_FSCACHE
840 /* populate tcon->resource_id */ 840 /* populate tcon->resource_id */
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e75319..1db0f0746a5b 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,19 @@
61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 61#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000
62#define NTLMSSP_NEGOTIATE_56 0x80000000 62#define NTLMSSP_NEGOTIATE_56 0x80000000
63 63
64/* Define AV Pair Field IDs */
65#define NTLMSSP_AV_EOL 0
66#define NTLMSSP_AV_NB_COMPUTER_NAME 1
67#define NTLMSSP_AV_NB_DOMAIN_NAME 2
68#define NTLMSSP_AV_DNS_COMPUTER_NAME 3
69#define NTLMSSP_AV_DNS_DOMAIN_NAME 4
70#define NTLMSSP_AV_DNS_TREE_NAME 5
71#define NTLMSSP_AV_FLAGS 6
72#define NTLMSSP_AV_TIMESTAMP 7
73#define NTLMSSP_AV_RESTRICTION 8
74#define NTLMSSP_AV_TARGET_NAME 9
75#define NTLMSSP_AV_CHANNEL_BINDINGS 10
76
64/* Although typedefs are not commonly used for structure definitions */ 77/* Although typedefs are not commonly used for structure definitions */
65/* in the Linux kernel, in this particular case they are useful */ 78/* in the Linux kernel, in this particular case they are useful */
66/* to more closely match the standards document for NTLMSSP from */ 79/* to more closely match the standards document for NTLMSSP from */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5dd..795095f4eac6 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -383,6 +383,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
383static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, 383static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
384 struct cifsSesInfo *ses) 384 struct cifsSesInfo *ses)
385{ 385{
386 unsigned int tioffset; /* challeng message target info area */
387 unsigned int tilen; /* challeng message target info area length */
388
386 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; 389 CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
387 390
388 if (blob_len < sizeof(CHALLENGE_MESSAGE)) { 391 if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -405,6 +408,20 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then 408 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
406 we must set the MIC field of the AUTHENTICATE_MESSAGE */ 409 we must set the MIC field of the AUTHENTICATE_MESSAGE */
407 410
411 ses->server->ntlmssp.server_flags = le32_to_cpu(pblob->NegotiateFlags);
412
413 tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
414 tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
415 ses->server->tilen = tilen;
416 if (tilen) {
417 ses->server->tiblob = kmalloc(tilen, GFP_KERNEL);
418 if (!ses->server->tiblob) {
419 cERROR(1, "Challenge target info allocation failure");
420 return -ENOMEM;
421 }
422 memcpy(ses->server->tiblob, bcc_ptr + tioffset, tilen);
423 }
424
408 return 0; 425 return 0;
409} 426}
410 427
@@ -425,12 +442,13 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
425 /* BB is NTLMV2 session security format easier to use here? */ 442 /* BB is NTLMV2 session security format easier to use here? */
426 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | 443 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
427 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 444 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
428 NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; 445 NTLMSSP_NEGOTIATE_NTLM;
429 if (ses->server->secMode & 446 if (ses->server->secMode &
430 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 447 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
431 flags |= NTLMSSP_NEGOTIATE_SIGN; 448 flags |= NTLMSSP_NEGOTIATE_SIGN |
432 if (ses->server->secMode & SECMODE_SIGN_REQUIRED) 449 NTLMSSP_NEGOTIATE_KEY_XCH |
433 flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; 450 NTLMSSP_NEGOTIATE_EXTENDED_SEC;
451 }
434 452
435 sec_blob->NegotiateFlags |= cpu_to_le32(flags); 453 sec_blob->NegotiateFlags |= cpu_to_le32(flags);
436 454
@@ -451,10 +469,12 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
451 struct cifsSesInfo *ses, 469 struct cifsSesInfo *ses,
452 const struct nls_table *nls_cp, bool first) 470 const struct nls_table *nls_cp, bool first)
453{ 471{
472 int rc;
473 unsigned int size;
454 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; 474 AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
455 __u32 flags; 475 __u32 flags;
456 unsigned char *tmp; 476 unsigned char *tmp;
457 char ntlm_session_key[CIFS_SESS_KEY_SIZE]; 477 struct ntlmv2_resp ntlmv2_response = {};
458 478
459 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); 479 memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
460 sec_blob->MessageType = NtLmAuthenticate; 480 sec_blob->MessageType = NtLmAuthenticate;
@@ -477,19 +497,25 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
477 sec_blob->LmChallengeResponse.Length = 0; 497 sec_blob->LmChallengeResponse.Length = 0;
478 sec_blob->LmChallengeResponse.MaximumLength = 0; 498 sec_blob->LmChallengeResponse.MaximumLength = 0;
479 499
480 /* calculate session key, BB what about adding similar ntlmv2 path? */
481 SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
482 if (first)
483 cifs_calculate_mac_key(&ses->server->mac_signing_key,
484 ntlm_session_key, ses->password);
485
486 memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
487 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); 500 sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
488 sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE); 501 rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp);
489 sec_blob->NtChallengeResponse.MaximumLength = 502 if (rc) {
490 cpu_to_le16(CIFS_SESS_KEY_SIZE); 503 cERROR(1, "error rc: %d during ntlmssp ntlmv2 setup", rc);
504 goto setup_ntlmv2_ret;
505 }
506 size = sizeof(struct ntlmv2_resp);
507 memcpy(tmp, (char *)&ntlmv2_response, size);
508 tmp += size;
509 if (ses->server->tilen > 0) {
510 memcpy(tmp, ses->server->tiblob, ses->server->tilen);
511 tmp += ses->server->tilen;
512 } else
513 ses->server->tilen = 0;
491 514
492 tmp += CIFS_SESS_KEY_SIZE; 515 sec_blob->NtChallengeResponse.Length = cpu_to_le16(size +
516 ses->server->tilen);
517 sec_blob->NtChallengeResponse.MaximumLength =
518 cpu_to_le16(size + ses->server->tilen);
493 519
494 if (ses->domainName == NULL) { 520 if (ses->domainName == NULL) {
495 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 521 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +527,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
501 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, 527 len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
502 MAX_USERNAME_SIZE, nls_cp); 528 MAX_USERNAME_SIZE, nls_cp);
503 len *= 2; /* unicode is 2 bytes each */ 529 len *= 2; /* unicode is 2 bytes each */
504 len += 2; /* trailing null */
505 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); 530 sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
506 sec_blob->DomainName.Length = cpu_to_le16(len); 531 sec_blob->DomainName.Length = cpu_to_le16(len);
507 sec_blob->DomainName.MaximumLength = cpu_to_le16(len); 532 sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +543,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
518 len = cifs_strtoUCS((__le16 *)tmp, ses->userName, 543 len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
519 MAX_USERNAME_SIZE, nls_cp); 544 MAX_USERNAME_SIZE, nls_cp);
520 len *= 2; /* unicode is 2 bytes each */ 545 len *= 2; /* unicode is 2 bytes each */
521 len += 2; /* trailing null */
522 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 546 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
523 sec_blob->UserName.Length = cpu_to_le16(len); 547 sec_blob->UserName.Length = cpu_to_le16(len);
524 sec_blob->UserName.MaximumLength = cpu_to_le16(len); 548 sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -530,9 +554,26 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
530 sec_blob->WorkstationName.MaximumLength = 0; 554 sec_blob->WorkstationName.MaximumLength = 0;
531 tmp += 2; 555 tmp += 2;
532 556
533 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); 557 if ((ses->server->ntlmssp.server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
534 sec_blob->SessionKey.Length = 0; 558 !calc_seckey(ses->server)) {
535 sec_blob->SessionKey.MaximumLength = 0; 559 memcpy(tmp, ses->server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
560 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
561 sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
562 sec_blob->SessionKey.MaximumLength =
563 cpu_to_le16(CIFS_CPHTXT_SIZE);
564 tmp += CIFS_CPHTXT_SIZE;
565 } else {
566 sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
567 sec_blob->SessionKey.Length = 0;
568 sec_blob->SessionKey.MaximumLength = 0;
569 }
570
571 ses->server->sequence_number = 0;
572
573setup_ntlmv2_ret:
574 if (ses->server->tilen > 0)
575 kfree(ses->server->tiblob);
576
536 return tmp - pbuffer; 577 return tmp - pbuffer;
537} 578}
538 579
@@ -546,15 +587,14 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
546 return; 587 return;
547} 588}
548 589
549static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB, 590static int setup_ntlmssp_auth_req(char *ntlmsspblob,
550 struct cifsSesInfo *ses, 591 struct cifsSesInfo *ses,
551 const struct nls_table *nls, bool first_time) 592 const struct nls_table *nls, bool first_time)
552{ 593{
553 int bloblen; 594 int bloblen;
554 595
555 bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls, 596 bloblen = build_ntlmssp_auth_blob(ntlmsspblob, ses, nls,
556 first_time); 597 first_time);
557 pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
558 598
559 return bloblen; 599 return bloblen;
560} 600}
@@ -690,7 +730,7 @@ ssetup_ntlmssp_authenticate:
690 730
691 if (first_time) /* should this be moved into common code 731 if (first_time) /* should this be moved into common code
692 with similar ntlmv2 path? */ 732 with similar ntlmv2 path? */
693 cifs_calculate_mac_key(&ses->server->mac_signing_key, 733 cifs_calculate_session_key(&ses->server->session_key,
694 ntlm_session_key, ses->password); 734 ntlm_session_key, ses->password);
695 /* copy session key */ 735 /* copy session key */
696 736
@@ -729,12 +769,21 @@ ssetup_ntlmssp_authenticate:
729 cpu_to_le16(sizeof(struct ntlmv2_resp)); 769 cpu_to_le16(sizeof(struct ntlmv2_resp));
730 770
731 /* calculate session key */ 771 /* calculate session key */
732 setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); 772 rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
773 if (rc) {
774 kfree(v2_sess_key);
775 goto ssetup_exit;
776 }
733 /* FIXME: calculate MAC key */ 777 /* FIXME: calculate MAC key */
734 memcpy(bcc_ptr, (char *)v2_sess_key, 778 memcpy(bcc_ptr, (char *)v2_sess_key,
735 sizeof(struct ntlmv2_resp)); 779 sizeof(struct ntlmv2_resp));
736 bcc_ptr += sizeof(struct ntlmv2_resp); 780 bcc_ptr += sizeof(struct ntlmv2_resp);
737 kfree(v2_sess_key); 781 kfree(v2_sess_key);
782 if (ses->server->tilen > 0) {
783 memcpy(bcc_ptr, ses->server->tiblob,
784 ses->server->tilen);
785 bcc_ptr += ses->server->tilen;
786 }
738 if (ses->capabilities & CAP_UNICODE) { 787 if (ses->capabilities & CAP_UNICODE) {
739 if (iov[0].iov_len % 2) { 788 if (iov[0].iov_len % 2) {
740 *bcc_ptr = 0; 789 *bcc_ptr = 0;
@@ -765,15 +814,15 @@ ssetup_ntlmssp_authenticate:
765 } 814 }
766 /* bail out if key is too long */ 815 /* bail out if key is too long */
767 if (msg->sesskey_len > 816 if (msg->sesskey_len >
768 sizeof(ses->server->mac_signing_key.data.krb5)) { 817 sizeof(ses->server->session_key.data.krb5)) {
769 cERROR(1, "Kerberos signing key too long (%u bytes)", 818 cERROR(1, "Kerberos signing key too long (%u bytes)",
770 msg->sesskey_len); 819 msg->sesskey_len);
771 rc = -EOVERFLOW; 820 rc = -EOVERFLOW;
772 goto ssetup_exit; 821 goto ssetup_exit;
773 } 822 }
774 if (first_time) { 823 if (first_time) {
775 ses->server->mac_signing_key.len = msg->sesskey_len; 824 ses->server->session_key.len = msg->sesskey_len;
776 memcpy(ses->server->mac_signing_key.data.krb5, 825 memcpy(ses->server->session_key.data.krb5,
777 msg->data, msg->sesskey_len); 826 msg->data, msg->sesskey_len);
778 } 827 }
779 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 828 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
@@ -815,12 +864,28 @@ ssetup_ntlmssp_authenticate:
815 if (phase == NtLmNegotiate) { 864 if (phase == NtLmNegotiate) {
816 setup_ntlmssp_neg_req(pSMB, ses); 865 setup_ntlmssp_neg_req(pSMB, ses);
817 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); 866 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
867 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
818 } else if (phase == NtLmAuthenticate) { 868 } else if (phase == NtLmAuthenticate) {
819 int blob_len; 869 int blob_len;
820 blob_len = setup_ntlmssp_auth_req(pSMB, ses, 870 char *ntlmsspblob;
821 nls_cp, 871
822 first_time); 872 ntlmsspblob = kmalloc(5 *
873 sizeof(struct _AUTHENTICATE_MESSAGE),
874 GFP_KERNEL);
875 if (!ntlmsspblob) {
876 cERROR(1, "Can't allocate NTLMSSP");
877 rc = -ENOMEM;
878 goto ssetup_exit;
879 }
880
881 blob_len = setup_ntlmssp_auth_req(ntlmsspblob,
882 ses,
883 nls_cp,
884 first_time);
823 iov[1].iov_len = blob_len; 885 iov[1].iov_len = blob_len;
886 iov[1].iov_base = ntlmsspblob;
887 pSMB->req.SecurityBlobLength =
888 cpu_to_le16(blob_len);
824 /* Make sure that we tell the server that we 889 /* Make sure that we tell the server that we
825 are using the uid that it just gave us back 890 are using the uid that it just gave us back
826 on the response (challenge) */ 891 on the response (challenge) */
@@ -830,7 +895,6 @@ ssetup_ntlmssp_authenticate:
830 rc = -ENOSYS; 895 rc = -ENOSYS;
831 goto ssetup_exit; 896 goto ssetup_exit;
832 } 897 }
833 iov[1].iov_base = &pSMB->req.SecurityBlob[0];
834 /* unicode strings must be word aligned */ 898 /* unicode strings must be word aligned */
835 if ((iov[0].iov_len + iov[1].iov_len) % 2) { 899 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
836 *bcc_ptr = 0; 900 *bcc_ptr = 0;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d6978..e0588cdf4cc5 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
543 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 543 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
544 SECMODE_SIGN_ENABLED))) { 544 SECMODE_SIGN_ENABLED))) {
545 rc = cifs_verify_signature(midQ->resp_buf, 545 rc = cifs_verify_signature(midQ->resp_buf,
546 &ses->server->mac_signing_key, 546 ses->server,
547 midQ->sequence_number+1); 547 midQ->sequence_number+1);
548 if (rc) { 548 if (rc) {
549 cERROR(1, "Unexpected SMB signature"); 549 cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
731 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 731 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
732 SECMODE_SIGN_ENABLED))) { 732 SECMODE_SIGN_ENABLED))) {
733 rc = cifs_verify_signature(out_buf, 733 rc = cifs_verify_signature(out_buf,
734 &ses->server->mac_signing_key, 734 ses->server,
735 midQ->sequence_number+1); 735 midQ->sequence_number+1);
736 if (rc) { 736 if (rc) {
737 cERROR(1, "Unexpected SMB signature"); 737 cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
981 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 981 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
982 SECMODE_SIGN_ENABLED))) { 982 SECMODE_SIGN_ENABLED))) {
983 rc = cifs_verify_signature(out_buf, 983 rc = cifs_verify_signature(out_buf,
984 &ses->server->mac_signing_key, 984 ses->server,
985 midQ->sequence_number+1); 985 midQ->sequence_number+1);
986 if (rc) { 986 if (rc) {
987 cERROR(1, "Unexpected SMB signature"); 987 cERROR(1, "Unexpected SMB signature");
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a53b130b366c..1e7a33028d33 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -80,7 +80,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
80 } 80 }
81 } else { 81 } else {
82 inode = iget_locked(sb, CRAMINO(cramfs_inode)); 82 inode = iget_locked(sb, CRAMINO(cramfs_inode));
83 if (inode) { 83 if (inode && (inode->i_state & I_NEW)) {
84 setup_inode(inode, cramfs_inode); 84 setup_inode(inode, cramfs_inode);
85 unlock_new_inode(inode); 85 unlock_new_inode(inode);
86 } 86 }
diff --git a/fs/dcache.c b/fs/dcache.c
index 4d13bf50b7b1..83293be48149 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1332,31 +1332,13 @@ EXPORT_SYMBOL(d_add_ci);
1332 * d_lookup - search for a dentry 1332 * d_lookup - search for a dentry
1333 * @parent: parent dentry 1333 * @parent: parent dentry
1334 * @name: qstr of name we wish to find 1334 * @name: qstr of name we wish to find
1335 * Returns: dentry, or NULL
1335 * 1336 *
1336 * Searches the children of the parent dentry for the name in question. If 1337 * d_lookup searches the children of the parent dentry for the name in
1337 * the dentry is found its reference count is incremented and the dentry 1338 * question. If the dentry is found its reference count is incremented and the
1338 * is returned. The caller must use dput to free the entry when it has 1339 * dentry is returned. The caller must use dput to free the entry when it has
1339 * finished using it. %NULL is returned on failure. 1340 * finished using it. %NULL is returned if the dentry does not exist.
1340 *
1341 * __d_lookup is dcache_lock free. The hash list is protected using RCU.
1342 * Memory barriers are used while updating and doing lockless traversal.
1343 * To avoid races with d_move while rename is happening, d_lock is used.
1344 *
1345 * Overflows in memcmp(), while d_move, are avoided by keeping the length
1346 * and name pointer in one structure pointed by d_qstr.
1347 *
1348 * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
1349 * lookup is going on.
1350 *
1351 * The dentry unused LRU is not updated even if lookup finds the required dentry
1352 * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
1353 * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
1354 * acquisition.
1355 *
1356 * d_lookup() is protected against the concurrent renames in some unrelated
1357 * directory using the seqlockt_t rename_lock.
1358 */ 1341 */
1359
1360struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1342struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1361{ 1343{
1362 struct dentry * dentry = NULL; 1344 struct dentry * dentry = NULL;
@@ -1372,6 +1354,21 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1372} 1354}
1373EXPORT_SYMBOL(d_lookup); 1355EXPORT_SYMBOL(d_lookup);
1374 1356
1357/*
1358 * __d_lookup - search for a dentry (racy)
1359 * @parent: parent dentry
1360 * @name: qstr of name we wish to find
1361 * Returns: dentry, or NULL
1362 *
1363 * __d_lookup is like d_lookup, however it may (rarely) return a
1364 * false-negative result due to unrelated rename activity.
1365 *
1366 * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
1367 * however it must be used carefully, eg. with a following d_lookup in
1368 * the case of failure.
1369 *
1370 * __d_lookup callers must be commented.
1371 */
1375struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1372struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1376{ 1373{
1377 unsigned int len = name->len; 1374 unsigned int len = name->len;
@@ -1382,6 +1379,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1382 struct hlist_node *node; 1379 struct hlist_node *node;
1383 struct dentry *dentry; 1380 struct dentry *dentry;
1384 1381
1382 /*
1383 * The hash list is protected using RCU.
1384 *
1385 * Take d_lock when comparing a candidate dentry, to avoid races
1386 * with d_move().
1387 *
1388 * It is possible that concurrent renames can mess up our list
1389 * walk here and result in missing our dentry, resulting in the
1390 * false-negative result. d_lookup() protects against concurrent
1391 * renames using rename_lock seqlock.
1392 *
1393 * See Documentation/vfs/dcache-locking.txt for more details.
1394 */
1385 rcu_read_lock(); 1395 rcu_read_lock();
1386 1396
1387 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1397 hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
@@ -1396,8 +1406,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1396 1406
1397 /* 1407 /*
1398 * Recheck the dentry after taking the lock - d_move may have 1408 * Recheck the dentry after taking the lock - d_move may have
1399 * changed things. Don't bother checking the hash because we're 1409 * changed things. Don't bother checking the hash because
1400 * about to compare the whole name anyway. 1410 * we're about to compare the whole name anyway.
1401 */ 1411 */
1402 if (dentry->d_parent != parent) 1412 if (dentry->d_parent != parent)
1403 goto next; 1413 goto next;
@@ -1925,7 +1935,7 @@ static int prepend_path(const struct path *path, struct path *root,
1925 bool slash = false; 1935 bool slash = false;
1926 int error = 0; 1936 int error = 0;
1927 1937
1928 spin_lock(&vfsmount_lock); 1938 br_read_lock(vfsmount_lock);
1929 while (dentry != root->dentry || vfsmnt != root->mnt) { 1939 while (dentry != root->dentry || vfsmnt != root->mnt) {
1930 struct dentry * parent; 1940 struct dentry * parent;
1931 1941
@@ -1954,7 +1964,7 @@ out:
1954 if (!error && !slash) 1964 if (!error && !slash)
1955 error = prepend(buffer, buflen, "/", 1); 1965 error = prepend(buffer, buflen, "/", 1);
1956 1966
1957 spin_unlock(&vfsmount_lock); 1967 br_read_unlock(vfsmount_lock);
1958 return error; 1968 return error;
1959 1969
1960global_root: 1970global_root:
@@ -2292,11 +2302,12 @@ int path_is_under(struct path *path1, struct path *path2)
2292 struct vfsmount *mnt = path1->mnt; 2302 struct vfsmount *mnt = path1->mnt;
2293 struct dentry *dentry = path1->dentry; 2303 struct dentry *dentry = path1->dentry;
2294 int res; 2304 int res;
2295 spin_lock(&vfsmount_lock); 2305
2306 br_read_lock(vfsmount_lock);
2296 if (mnt != path2->mnt) { 2307 if (mnt != path2->mnt) {
2297 for (;;) { 2308 for (;;) {
2298 if (mnt->mnt_parent == mnt) { 2309 if (mnt->mnt_parent == mnt) {
2299 spin_unlock(&vfsmount_lock); 2310 br_read_unlock(vfsmount_lock);
2300 return 0; 2311 return 0;
2301 } 2312 }
2302 if (mnt->mnt_parent == path2->mnt) 2313 if (mnt->mnt_parent == path2->mnt)
@@ -2306,7 +2317,7 @@ int path_is_under(struct path *path1, struct path *path2)
2306 dentry = mnt->mnt_mountpoint; 2317 dentry = mnt->mnt_mountpoint;
2307 } 2318 }
2308 res = is_subdir(dentry, path2->dentry); 2319 res = is_subdir(dentry, path2->dentry);
2309 spin_unlock(&vfsmount_lock); 2320 br_read_unlock(vfsmount_lock);
2310 return res; 2321 return res;
2311} 2322}
2312EXPORT_SYMBOL(path_is_under); 2323EXPORT_SYMBOL(path_is_under);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index a2e3b562e65d..cbadc1bee6e7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache;
1793static struct list_head key_tfm_list; 1793static struct list_head key_tfm_list;
1794struct mutex key_tfm_list_mutex; 1794struct mutex key_tfm_list_mutex;
1795 1795
1796int ecryptfs_init_crypto(void) 1796int __init ecryptfs_init_crypto(void)
1797{ 1797{
1798 mutex_init(&key_tfm_list_mutex); 1798 mutex_init(&key_tfm_list_mutex);
1799 INIT_LIST_HEAD(&key_tfm_list); 1799 INIT_LIST_HEAD(&key_tfm_list);
@@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename(
2169 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 2169 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2170 + encoded_name_no_prefix_size); 2170 + encoded_name_no_prefix_size);
2171 (*encoded_name)[(*encoded_name_size)] = '\0'; 2171 (*encoded_name)[(*encoded_name_size)] = '\0';
2172 (*encoded_name_size)++;
2173 } else { 2172 } else {
2174 rc = -EOPNOTSUPP; 2173 rc = -EOPNOTSUPP;
2175 } 2174 }
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 6c55113e7222..3fbc94203380 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -349,7 +349,7 @@ out:
349 349
350/** 350/**
351 * ecryptfs_new_lower_dentry 351 * ecryptfs_new_lower_dentry
352 * @ename: The name of the new dentry. 352 * @name: The name of the new dentry.
353 * @lower_dir_dentry: Parent directory of the new dentry. 353 * @lower_dir_dentry: Parent directory of the new dentry.
354 * @nd: nameidata from last lookup. 354 * @nd: nameidata from last lookup.
355 * 355 *
@@ -386,20 +386,19 @@ ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
386 * ecryptfs_lookup_one_lower 386 * ecryptfs_lookup_one_lower
387 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up 387 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
388 * @lower_dir_dentry: lower parent directory 388 * @lower_dir_dentry: lower parent directory
389 * @name: lower file name
389 * 390 *
390 * Get the lower dentry from vfs. If lower dentry does not exist yet, 391 * Get the lower dentry from vfs. If lower dentry does not exist yet,
391 * create it. 392 * create it.
392 */ 393 */
393static struct dentry * 394static struct dentry *
394ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry, 395ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
395 struct dentry *lower_dir_dentry) 396 struct dentry *lower_dir_dentry, struct qstr *name)
396{ 397{
397 struct nameidata nd; 398 struct nameidata nd;
398 struct vfsmount *lower_mnt; 399 struct vfsmount *lower_mnt;
399 struct qstr *name;
400 int err; 400 int err;
401 401
402 name = &ecryptfs_dentry->d_name;
403 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( 402 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
404 ecryptfs_dentry->d_parent)); 403 ecryptfs_dentry->d_parent));
405 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd); 404 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
@@ -434,6 +433,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
434 size_t encrypted_and_encoded_name_size; 433 size_t encrypted_and_encoded_name_size;
435 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 434 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
436 struct dentry *lower_dir_dentry, *lower_dentry; 435 struct dentry *lower_dir_dentry, *lower_dentry;
436 struct qstr lower_name;
437 int rc = 0; 437 int rc = 0;
438 438
439 ecryptfs_dentry->d_op = &ecryptfs_dops; 439 ecryptfs_dentry->d_op = &ecryptfs_dops;
@@ -444,9 +444,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
444 goto out_d_drop; 444 goto out_d_drop;
445 } 445 }
446 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 446 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
447 447 lower_name.name = ecryptfs_dentry->d_name.name;
448 lower_name.len = ecryptfs_dentry->d_name.len;
449 lower_name.hash = ecryptfs_dentry->d_name.hash;
450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
452 &lower_name);
453 if (rc < 0)
454 goto out_d_drop;
455 }
448 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, 456 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
449 lower_dir_dentry); 457 lower_dir_dentry, &lower_name);
450 if (IS_ERR(lower_dentry)) { 458 if (IS_ERR(lower_dentry)) {
451 rc = PTR_ERR(lower_dentry); 459 rc = PTR_ERR(lower_dentry);
452 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 460 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
@@ -471,8 +479,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
471 "filename; rc = [%d]\n", __func__, rc); 479 "filename; rc = [%d]\n", __func__, rc);
472 goto out_d_drop; 480 goto out_d_drop;
473 } 481 }
482 lower_name.name = encrypted_and_encoded_name;
483 lower_name.len = encrypted_and_encoded_name_size;
484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
487 &lower_name);
488 if (rc < 0)
489 goto out_d_drop;
490 }
474 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, 491 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
475 lower_dir_dentry); 492 lower_dir_dentry, &lower_name);
476 if (IS_ERR(lower_dentry)) { 493 if (IS_ERR(lower_dentry)) {
477 rc = PTR_ERR(lower_dentry); 494 rc = PTR_ERR(lower_dentry);
478 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 495 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 89c5476506ef..73811cfa2ea4 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -515,6 +515,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
515 if (!s) { 515 if (!s) {
516 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 516 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
517 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 517 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
518 rc = -ENOMEM;
518 goto out; 519 goto out;
519 } 520 }
520 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 521 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -806,6 +807,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
806 if (!s) { 807 if (!s) {
807 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 808 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
808 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 809 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
810 rc = -ENOMEM;
809 goto out; 811 goto out;
810 } 812 }
811 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 813 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index d8c3a373aafa..0851ab6980f5 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -86,7 +86,7 @@ out:
86 return 0; 86 return 0;
87} 87}
88 88
89int ecryptfs_init_kthread(void) 89int __init ecryptfs_init_kthread(void)
90{ 90{
91 int rc = 0; 91 int rc = 0;
92 92
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index bcb68c0cb1f0..ab2248090515 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -473,7 +473,7 @@ sleep:
473 return rc; 473 return rc;
474} 474}
475 475
476int ecryptfs_init_messaging(void) 476int __init ecryptfs_init_messaging(void)
477{ 477{
478 int i; 478 int i;
479 int rc = 0; 479 int rc = 0;
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f612bcd4..00208c3d7e92 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -500,7 +500,7 @@ static struct miscdevice ecryptfs_miscdev = {
500 * 500 *
501 * Returns zero on success; non-zero otherwise 501 * Returns zero on success; non-zero otherwise
502 */ 502 */
503int ecryptfs_init_ecryptfs_miscdev(void) 503int __init ecryptfs_init_ecryptfs_miscdev(void)
504{ 504{
505 int rc; 505 int rc;
506 506
diff --git a/fs/exec.c b/fs/exec.c
index 7761837e4500..2d9455282744 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -361,13 +361,13 @@ err:
361/* 361/*
362 * count() counts the number of strings in array ARGV. 362 * count() counts the number of strings in array ARGV.
363 */ 363 */
364static int count(char __user * __user * argv, int max) 364static int count(const char __user * const __user * argv, int max)
365{ 365{
366 int i = 0; 366 int i = 0;
367 367
368 if (argv != NULL) { 368 if (argv != NULL) {
369 for (;;) { 369 for (;;) {
370 char __user * p; 370 const char __user * p;
371 371
372 if (get_user(p, argv)) 372 if (get_user(p, argv))
373 return -EFAULT; 373 return -EFAULT;
@@ -387,7 +387,7 @@ static int count(char __user * __user * argv, int max)
387 * processes's memory to the new process's stack. The call to get_user_pages() 387 * processes's memory to the new process's stack. The call to get_user_pages()
388 * ensures the destination page is created and not swapped out. 388 * ensures the destination page is created and not swapped out.
389 */ 389 */
390static int copy_strings(int argc, char __user * __user * argv, 390static int copy_strings(int argc, const char __user *const __user *argv,
391 struct linux_binprm *bprm) 391 struct linux_binprm *bprm)
392{ 392{
393 struct page *kmapped_page = NULL; 393 struct page *kmapped_page = NULL;
@@ -396,7 +396,7 @@ static int copy_strings(int argc, char __user * __user * argv,
396 int ret; 396 int ret;
397 397
398 while (argc-- > 0) { 398 while (argc-- > 0) {
399 char __user *str; 399 const char __user *str;
400 int len; 400 int len;
401 unsigned long pos; 401 unsigned long pos;
402 402
@@ -470,12 +470,13 @@ out:
470/* 470/*
471 * Like copy_strings, but get argv and its values from kernel memory. 471 * Like copy_strings, but get argv and its values from kernel memory.
472 */ 472 */
473int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) 473int copy_strings_kernel(int argc, const char *const *argv,
474 struct linux_binprm *bprm)
474{ 475{
475 int r; 476 int r;
476 mm_segment_t oldfs = get_fs(); 477 mm_segment_t oldfs = get_fs();
477 set_fs(KERNEL_DS); 478 set_fs(KERNEL_DS);
478 r = copy_strings(argc, (char __user * __user *)argv, bprm); 479 r = copy_strings(argc, (const char __user *const __user *)argv, bprm);
479 set_fs(oldfs); 480 set_fs(oldfs);
480 return r; 481 return r;
481} 482}
@@ -997,7 +998,7 @@ EXPORT_SYMBOL(flush_old_exec);
997void setup_new_exec(struct linux_binprm * bprm) 998void setup_new_exec(struct linux_binprm * bprm)
998{ 999{
999 int i, ch; 1000 int i, ch;
1000 char * name; 1001 const char *name;
1001 char tcomm[sizeof(current->comm)]; 1002 char tcomm[sizeof(current->comm)];
1002 1003
1003 arch_pick_mmap_layout(current->mm); 1004 arch_pick_mmap_layout(current->mm);
@@ -1117,7 +1118,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
1117 bprm->unsafe = tracehook_unsafe_exec(p); 1118 bprm->unsafe = tracehook_unsafe_exec(p);
1118 1119
1119 n_fs = 1; 1120 n_fs = 1;
1120 write_lock(&p->fs->lock); 1121 spin_lock(&p->fs->lock);
1121 rcu_read_lock(); 1122 rcu_read_lock();
1122 for (t = next_thread(p); t != p; t = next_thread(t)) { 1123 for (t = next_thread(p); t != p; t = next_thread(t)) {
1123 if (t->fs == p->fs) 1124 if (t->fs == p->fs)
@@ -1134,7 +1135,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
1134 res = 1; 1135 res = 1;
1135 } 1136 }
1136 } 1137 }
1137 write_unlock(&p->fs->lock); 1138 spin_unlock(&p->fs->lock);
1138 1139
1139 return res; 1140 return res;
1140} 1141}
@@ -1316,9 +1317,9 @@ EXPORT_SYMBOL(search_binary_handler);
1316/* 1317/*
1317 * sys_execve() executes a new program. 1318 * sys_execve() executes a new program.
1318 */ 1319 */
1319int do_execve(char * filename, 1320int do_execve(const char * filename,
1320 char __user *__user *argv, 1321 const char __user *const __user *argv,
1321 char __user *__user *envp, 1322 const char __user *const __user *envp,
1322 struct pt_regs * regs) 1323 struct pt_regs * regs)
1323{ 1324{
1324 struct linux_binprm *bprm; 1325 struct linux_binprm *bprm;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1fa23f6ffba5..1736f2356388 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -250,7 +250,9 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
250{ 250{
251 int i, err = 0; 251 int i, err = 0;
252 252
253 ll_rw_block(SWRITE, nr_bhs, bhs); 253 for (i = 0; i < nr_bhs; i++)
254 write_dirty_buffer(bhs[i], WRITE);
255
254 for (i = 0; i < nr_bhs; i++) { 256 for (i = 0; i < nr_bhs; i++) {
255 wait_on_buffer(bhs[i]); 257 wait_on_buffer(bhs[i]);
256 if (buffer_eopnotsupp(bhs[i])) { 258 if (buffer_eopnotsupp(bhs[i])) {
diff --git a/fs/file_table.c b/fs/file_table.c
index edecd36fed9b..a04bdd81c11c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,9 @@
20#include <linux/cdev.h> 20#include <linux/cdev.h>
21#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
22#include <linux/sysctl.h> 22#include <linux/sysctl.h>
23#include <linux/lglock.h>
23#include <linux/percpu_counter.h> 24#include <linux/percpu_counter.h>
25#include <linux/percpu.h>
24#include <linux/ima.h> 26#include <linux/ima.h>
25 27
26#include <asm/atomic.h> 28#include <asm/atomic.h>
@@ -32,8 +34,8 @@ struct files_stat_struct files_stat = {
32 .max_files = NR_FILE 34 .max_files = NR_FILE
33}; 35};
34 36
35/* public. Not pretty! */ 37DECLARE_LGLOCK(files_lglock);
36__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); 38DEFINE_LGLOCK(files_lglock);
37 39
38/* SLAB cache for file structures */ 40/* SLAB cache for file structures */
39static struct kmem_cache *filp_cachep __read_mostly; 41static struct kmem_cache *filp_cachep __read_mostly;
@@ -249,7 +251,7 @@ static void __fput(struct file *file)
249 cdev_put(inode->i_cdev); 251 cdev_put(inode->i_cdev);
250 fops_put(file->f_op); 252 fops_put(file->f_op);
251 put_pid(file->f_owner.pid); 253 put_pid(file->f_owner.pid);
252 file_kill(file); 254 file_sb_list_del(file);
253 if (file->f_mode & FMODE_WRITE) 255 if (file->f_mode & FMODE_WRITE)
254 drop_file_write_access(file); 256 drop_file_write_access(file);
255 file->f_path.dentry = NULL; 257 file->f_path.dentry = NULL;
@@ -328,41 +330,107 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
328 return file; 330 return file;
329} 331}
330 332
331
332void put_filp(struct file *file) 333void put_filp(struct file *file)
333{ 334{
334 if (atomic_long_dec_and_test(&file->f_count)) { 335 if (atomic_long_dec_and_test(&file->f_count)) {
335 security_file_free(file); 336 security_file_free(file);
336 file_kill(file); 337 file_sb_list_del(file);
337 file_free(file); 338 file_free(file);
338 } 339 }
339} 340}
340 341
341void file_move(struct file *file, struct list_head *list) 342static inline int file_list_cpu(struct file *file)
342{ 343{
343 if (!list) 344#ifdef CONFIG_SMP
344 return; 345 return file->f_sb_list_cpu;
345 file_list_lock(); 346#else
346 list_move(&file->f_u.fu_list, list); 347 return smp_processor_id();
347 file_list_unlock(); 348#endif
349}
350
351/* helper for file_sb_list_add to reduce ifdefs */
352static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
353{
354 struct list_head *list;
355#ifdef CONFIG_SMP
356 int cpu;
357 cpu = smp_processor_id();
358 file->f_sb_list_cpu = cpu;
359 list = per_cpu_ptr(sb->s_files, cpu);
360#else
361 list = &sb->s_files;
362#endif
363 list_add(&file->f_u.fu_list, list);
348} 364}
349 365
350void file_kill(struct file *file) 366/**
367 * file_sb_list_add - add a file to the sb's file list
368 * @file: file to add
369 * @sb: sb to add it to
370 *
371 * Use this function to associate a file with the superblock of the inode it
372 * refers to.
373 */
374void file_sb_list_add(struct file *file, struct super_block *sb)
375{
376 lg_local_lock(files_lglock);
377 __file_sb_list_add(file, sb);
378 lg_local_unlock(files_lglock);
379}
380
381/**
382 * file_sb_list_del - remove a file from the sb's file list
383 * @file: file to remove
384 * @sb: sb to remove it from
385 *
386 * Use this function to remove a file from its superblock.
387 */
388void file_sb_list_del(struct file *file)
351{ 389{
352 if (!list_empty(&file->f_u.fu_list)) { 390 if (!list_empty(&file->f_u.fu_list)) {
353 file_list_lock(); 391 lg_local_lock_cpu(files_lglock, file_list_cpu(file));
354 list_del_init(&file->f_u.fu_list); 392 list_del_init(&file->f_u.fu_list);
355 file_list_unlock(); 393 lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
356 } 394 }
357} 395}
358 396
397#ifdef CONFIG_SMP
398
399/*
400 * These macros iterate all files on all CPUs for a given superblock.
401 * files_lglock must be held globally.
402 */
403#define do_file_list_for_each_entry(__sb, __file) \
404{ \
405 int i; \
406 for_each_possible_cpu(i) { \
407 struct list_head *list; \
408 list = per_cpu_ptr((__sb)->s_files, i); \
409 list_for_each_entry((__file), list, f_u.fu_list)
410
411#define while_file_list_for_each_entry \
412 } \
413}
414
415#else
416
417#define do_file_list_for_each_entry(__sb, __file) \
418{ \
419 struct list_head *list; \
420 list = &(sb)->s_files; \
421 list_for_each_entry((__file), list, f_u.fu_list)
422
423#define while_file_list_for_each_entry \
424}
425
426#endif
427
359int fs_may_remount_ro(struct super_block *sb) 428int fs_may_remount_ro(struct super_block *sb)
360{ 429{
361 struct file *file; 430 struct file *file;
362
363 /* Check that no files are currently opened for writing. */ 431 /* Check that no files are currently opened for writing. */
364 file_list_lock(); 432 lg_global_lock(files_lglock);
365 list_for_each_entry(file, &sb->s_files, f_u.fu_list) { 433 do_file_list_for_each_entry(sb, file) {
366 struct inode *inode = file->f_path.dentry->d_inode; 434 struct inode *inode = file->f_path.dentry->d_inode;
367 435
368 /* File with pending delete? */ 436 /* File with pending delete? */
@@ -372,11 +440,11 @@ int fs_may_remount_ro(struct super_block *sb)
372 /* Writeable file? */ 440 /* Writeable file? */
373 if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) 441 if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
374 goto too_bad; 442 goto too_bad;
375 } 443 } while_file_list_for_each_entry;
376 file_list_unlock(); 444 lg_global_unlock(files_lglock);
377 return 1; /* Tis' cool bro. */ 445 return 1; /* Tis' cool bro. */
378too_bad: 446too_bad:
379 file_list_unlock(); 447 lg_global_unlock(files_lglock);
380 return 0; 448 return 0;
381} 449}
382 450
@@ -392,8 +460,8 @@ void mark_files_ro(struct super_block *sb)
392 struct file *f; 460 struct file *f;
393 461
394retry: 462retry:
395 file_list_lock(); 463 lg_global_lock(files_lglock);
396 list_for_each_entry(f, &sb->s_files, f_u.fu_list) { 464 do_file_list_for_each_entry(sb, f) {
397 struct vfsmount *mnt; 465 struct vfsmount *mnt;
398 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) 466 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
399 continue; 467 continue;
@@ -408,16 +476,13 @@ retry:
408 continue; 476 continue;
409 file_release_write(f); 477 file_release_write(f);
410 mnt = mntget(f->f_path.mnt); 478 mnt = mntget(f->f_path.mnt);
411 file_list_unlock(); 479 /* This can sleep, so we can't hold the spinlock. */
412 /* 480 lg_global_unlock(files_lglock);
413 * This can sleep, so we can't hold
414 * the file_list_lock() spinlock.
415 */
416 mnt_drop_write(mnt); 481 mnt_drop_write(mnt);
417 mntput(mnt); 482 mntput(mnt);
418 goto retry; 483 goto retry;
419 } 484 } while_file_list_for_each_entry;
420 file_list_unlock(); 485 lg_global_unlock(files_lglock);
421} 486}
422 487
423void __init files_init(unsigned long mempages) 488void __init files_init(unsigned long mempages)
@@ -437,5 +502,6 @@ void __init files_init(unsigned long mempages)
437 if (files_stat.max_files < NR_FILE) 502 if (files_stat.max_files < NR_FILE)
438 files_stat.max_files = NR_FILE; 503 files_stat.max_files = NR_FILE;
439 files_defer_init(); 504 files_defer_init();
505 lg_lock_init(files_lglock);
440 percpu_counter_init(&nr_files, 0); 506 percpu_counter_init(&nr_files, 0);
441} 507}
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 1ee40eb9a2c0..ed45a9cf5f3d 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -13,11 +13,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
13{ 13{
14 struct path old_root; 14 struct path old_root;
15 15
16 write_lock(&fs->lock); 16 spin_lock(&fs->lock);
17 old_root = fs->root; 17 old_root = fs->root;
18 fs->root = *path; 18 fs->root = *path;
19 path_get(path); 19 path_get(path);
20 write_unlock(&fs->lock); 20 spin_unlock(&fs->lock);
21 if (old_root.dentry) 21 if (old_root.dentry)
22 path_put(&old_root); 22 path_put(&old_root);
23} 23}
@@ -30,11 +30,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
30{ 30{
31 struct path old_pwd; 31 struct path old_pwd;
32 32
33 write_lock(&fs->lock); 33 spin_lock(&fs->lock);
34 old_pwd = fs->pwd; 34 old_pwd = fs->pwd;
35 fs->pwd = *path; 35 fs->pwd = *path;
36 path_get(path); 36 path_get(path);
37 write_unlock(&fs->lock); 37 spin_unlock(&fs->lock);
38 38
39 if (old_pwd.dentry) 39 if (old_pwd.dentry)
40 path_put(&old_pwd); 40 path_put(&old_pwd);
@@ -51,7 +51,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
51 task_lock(p); 51 task_lock(p);
52 fs = p->fs; 52 fs = p->fs;
53 if (fs) { 53 if (fs) {
54 write_lock(&fs->lock); 54 spin_lock(&fs->lock);
55 if (fs->root.dentry == old_root->dentry 55 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) { 56 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root); 57 path_get(new_root);
@@ -64,7 +64,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
64 fs->pwd = *new_root; 64 fs->pwd = *new_root;
65 count++; 65 count++;
66 } 66 }
67 write_unlock(&fs->lock); 67 spin_unlock(&fs->lock);
68 } 68 }
69 task_unlock(p); 69 task_unlock(p);
70 } while_each_thread(g, p); 70 } while_each_thread(g, p);
@@ -87,10 +87,10 @@ void exit_fs(struct task_struct *tsk)
87 if (fs) { 87 if (fs) {
88 int kill; 88 int kill;
89 task_lock(tsk); 89 task_lock(tsk);
90 write_lock(&fs->lock); 90 spin_lock(&fs->lock);
91 tsk->fs = NULL; 91 tsk->fs = NULL;
92 kill = !--fs->users; 92 kill = !--fs->users;
93 write_unlock(&fs->lock); 93 spin_unlock(&fs->lock);
94 task_unlock(tsk); 94 task_unlock(tsk);
95 if (kill) 95 if (kill)
96 free_fs_struct(fs); 96 free_fs_struct(fs);
@@ -104,7 +104,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
104 if (fs) { 104 if (fs) {
105 fs->users = 1; 105 fs->users = 1;
106 fs->in_exec = 0; 106 fs->in_exec = 0;
107 rwlock_init(&fs->lock); 107 spin_lock_init(&fs->lock);
108 fs->umask = old->umask; 108 fs->umask = old->umask;
109 get_fs_root_and_pwd(old, &fs->root, &fs->pwd); 109 get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
110 } 110 }
@@ -121,10 +121,10 @@ int unshare_fs_struct(void)
121 return -ENOMEM; 121 return -ENOMEM;
122 122
123 task_lock(current); 123 task_lock(current);
124 write_lock(&fs->lock); 124 spin_lock(&fs->lock);
125 kill = !--fs->users; 125 kill = !--fs->users;
126 current->fs = new_fs; 126 current->fs = new_fs;
127 write_unlock(&fs->lock); 127 spin_unlock(&fs->lock);
128 task_unlock(current); 128 task_unlock(current);
129 129
130 if (kill) 130 if (kill)
@@ -143,7 +143,7 @@ EXPORT_SYMBOL(current_umask);
143/* to be mentioned only in INIT_TASK */ 143/* to be mentioned only in INIT_TASK */
144struct fs_struct init_fs = { 144struct fs_struct init_fs = {
145 .users = 1, 145 .users = 1,
146 .lock = __RW_LOCK_UNLOCKED(init_fs.lock), 146 .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
147 .umask = 0022, 147 .umask = 0022,
148}; 148};
149 149
@@ -156,14 +156,14 @@ void daemonize_fs_struct(void)
156 156
157 task_lock(current); 157 task_lock(current);
158 158
159 write_lock(&init_fs.lock); 159 spin_lock(&init_fs.lock);
160 init_fs.users++; 160 init_fs.users++;
161 write_unlock(&init_fs.lock); 161 spin_unlock(&init_fs.lock);
162 162
163 write_lock(&fs->lock); 163 spin_lock(&fs->lock);
164 current->fs = &init_fs; 164 current->fs = &init_fs;
165 kill = !--fs->users; 165 kill = !--fs->users;
166 write_unlock(&fs->lock); 166 spin_unlock(&fs->lock);
167 167
168 task_unlock(current); 168 task_unlock(current);
169 if (kill) 169 if (kill)
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 99800e564157..6bc9e3a5a693 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -94,6 +94,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
94 if (error < 0) 94 if (error < 0)
95 goto failed; 95 goto failed;
96 inode->i_mode = mode; 96 inode->i_mode = mode;
97 inode->i_ctime = CURRENT_TIME;
97 if (error == 0) { 98 if (error == 0) {
98 posix_acl_release(acl); 99 posix_acl_release(acl);
99 acl = NULL; 100 acl = NULL;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index dd1e55535a4e..f7dc9b5f9ef8 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -104,7 +104,7 @@ static char *__dentry_name(struct dentry *dentry, char *name)
104 __putname(name); 104 __putname(name);
105 return NULL; 105 return NULL;
106 } 106 }
107 strncpy(name, root, PATH_MAX); 107 strlcpy(name, root, PATH_MAX);
108 if (len > p - name) { 108 if (len > p - name) {
109 __putname(name); 109 __putname(name);
110 return NULL; 110 return NULL;
@@ -876,7 +876,7 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
876 char *path = dentry_name(dentry); 876 char *path = dentry_name(dentry);
877 int err = -ENOMEM; 877 int err = -ENOMEM;
878 if (path) { 878 if (path) {
879 int err = hostfs_do_readlink(path, link, PATH_MAX); 879 err = hostfs_do_readlink(path, link, PATH_MAX);
880 if (err == PATH_MAX) 880 if (err == PATH_MAX)
881 err = -E2BIG; 881 err = -E2BIG;
882 __putname(path); 882 __putname(path);
diff --git a/fs/internal.h b/fs/internal.h
index 6b706bc60a66..a6910e91cee8 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,6 +9,8 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/lglock.h>
13
12struct super_block; 14struct super_block;
13struct linux_binprm; 15struct linux_binprm;
14struct path; 16struct path;
@@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
70 72
71extern void __init mnt_init(void); 73extern void __init mnt_init(void);
72 74
73extern spinlock_t vfsmount_lock; 75DECLARE_BRLOCK(vfsmount_lock);
76
74 77
75/* 78/*
76 * fs_struct.c 79 * fs_struct.c
@@ -80,6 +83,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
80/* 83/*
81 * file_table.c 84 * file_table.c
82 */ 85 */
86extern void file_sb_list_add(struct file *f, struct super_block *sb);
87extern void file_sb_list_del(struct file *f);
83extern void mark_files_ro(struct super_block *); 88extern void mark_files_ro(struct super_block *);
84extern struct file *get_empty_filp(void); 89extern struct file *get_empty_filp(void);
85 90
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index b0435dd0654d..05a38b9c4c0e 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -254,7 +254,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
254{ 254{
255 int i; 255 int i;
256 256
257 ll_rw_block(SWRITE, *batch_count, bhs); 257 for (i = 0; i < *batch_count; i++)
258 write_dirty_buffer(bhs[i], WRITE);
259
258 for (i = 0; i < *batch_count; i++) { 260 for (i = 0; i < *batch_count; i++) {
259 struct buffer_head *bh = bhs[i]; 261 struct buffer_head *bh = bhs[i];
260 clear_buffer_jwrite(bh); 262 clear_buffer_jwrite(bh);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 28a9ddaa0c49..95d8c11c929e 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -119,7 +119,6 @@ static int journal_write_commit_record(journal_t *journal,
119 struct buffer_head *bh; 119 struct buffer_head *bh;
120 journal_header_t *header; 120 journal_header_t *header;
121 int ret; 121 int ret;
122 int barrier_done = 0;
123 122
124 if (is_journal_aborted(journal)) 123 if (is_journal_aborted(journal))
125 return 0; 124 return 0;
@@ -137,34 +136,36 @@ static int journal_write_commit_record(journal_t *journal,
137 136
138 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "write commit block");
139 set_buffer_dirty(bh); 138 set_buffer_dirty(bh);
139
140 if (journal->j_flags & JFS_BARRIER) { 140 if (journal->j_flags & JFS_BARRIER) {
141 set_buffer_ordered(bh); 141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
142 barrier_done = 1;
143 }
144 ret = sync_dirty_buffer(bh);
145 if (barrier_done)
146 clear_buffer_ordered(bh);
147 /* is it possible for another commit to fail at roughly
148 * the same time as this one? If so, we don't want to
149 * trust the barrier flag in the super, but instead want
150 * to remember if we sent a barrier request
151 */
152 if (ret == -EOPNOTSUPP && barrier_done) {
153 char b[BDEVNAME_SIZE];
154 142
155 printk(KERN_WARNING 143 /*
156 "JBD: barrier-based sync failed on %s - " 144 * Is it possible for another commit to fail at roughly
157 "disabling barriers\n", 145 * the same time as this one? If so, we don't want to
158 bdevname(journal->j_dev, b)); 146 * trust the barrier flag in the super, but instead want
159 spin_lock(&journal->j_state_lock); 147 * to remember if we sent a barrier request
160 journal->j_flags &= ~JFS_BARRIER; 148 */
161 spin_unlock(&journal->j_state_lock); 149 if (ret == -EOPNOTSUPP) {
150 char b[BDEVNAME_SIZE];
162 151
163 /* And try again, without the barrier */ 152 printk(KERN_WARNING
164 set_buffer_uptodate(bh); 153 "JBD: barrier-based sync failed on %s - "
165 set_buffer_dirty(bh); 154 "disabling barriers\n",
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JFS_BARRIER;
158 spin_unlock(&journal->j_state_lock);
159
160 /* And try again, without the barrier */
161 set_buffer_uptodate(bh);
162 set_buffer_dirty(bh);
163 ret = sync_dirty_buffer(bh);
164 }
165 } else {
166 ret = sync_dirty_buffer(bh); 166 ret = sync_dirty_buffer(bh);
167 } 167 }
168
168 put_bh(bh); /* One for getblk() */ 169 put_bh(bh); /* One for getblk() */
169 journal_put_journal_head(descriptor); 170 journal_put_journal_head(descriptor);
170 171
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f19ce94693d8..2c4b1f109da9 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1024,7 +1024,7 @@ void journal_update_superblock(journal_t *journal, int wait)
1024 if (wait) 1024 if (wait)
1025 sync_dirty_buffer(bh); 1025 sync_dirty_buffer(bh);
1026 else 1026 else
1027 ll_rw_block(SWRITE, 1, &bh); 1027 write_dirty_buffer(bh, WRITE);
1028 1028
1029out: 1029out:
1030 /* If we have just flushed the log (by marking s_start==0), then 1030 /* If we have just flushed the log (by marking s_start==0), then
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index ad717328343a..d29018307e2e 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -617,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
617 set_buffer_jwrite(bh); 617 set_buffer_jwrite(bh);
618 BUFFER_TRACE(bh, "write"); 618 BUFFER_TRACE(bh, "write");
619 set_buffer_dirty(bh); 619 set_buffer_dirty(bh);
620 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); 620 write_dirty_buffer(bh, write_op);
621} 621}
622#endif 622#endif
623 623
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1c23a0f4e8a3..5247e7ffdcb4 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -255,7 +255,9 @@ __flush_batch(journal_t *journal, int *batch_count)
255{ 255{
256 int i; 256 int i;
257 257
258 ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs); 258 for (i = 0; i < *batch_count; i++)
259 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE);
260
259 for (i = 0; i < *batch_count; i++) { 261 for (i = 0; i < *batch_count; i++) {
260 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 262 struct buffer_head *bh = journal->j_chkpt_bhs[i];
261 clear_buffer_jwrite(bh); 263 clear_buffer_jwrite(bh);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f52e5e8049f1..7c068c189d80 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -101,7 +101,6 @@ static int journal_submit_commit_record(journal_t *journal,
101 struct commit_header *tmp; 101 struct commit_header *tmp;
102 struct buffer_head *bh; 102 struct buffer_head *bh;
103 int ret; 103 int ret;
104 int barrier_done = 0;
105 struct timespec now = current_kernel_time(); 104 struct timespec now = current_kernel_time();
106 105
107 if (is_journal_aborted(journal)) 106 if (is_journal_aborted(journal))
@@ -136,30 +135,22 @@ static int journal_submit_commit_record(journal_t *journal,
136 if (journal->j_flags & JBD2_BARRIER && 135 if (journal->j_flags & JBD2_BARRIER &&
137 !JBD2_HAS_INCOMPAT_FEATURE(journal, 136 !JBD2_HAS_INCOMPAT_FEATURE(journal,
138 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
139 set_buffer_ordered(bh); 138 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
140 barrier_done = 1; 139 if (ret == -EOPNOTSUPP) {
141 } 140 printk(KERN_WARNING
142 ret = submit_bh(WRITE_SYNC_PLUG, bh); 141 "JBD2: Disabling barriers on %s, "
143 if (barrier_done) 142 "not supported by device\n", journal->j_devname);
144 clear_buffer_ordered(bh); 143 write_lock(&journal->j_state_lock);
145 144 journal->j_flags &= ~JBD2_BARRIER;
146 /* is it possible for another commit to fail at roughly 145 write_unlock(&journal->j_state_lock);
147 * the same time as this one? If so, we don't want to
148 * trust the barrier flag in the super, but instead want
149 * to remember if we sent a barrier request
150 */
151 if (ret == -EOPNOTSUPP && barrier_done) {
152 printk(KERN_WARNING
153 "JBD2: Disabling barriers on %s, "
154 "not supported by device\n", journal->j_devname);
155 write_lock(&journal->j_state_lock);
156 journal->j_flags &= ~JBD2_BARRIER;
157 write_unlock(&journal->j_state_lock);
158 146
159 /* And try again, without the barrier */ 147 /* And try again, without the barrier */
160 lock_buffer(bh); 148 lock_buffer(bh);
161 set_buffer_uptodate(bh); 149 set_buffer_uptodate(bh);
162 clear_buffer_dirty(bh); 150 clear_buffer_dirty(bh);
151 ret = submit_bh(WRITE_SYNC_PLUG, bh);
152 }
153 } else {
163 ret = submit_bh(WRITE_SYNC_PLUG, bh); 154 ret = submit_bh(WRITE_SYNC_PLUG, bh);
164 } 155 }
165 *cbh = bh; 156 *cbh = bh;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ad5866aaf0f9..0e8014ea6b94 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1124,7 +1124,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1124 set_buffer_uptodate(bh); 1124 set_buffer_uptodate(bh);
1125 } 1125 }
1126 } else 1126 } else
1127 ll_rw_block(SWRITE, 1, &bh); 1127 write_dirty_buffer(bh, WRITE);
1128 1128
1129out: 1129out:
1130 /* If we have just flushed the log (by marking s_start==0), then 1130 /* If we have just flushed the log (by marking s_start==0), then
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a360b06af2e3..9ad321fd63fd 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -625,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
625 set_buffer_jwrite(bh); 625 set_buffer_jwrite(bh);
626 BUFFER_TRACE(bh, "write"); 626 BUFFER_TRACE(bh, "write");
627 set_buffer_dirty(bh); 627 set_buffer_dirty(bh);
628 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); 628 write_dirty_buffer(bh, write_op);
629} 629}
630#endif 630#endif
631 631
diff --git a/fs/mbcache.c b/fs/mbcache.c
index cf4e6cdfd15b..93444747237b 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -80,6 +80,7 @@ struct mb_cache {
80 struct list_head c_cache_list; 80 struct list_head c_cache_list;
81 const char *c_name; 81 const char *c_name;
82 atomic_t c_entry_count; 82 atomic_t c_entry_count;
83 int c_max_entries;
83 int c_bucket_bits; 84 int c_bucket_bits;
84 struct kmem_cache *c_entry_cache; 85 struct kmem_cache *c_entry_cache;
85 struct list_head *c_block_hash; 86 struct list_head *c_block_hash;
@@ -243,6 +244,12 @@ mb_cache_create(const char *name, int bucket_bits)
243 if (!cache->c_entry_cache) 244 if (!cache->c_entry_cache)
244 goto fail2; 245 goto fail2;
245 246
247 /*
248 * Set an upper limit on the number of cache entries so that the hash
249 * chains won't grow too long.
250 */
251 cache->c_max_entries = bucket_count << 4;
252
246 spin_lock(&mb_cache_spinlock); 253 spin_lock(&mb_cache_spinlock);
247 list_add(&cache->c_cache_list, &mb_cache_list); 254 list_add(&cache->c_cache_list, &mb_cache_list);
248 spin_unlock(&mb_cache_spinlock); 255 spin_unlock(&mb_cache_spinlock);
@@ -333,7 +340,6 @@ mb_cache_destroy(struct mb_cache *cache)
333 kfree(cache); 340 kfree(cache);
334} 341}
335 342
336
337/* 343/*
338 * mb_cache_entry_alloc() 344 * mb_cache_entry_alloc()
339 * 345 *
@@ -345,17 +351,29 @@ mb_cache_destroy(struct mb_cache *cache)
345struct mb_cache_entry * 351struct mb_cache_entry *
346mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) 352mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
347{ 353{
348 struct mb_cache_entry *ce; 354 struct mb_cache_entry *ce = NULL;
349 355
350 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 356 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
351 if (ce) { 357 spin_lock(&mb_cache_spinlock);
358 if (!list_empty(&mb_cache_lru_list)) {
359 ce = list_entry(mb_cache_lru_list.next,
360 struct mb_cache_entry, e_lru_list);
361 list_del_init(&ce->e_lru_list);
362 __mb_cache_entry_unhash(ce);
363 }
364 spin_unlock(&mb_cache_spinlock);
365 }
366 if (!ce) {
367 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
368 if (!ce)
369 return NULL;
352 atomic_inc(&cache->c_entry_count); 370 atomic_inc(&cache->c_entry_count);
353 INIT_LIST_HEAD(&ce->e_lru_list); 371 INIT_LIST_HEAD(&ce->e_lru_list);
354 INIT_LIST_HEAD(&ce->e_block_list); 372 INIT_LIST_HEAD(&ce->e_block_list);
355 ce->e_cache = cache; 373 ce->e_cache = cache;
356 ce->e_used = 1 + MB_CACHE_WRITER;
357 ce->e_queued = 0; 374 ce->e_queued = 0;
358 } 375 }
376 ce->e_used = 1 + MB_CACHE_WRITER;
359 return ce; 377 return ce;
360} 378}
361 379
diff --git a/fs/namei.c b/fs/namei.c
index 17ea76bf2fbe..24896e833565 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -595,15 +595,16 @@ int follow_up(struct path *path)
595{ 595{
596 struct vfsmount *parent; 596 struct vfsmount *parent;
597 struct dentry *mountpoint; 597 struct dentry *mountpoint;
598 spin_lock(&vfsmount_lock); 598
599 br_read_lock(vfsmount_lock);
599 parent = path->mnt->mnt_parent; 600 parent = path->mnt->mnt_parent;
600 if (parent == path->mnt) { 601 if (parent == path->mnt) {
601 spin_unlock(&vfsmount_lock); 602 br_read_unlock(vfsmount_lock);
602 return 0; 603 return 0;
603 } 604 }
604 mntget(parent); 605 mntget(parent);
605 mountpoint = dget(path->mnt->mnt_mountpoint); 606 mountpoint = dget(path->mnt->mnt_mountpoint);
606 spin_unlock(&vfsmount_lock); 607 br_read_unlock(vfsmount_lock);
607 dput(path->dentry); 608 dput(path->dentry);
608 path->dentry = mountpoint; 609 path->dentry = mountpoint;
609 mntput(path->mnt); 610 mntput(path->mnt);
@@ -686,6 +687,35 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
686} 687}
687 688
688/* 689/*
690 * Allocate a dentry with name and parent, and perform a parent
691 * directory ->lookup on it. Returns the new dentry, or ERR_PTR
692 * on error. parent->d_inode->i_mutex must be held. d_lookup must
693 * have verified that no child exists while under i_mutex.
694 */
695static struct dentry *d_alloc_and_lookup(struct dentry *parent,
696 struct qstr *name, struct nameidata *nd)
697{
698 struct inode *inode = parent->d_inode;
699 struct dentry *dentry;
700 struct dentry *old;
701
702 /* Don't create child dentry for a dead directory. */
703 if (unlikely(IS_DEADDIR(inode)))
704 return ERR_PTR(-ENOENT);
705
706 dentry = d_alloc(parent, name);
707 if (unlikely(!dentry))
708 return ERR_PTR(-ENOMEM);
709
710 old = inode->i_op->lookup(inode, dentry, nd);
711 if (unlikely(old)) {
712 dput(dentry);
713 dentry = old;
714 }
715 return dentry;
716}
717
718/*
689 * It's more convoluted than I'd like it to be, but... it's still fairly 719 * It's more convoluted than I'd like it to be, but... it's still fairly
690 * small and for now I'd prefer to have fast path as straight as possible. 720 * small and for now I'd prefer to have fast path as straight as possible.
691 * It _is_ time-critical. 721 * It _is_ time-critical.
@@ -706,9 +736,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
706 return err; 736 return err;
707 } 737 }
708 738
739 /*
740 * Rename seqlock is not required here because in the off chance
741 * of a false negative due to a concurrent rename, we're going to
742 * do the non-racy lookup, below.
743 */
709 dentry = __d_lookup(nd->path.dentry, name); 744 dentry = __d_lookup(nd->path.dentry, name);
710 if (!dentry) 745 if (!dentry)
711 goto need_lookup; 746 goto need_lookup;
747found:
712 if (dentry->d_op && dentry->d_op->d_revalidate) 748 if (dentry->d_op && dentry->d_op->d_revalidate)
713 goto need_revalidate; 749 goto need_revalidate;
714done: 750done:
@@ -724,56 +760,28 @@ need_lookup:
724 mutex_lock(&dir->i_mutex); 760 mutex_lock(&dir->i_mutex);
725 /* 761 /*
726 * First re-do the cached lookup just in case it was created 762 * First re-do the cached lookup just in case it was created
727 * while we waited for the directory semaphore.. 763 * while we waited for the directory semaphore, or the first
764 * lookup failed due to an unrelated rename.
728 * 765 *
729 * FIXME! This could use version numbering or similar to 766 * This could use version numbering or similar to avoid unnecessary
730 * avoid unnecessary cache lookups. 767 * cache lookups, but then we'd have to do the first lookup in the
731 * 768 * non-racy way. However in the common case here, everything should
732 * The "dcache_lock" is purely to protect the RCU list walker 769 * be hot in cache, so would it be a big win?
733 * from concurrent renames at this point (we mustn't get false
734 * negatives from the RCU list walk here, unlike the optimistic
735 * fast walk).
736 *
737 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
738 */ 770 */
739 dentry = d_lookup(parent, name); 771 dentry = d_lookup(parent, name);
740 if (!dentry) { 772 if (likely(!dentry)) {
741 struct dentry *new; 773 dentry = d_alloc_and_lookup(parent, name, nd);
742
743 /* Don't create child dentry for a dead directory. */
744 dentry = ERR_PTR(-ENOENT);
745 if (IS_DEADDIR(dir))
746 goto out_unlock;
747
748 new = d_alloc(parent, name);
749 dentry = ERR_PTR(-ENOMEM);
750 if (new) {
751 dentry = dir->i_op->lookup(dir, new, nd);
752 if (dentry)
753 dput(new);
754 else
755 dentry = new;
756 }
757out_unlock:
758 mutex_unlock(&dir->i_mutex); 774 mutex_unlock(&dir->i_mutex);
759 if (IS_ERR(dentry)) 775 if (IS_ERR(dentry))
760 goto fail; 776 goto fail;
761 goto done; 777 goto done;
762 } 778 }
763
764 /* 779 /*
765 * Uhhuh! Nasty case: the cache was re-populated while 780 * Uhhuh! Nasty case: the cache was re-populated while
766 * we waited on the semaphore. Need to revalidate. 781 * we waited on the semaphore. Need to revalidate.
767 */ 782 */
768 mutex_unlock(&dir->i_mutex); 783 mutex_unlock(&dir->i_mutex);
769 if (dentry->d_op && dentry->d_op->d_revalidate) { 784 goto found;
770 dentry = do_revalidate(dentry, nd);
771 if (!dentry)
772 dentry = ERR_PTR(-ENOENT);
773 }
774 if (IS_ERR(dentry))
775 goto fail;
776 goto done;
777 785
778need_revalidate: 786need_revalidate:
779 dentry = do_revalidate(dentry, nd); 787 dentry = do_revalidate(dentry, nd);
@@ -1130,35 +1138,18 @@ static struct dentry *__lookup_hash(struct qstr *name,
1130 goto out; 1138 goto out;
1131 } 1139 }
1132 1140
1133 dentry = __d_lookup(base, name); 1141 /*
1134 1142 * Don't bother with __d_lookup: callers are for creat as
1135 /* lockess __d_lookup may fail due to concurrent d_move() 1143 * well as unlink, so a lot of the time it would cost
1136 * in some unrelated directory, so try with d_lookup 1144 * a double lookup.
1137 */ 1145 */
1138 if (!dentry) 1146 dentry = d_lookup(base, name);
1139 dentry = d_lookup(base, name);
1140 1147
1141 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 1148 if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1142 dentry = do_revalidate(dentry, nd); 1149 dentry = do_revalidate(dentry, nd);
1143 1150
1144 if (!dentry) { 1151 if (!dentry)
1145 struct dentry *new; 1152 dentry = d_alloc_and_lookup(base, name, nd);
1146
1147 /* Don't create child dentry for a dead directory. */
1148 dentry = ERR_PTR(-ENOENT);
1149 if (IS_DEADDIR(inode))
1150 goto out;
1151
1152 new = d_alloc(base, name);
1153 dentry = ERR_PTR(-ENOMEM);
1154 if (!new)
1155 goto out;
1156 dentry = inode->i_op->lookup(inode, new, nd);
1157 if (!dentry)
1158 dentry = new;
1159 else
1160 dput(new);
1161 }
1162out: 1153out:
1163 return dentry; 1154 return dentry;
1164} 1155}
diff --git a/fs/namespace.c b/fs/namespace.c
index 2e10cb19c5b0..a72eaabfe8f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
11#include <linux/syscalls.h> 11#include <linux/syscalls.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/spinlock.h>
15#include <linux/percpu.h>
14#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
15#include <linux/init.h> 17#include <linux/init.h>
16#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -38,12 +40,10 @@
38#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) 40#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
39#define HASH_SIZE (1UL << HASH_SHIFT) 41#define HASH_SIZE (1UL << HASH_SHIFT)
40 42
41/* spinlock for vfsmount related operations, inplace of dcache_lock */
42__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
43
44static int event; 43static int event;
45static DEFINE_IDA(mnt_id_ida); 44static DEFINE_IDA(mnt_id_ida);
46static DEFINE_IDA(mnt_group_ida); 45static DEFINE_IDA(mnt_group_ida);
46static DEFINE_SPINLOCK(mnt_id_lock);
47static int mnt_id_start = 0; 47static int mnt_id_start = 0;
48static int mnt_group_start = 1; 48static int mnt_group_start = 1;
49 49
@@ -55,6 +55,16 @@ static struct rw_semaphore namespace_sem;
55struct kobject *fs_kobj; 55struct kobject *fs_kobj;
56EXPORT_SYMBOL_GPL(fs_kobj); 56EXPORT_SYMBOL_GPL(fs_kobj);
57 57
58/*
59 * vfsmount lock may be taken for read to prevent changes to the
60 * vfsmount hash, ie. during mountpoint lookups or walking back
61 * up the tree.
62 *
63 * It should be taken for write in all cases where the vfsmount
64 * tree or hash is modified or when a vfsmount structure is modified.
65 */
66DEFINE_BRLOCK(vfsmount_lock);
67
58static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) 68static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
59{ 69{
60 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); 70 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -65,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
65 75
66#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) 76#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
67 77
68/* allocation is serialized by namespace_sem */ 78/*
79 * allocation is serialized by namespace_sem, but we need the spinlock to
80 * serialize with freeing.
81 */
69static int mnt_alloc_id(struct vfsmount *mnt) 82static int mnt_alloc_id(struct vfsmount *mnt)
70{ 83{
71 int res; 84 int res;
72 85
73retry: 86retry:
74 ida_pre_get(&mnt_id_ida, GFP_KERNEL); 87 ida_pre_get(&mnt_id_ida, GFP_KERNEL);
75 spin_lock(&vfsmount_lock); 88 spin_lock(&mnt_id_lock);
76 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); 89 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
77 if (!res) 90 if (!res)
78 mnt_id_start = mnt->mnt_id + 1; 91 mnt_id_start = mnt->mnt_id + 1;
79 spin_unlock(&vfsmount_lock); 92 spin_unlock(&mnt_id_lock);
80 if (res == -EAGAIN) 93 if (res == -EAGAIN)
81 goto retry; 94 goto retry;
82 95
@@ -86,11 +99,11 @@ retry:
86static void mnt_free_id(struct vfsmount *mnt) 99static void mnt_free_id(struct vfsmount *mnt)
87{ 100{
88 int id = mnt->mnt_id; 101 int id = mnt->mnt_id;
89 spin_lock(&vfsmount_lock); 102 spin_lock(&mnt_id_lock);
90 ida_remove(&mnt_id_ida, id); 103 ida_remove(&mnt_id_ida, id);
91 if (mnt_id_start > id) 104 if (mnt_id_start > id)
92 mnt_id_start = id; 105 mnt_id_start = id;
93 spin_unlock(&vfsmount_lock); 106 spin_unlock(&mnt_id_lock);
94} 107}
95 108
96/* 109/*
@@ -348,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
348{ 361{
349 int ret = 0; 362 int ret = 0;
350 363
351 spin_lock(&vfsmount_lock); 364 br_write_lock(vfsmount_lock);
352 mnt->mnt_flags |= MNT_WRITE_HOLD; 365 mnt->mnt_flags |= MNT_WRITE_HOLD;
353 /* 366 /*
354 * After storing MNT_WRITE_HOLD, we'll read the counters. This store 367 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -382,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
382 */ 395 */
383 smp_wmb(); 396 smp_wmb();
384 mnt->mnt_flags &= ~MNT_WRITE_HOLD; 397 mnt->mnt_flags &= ~MNT_WRITE_HOLD;
385 spin_unlock(&vfsmount_lock); 398 br_write_unlock(vfsmount_lock);
386 return ret; 399 return ret;
387} 400}
388 401
389static void __mnt_unmake_readonly(struct vfsmount *mnt) 402static void __mnt_unmake_readonly(struct vfsmount *mnt)
390{ 403{
391 spin_lock(&vfsmount_lock); 404 br_write_lock(vfsmount_lock);
392 mnt->mnt_flags &= ~MNT_READONLY; 405 mnt->mnt_flags &= ~MNT_READONLY;
393 spin_unlock(&vfsmount_lock); 406 br_write_unlock(vfsmount_lock);
394} 407}
395 408
396void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 409void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -414,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
414/* 427/*
415 * find the first or last mount at @dentry on vfsmount @mnt depending on 428 * find the first or last mount at @dentry on vfsmount @mnt depending on
416 * @dir. If @dir is set return the first mount else return the last mount. 429 * @dir. If @dir is set return the first mount else return the last mount.
430 * vfsmount_lock must be held for read or write.
417 */ 431 */
418struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, 432struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
419 int dir) 433 int dir)
@@ -443,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
443struct vfsmount *lookup_mnt(struct path *path) 457struct vfsmount *lookup_mnt(struct path *path)
444{ 458{
445 struct vfsmount *child_mnt; 459 struct vfsmount *child_mnt;
446 spin_lock(&vfsmount_lock); 460
461 br_read_lock(vfsmount_lock);
447 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) 462 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
448 mntget(child_mnt); 463 mntget(child_mnt);
449 spin_unlock(&vfsmount_lock); 464 br_read_unlock(vfsmount_lock);
450 return child_mnt; 465 return child_mnt;
451} 466}
452 467
@@ -455,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
455 return mnt->mnt_ns == current->nsproxy->mnt_ns; 470 return mnt->mnt_ns == current->nsproxy->mnt_ns;
456} 471}
457 472
473/*
474 * vfsmount lock must be held for write
475 */
458static void touch_mnt_namespace(struct mnt_namespace *ns) 476static void touch_mnt_namespace(struct mnt_namespace *ns)
459{ 477{
460 if (ns) { 478 if (ns) {
@@ -463,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
463 } 481 }
464} 482}
465 483
484/*
485 * vfsmount lock must be held for write
486 */
466static void __touch_mnt_namespace(struct mnt_namespace *ns) 487static void __touch_mnt_namespace(struct mnt_namespace *ns)
467{ 488{
468 if (ns && ns->event != event) { 489 if (ns && ns->event != event) {
@@ -471,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
471 } 492 }
472} 493}
473 494
495/*
496 * vfsmount lock must be held for write
497 */
474static void detach_mnt(struct vfsmount *mnt, struct path *old_path) 498static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
475{ 499{
476 old_path->dentry = mnt->mnt_mountpoint; 500 old_path->dentry = mnt->mnt_mountpoint;
@@ -482,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
482 old_path->dentry->d_mounted--; 506 old_path->dentry->d_mounted--;
483} 507}
484 508
509/*
510 * vfsmount lock must be held for write
511 */
485void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, 512void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
486 struct vfsmount *child_mnt) 513 struct vfsmount *child_mnt)
487{ 514{
@@ -490,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
490 dentry->d_mounted++; 517 dentry->d_mounted++;
491} 518}
492 519
520/*
521 * vfsmount lock must be held for write
522 */
493static void attach_mnt(struct vfsmount *mnt, struct path *path) 523static void attach_mnt(struct vfsmount *mnt, struct path *path)
494{ 524{
495 mnt_set_mountpoint(path->mnt, path->dentry, mnt); 525 mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -499,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
499} 529}
500 530
501/* 531/*
502 * the caller must hold vfsmount_lock 532 * vfsmount lock must be held for write
503 */ 533 */
504static void commit_tree(struct vfsmount *mnt) 534static void commit_tree(struct vfsmount *mnt)
505{ 535{
@@ -623,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
623void mntput_no_expire(struct vfsmount *mnt) 653void mntput_no_expire(struct vfsmount *mnt)
624{ 654{
625repeat: 655repeat:
626 if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { 656 if (atomic_add_unless(&mnt->mnt_count, -1, 1))
627 if (likely(!mnt->mnt_pinned)) { 657 return;
628 spin_unlock(&vfsmount_lock); 658 br_write_lock(vfsmount_lock);
629 __mntput(mnt); 659 if (!atomic_dec_and_test(&mnt->mnt_count)) {
630 return; 660 br_write_unlock(vfsmount_lock);
631 } 661 return;
632 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
633 mnt->mnt_pinned = 0;
634 spin_unlock(&vfsmount_lock);
635 acct_auto_close_mnt(mnt);
636 goto repeat;
637 } 662 }
663 if (likely(!mnt->mnt_pinned)) {
664 br_write_unlock(vfsmount_lock);
665 __mntput(mnt);
666 return;
667 }
668 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
669 mnt->mnt_pinned = 0;
670 br_write_unlock(vfsmount_lock);
671 acct_auto_close_mnt(mnt);
672 goto repeat;
638} 673}
639
640EXPORT_SYMBOL(mntput_no_expire); 674EXPORT_SYMBOL(mntput_no_expire);
641 675
642void mnt_pin(struct vfsmount *mnt) 676void mnt_pin(struct vfsmount *mnt)
643{ 677{
644 spin_lock(&vfsmount_lock); 678 br_write_lock(vfsmount_lock);
645 mnt->mnt_pinned++; 679 mnt->mnt_pinned++;
646 spin_unlock(&vfsmount_lock); 680 br_write_unlock(vfsmount_lock);
647} 681}
648 682
649EXPORT_SYMBOL(mnt_pin); 683EXPORT_SYMBOL(mnt_pin);
650 684
651void mnt_unpin(struct vfsmount *mnt) 685void mnt_unpin(struct vfsmount *mnt)
652{ 686{
653 spin_lock(&vfsmount_lock); 687 br_write_lock(vfsmount_lock);
654 if (mnt->mnt_pinned) { 688 if (mnt->mnt_pinned) {
655 atomic_inc(&mnt->mnt_count); 689 atomic_inc(&mnt->mnt_count);
656 mnt->mnt_pinned--; 690 mnt->mnt_pinned--;
657 } 691 }
658 spin_unlock(&vfsmount_lock); 692 br_write_unlock(vfsmount_lock);
659} 693}
660 694
661EXPORT_SYMBOL(mnt_unpin); 695EXPORT_SYMBOL(mnt_unpin);
@@ -746,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
746 struct mnt_namespace *ns = p->ns; 780 struct mnt_namespace *ns = p->ns;
747 int res = 0; 781 int res = 0;
748 782
749 spin_lock(&vfsmount_lock); 783 br_read_lock(vfsmount_lock);
750 if (p->event != ns->event) { 784 if (p->event != ns->event) {
751 p->event = ns->event; 785 p->event = ns->event;
752 res = 1; 786 res = 1;
753 } 787 }
754 spin_unlock(&vfsmount_lock); 788 br_read_unlock(vfsmount_lock);
755 789
756 return res; 790 return res;
757} 791}
@@ -952,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
952 int minimum_refs = 0; 986 int minimum_refs = 0;
953 struct vfsmount *p; 987 struct vfsmount *p;
954 988
955 spin_lock(&vfsmount_lock); 989 br_read_lock(vfsmount_lock);
956 for (p = mnt; p; p = next_mnt(p, mnt)) { 990 for (p = mnt; p; p = next_mnt(p, mnt)) {
957 actual_refs += atomic_read(&p->mnt_count); 991 actual_refs += atomic_read(&p->mnt_count);
958 minimum_refs += 2; 992 minimum_refs += 2;
959 } 993 }
960 spin_unlock(&vfsmount_lock); 994 br_read_unlock(vfsmount_lock);
961 995
962 if (actual_refs > minimum_refs) 996 if (actual_refs > minimum_refs)
963 return 0; 997 return 0;
@@ -984,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
984{ 1018{
985 int ret = 1; 1019 int ret = 1;
986 down_read(&namespace_sem); 1020 down_read(&namespace_sem);
987 spin_lock(&vfsmount_lock); 1021 br_read_lock(vfsmount_lock);
988 if (propagate_mount_busy(mnt, 2)) 1022 if (propagate_mount_busy(mnt, 2))
989 ret = 0; 1023 ret = 0;
990 spin_unlock(&vfsmount_lock); 1024 br_read_unlock(vfsmount_lock);
991 up_read(&namespace_sem); 1025 up_read(&namespace_sem);
992 return ret; 1026 return ret;
993} 1027}
@@ -1003,13 +1037,14 @@ void release_mounts(struct list_head *head)
1003 if (mnt->mnt_parent != mnt) { 1037 if (mnt->mnt_parent != mnt) {
1004 struct dentry *dentry; 1038 struct dentry *dentry;
1005 struct vfsmount *m; 1039 struct vfsmount *m;
1006 spin_lock(&vfsmount_lock); 1040
1041 br_write_lock(vfsmount_lock);
1007 dentry = mnt->mnt_mountpoint; 1042 dentry = mnt->mnt_mountpoint;
1008 m = mnt->mnt_parent; 1043 m = mnt->mnt_parent;
1009 mnt->mnt_mountpoint = mnt->mnt_root; 1044 mnt->mnt_mountpoint = mnt->mnt_root;
1010 mnt->mnt_parent = mnt; 1045 mnt->mnt_parent = mnt;
1011 m->mnt_ghosts--; 1046 m->mnt_ghosts--;
1012 spin_unlock(&vfsmount_lock); 1047 br_write_unlock(vfsmount_lock);
1013 dput(dentry); 1048 dput(dentry);
1014 mntput(m); 1049 mntput(m);
1015 } 1050 }
@@ -1017,6 +1052,10 @@ void release_mounts(struct list_head *head)
1017 } 1052 }
1018} 1053}
1019 1054
1055/*
1056 * vfsmount lock must be held for write
1057 * namespace_sem must be held for write
1058 */
1020void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1059void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1021{ 1060{
1022 struct vfsmount *p; 1061 struct vfsmount *p;
@@ -1107,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1107 } 1146 }
1108 1147
1109 down_write(&namespace_sem); 1148 down_write(&namespace_sem);
1110 spin_lock(&vfsmount_lock); 1149 br_write_lock(vfsmount_lock);
1111 event++; 1150 event++;
1112 1151
1113 if (!(flags & MNT_DETACH)) 1152 if (!(flags & MNT_DETACH))
@@ -1119,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1119 umount_tree(mnt, 1, &umount_list); 1158 umount_tree(mnt, 1, &umount_list);
1120 retval = 0; 1159 retval = 0;
1121 } 1160 }
1122 spin_unlock(&vfsmount_lock); 1161 br_write_unlock(vfsmount_lock);
1123 up_write(&namespace_sem); 1162 up_write(&namespace_sem);
1124 release_mounts(&umount_list); 1163 release_mounts(&umount_list);
1125 return retval; 1164 return retval;
@@ -1231,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
1231 q = clone_mnt(p, p->mnt_root, flag); 1270 q = clone_mnt(p, p->mnt_root, flag);
1232 if (!q) 1271 if (!q)
1233 goto Enomem; 1272 goto Enomem;
1234 spin_lock(&vfsmount_lock); 1273 br_write_lock(vfsmount_lock);
1235 list_add_tail(&q->mnt_list, &res->mnt_list); 1274 list_add_tail(&q->mnt_list, &res->mnt_list);
1236 attach_mnt(q, &path); 1275 attach_mnt(q, &path);
1237 spin_unlock(&vfsmount_lock); 1276 br_write_unlock(vfsmount_lock);
1238 } 1277 }
1239 } 1278 }
1240 return res; 1279 return res;
1241Enomem: 1280Enomem:
1242 if (res) { 1281 if (res) {
1243 LIST_HEAD(umount_list); 1282 LIST_HEAD(umount_list);
1244 spin_lock(&vfsmount_lock); 1283 br_write_lock(vfsmount_lock);
1245 umount_tree(res, 0, &umount_list); 1284 umount_tree(res, 0, &umount_list);
1246 spin_unlock(&vfsmount_lock); 1285 br_write_unlock(vfsmount_lock);
1247 release_mounts(&umount_list); 1286 release_mounts(&umount_list);
1248 } 1287 }
1249 return NULL; 1288 return NULL;
@@ -1262,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
1262{ 1301{
1263 LIST_HEAD(umount_list); 1302 LIST_HEAD(umount_list);
1264 down_write(&namespace_sem); 1303 down_write(&namespace_sem);
1265 spin_lock(&vfsmount_lock); 1304 br_write_lock(vfsmount_lock);
1266 umount_tree(mnt, 0, &umount_list); 1305 umount_tree(mnt, 0, &umount_list);
1267 spin_unlock(&vfsmount_lock); 1306 br_write_unlock(vfsmount_lock);
1268 up_write(&namespace_sem); 1307 up_write(&namespace_sem);
1269 release_mounts(&umount_list); 1308 release_mounts(&umount_list);
1270} 1309}
@@ -1392,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1392 if (err) 1431 if (err)
1393 goto out_cleanup_ids; 1432 goto out_cleanup_ids;
1394 1433
1395 spin_lock(&vfsmount_lock); 1434 br_write_lock(vfsmount_lock);
1396 1435
1397 if (IS_MNT_SHARED(dest_mnt)) { 1436 if (IS_MNT_SHARED(dest_mnt)) {
1398 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1437 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1411,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1411 list_del_init(&child->mnt_hash); 1450 list_del_init(&child->mnt_hash);
1412 commit_tree(child); 1451 commit_tree(child);
1413 } 1452 }
1414 spin_unlock(&vfsmount_lock); 1453 br_write_unlock(vfsmount_lock);
1454
1415 return 0; 1455 return 0;
1416 1456
1417 out_cleanup_ids: 1457 out_cleanup_ids:
@@ -1444,13 +1484,30 @@ out_unlock:
1444} 1484}
1445 1485
1446/* 1486/*
1487 * Sanity check the flags to change_mnt_propagation.
1488 */
1489
1490static int flags_to_propagation_type(int flags)
1491{
1492 int type = flags & ~MS_REC;
1493
1494 /* Fail if any non-propagation flags are set */
1495 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1496 return 0;
1497 /* Only one propagation flag should be set */
1498 if (!is_power_of_2(type))
1499 return 0;
1500 return type;
1501}
1502
1503/*
1447 * recursively change the type of the mountpoint. 1504 * recursively change the type of the mountpoint.
1448 */ 1505 */
1449static int do_change_type(struct path *path, int flag) 1506static int do_change_type(struct path *path, int flag)
1450{ 1507{
1451 struct vfsmount *m, *mnt = path->mnt; 1508 struct vfsmount *m, *mnt = path->mnt;
1452 int recurse = flag & MS_REC; 1509 int recurse = flag & MS_REC;
1453 int type = flag & ~MS_REC; 1510 int type;
1454 int err = 0; 1511 int err = 0;
1455 1512
1456 if (!capable(CAP_SYS_ADMIN)) 1513 if (!capable(CAP_SYS_ADMIN))
@@ -1459,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
1459 if (path->dentry != path->mnt->mnt_root) 1516 if (path->dentry != path->mnt->mnt_root)
1460 return -EINVAL; 1517 return -EINVAL;
1461 1518
1519 type = flags_to_propagation_type(flag);
1520 if (!type)
1521 return -EINVAL;
1522
1462 down_write(&namespace_sem); 1523 down_write(&namespace_sem);
1463 if (type == MS_SHARED) { 1524 if (type == MS_SHARED) {
1464 err = invent_group_ids(mnt, recurse); 1525 err = invent_group_ids(mnt, recurse);
@@ -1466,10 +1527,10 @@ static int do_change_type(struct path *path, int flag)
1466 goto out_unlock; 1527 goto out_unlock;
1467 } 1528 }
1468 1529
1469 spin_lock(&vfsmount_lock); 1530 br_write_lock(vfsmount_lock);
1470 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 1531 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
1471 change_mnt_propagation(m, type); 1532 change_mnt_propagation(m, type);
1472 spin_unlock(&vfsmount_lock); 1533 br_write_unlock(vfsmount_lock);
1473 1534
1474 out_unlock: 1535 out_unlock:
1475 up_write(&namespace_sem); 1536 up_write(&namespace_sem);
@@ -1513,9 +1574,10 @@ static int do_loopback(struct path *path, char *old_name,
1513 err = graft_tree(mnt, path); 1574 err = graft_tree(mnt, path);
1514 if (err) { 1575 if (err) {
1515 LIST_HEAD(umount_list); 1576 LIST_HEAD(umount_list);
1516 spin_lock(&vfsmount_lock); 1577
1578 br_write_lock(vfsmount_lock);
1517 umount_tree(mnt, 0, &umount_list); 1579 umount_tree(mnt, 0, &umount_list);
1518 spin_unlock(&vfsmount_lock); 1580 br_write_unlock(vfsmount_lock);
1519 release_mounts(&umount_list); 1581 release_mounts(&umount_list);
1520 } 1582 }
1521 1583
@@ -1568,16 +1630,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1568 else 1630 else
1569 err = do_remount_sb(sb, flags, data, 0); 1631 err = do_remount_sb(sb, flags, data, 0);
1570 if (!err) { 1632 if (!err) {
1571 spin_lock(&vfsmount_lock); 1633 br_write_lock(vfsmount_lock);
1572 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; 1634 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
1573 path->mnt->mnt_flags = mnt_flags; 1635 path->mnt->mnt_flags = mnt_flags;
1574 spin_unlock(&vfsmount_lock); 1636 br_write_unlock(vfsmount_lock);
1575 } 1637 }
1576 up_write(&sb->s_umount); 1638 up_write(&sb->s_umount);
1577 if (!err) { 1639 if (!err) {
1578 spin_lock(&vfsmount_lock); 1640 br_write_lock(vfsmount_lock);
1579 touch_mnt_namespace(path->mnt->mnt_ns); 1641 touch_mnt_namespace(path->mnt->mnt_ns);
1580 spin_unlock(&vfsmount_lock); 1642 br_write_unlock(vfsmount_lock);
1581 } 1643 }
1582 return err; 1644 return err;
1583} 1645}
@@ -1754,7 +1816,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1754 return; 1816 return;
1755 1817
1756 down_write(&namespace_sem); 1818 down_write(&namespace_sem);
1757 spin_lock(&vfsmount_lock); 1819 br_write_lock(vfsmount_lock);
1758 1820
1759 /* extract from the expiration list every vfsmount that matches the 1821 /* extract from the expiration list every vfsmount that matches the
1760 * following criteria: 1822 * following criteria:
@@ -1773,7 +1835,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1773 touch_mnt_namespace(mnt->mnt_ns); 1835 touch_mnt_namespace(mnt->mnt_ns);
1774 umount_tree(mnt, 1, &umounts); 1836 umount_tree(mnt, 1, &umounts);
1775 } 1837 }
1776 spin_unlock(&vfsmount_lock); 1838 br_write_unlock(vfsmount_lock);
1777 up_write(&namespace_sem); 1839 up_write(&namespace_sem);
1778 1840
1779 release_mounts(&umounts); 1841 release_mounts(&umounts);
@@ -1830,6 +1892,8 @@ resume:
1830/* 1892/*
1831 * process a list of expirable mountpoints with the intent of discarding any 1893 * process a list of expirable mountpoints with the intent of discarding any
1832 * submounts of a specific parent mountpoint 1894 * submounts of a specific parent mountpoint
1895 *
1896 * vfsmount_lock must be held for write
1833 */ 1897 */
1834static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) 1898static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
1835{ 1899{
@@ -2048,9 +2112,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2048 kfree(new_ns); 2112 kfree(new_ns);
2049 return ERR_PTR(-ENOMEM); 2113 return ERR_PTR(-ENOMEM);
2050 } 2114 }
2051 spin_lock(&vfsmount_lock); 2115 br_write_lock(vfsmount_lock);
2052 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 2116 list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
2053 spin_unlock(&vfsmount_lock); 2117 br_write_unlock(vfsmount_lock);
2054 2118
2055 /* 2119 /*
2056 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 2120 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2244,7 +2308,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2244 goto out2; /* not attached */ 2308 goto out2; /* not attached */
2245 /* make sure we can reach put_old from new_root */ 2309 /* make sure we can reach put_old from new_root */
2246 tmp = old.mnt; 2310 tmp = old.mnt;
2247 spin_lock(&vfsmount_lock); 2311 br_write_lock(vfsmount_lock);
2248 if (tmp != new.mnt) { 2312 if (tmp != new.mnt) {
2249 for (;;) { 2313 for (;;) {
2250 if (tmp->mnt_parent == tmp) 2314 if (tmp->mnt_parent == tmp)
@@ -2264,7 +2328,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2264 /* mount new_root on / */ 2328 /* mount new_root on / */
2265 attach_mnt(new.mnt, &root_parent); 2329 attach_mnt(new.mnt, &root_parent);
2266 touch_mnt_namespace(current->nsproxy->mnt_ns); 2330 touch_mnt_namespace(current->nsproxy->mnt_ns);
2267 spin_unlock(&vfsmount_lock); 2331 br_write_unlock(vfsmount_lock);
2268 chroot_fs_refs(&root, &new); 2332 chroot_fs_refs(&root, &new);
2269 error = 0; 2333 error = 0;
2270 path_put(&root_parent); 2334 path_put(&root_parent);
@@ -2279,7 +2343,7 @@ out1:
2279out0: 2343out0:
2280 return error; 2344 return error;
2281out3: 2345out3:
2282 spin_unlock(&vfsmount_lock); 2346 br_write_unlock(vfsmount_lock);
2283 goto out2; 2347 goto out2;
2284} 2348}
2285 2349
@@ -2326,6 +2390,8 @@ void __init mnt_init(void)
2326 for (u = 0; u < HASH_SIZE; u++) 2390 for (u = 0; u < HASH_SIZE; u++)
2327 INIT_LIST_HEAD(&mount_hashtable[u]); 2391 INIT_LIST_HEAD(&mount_hashtable[u]);
2328 2392
2393 br_lock_init(vfsmount_lock);
2394
2329 err = sysfs_init(); 2395 err = sysfs_init();
2330 if (err) 2396 if (err)
2331 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 2397 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2344,9 +2410,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
2344 if (!atomic_dec_and_test(&ns->count)) 2410 if (!atomic_dec_and_test(&ns->count))
2345 return; 2411 return;
2346 down_write(&namespace_sem); 2412 down_write(&namespace_sem);
2347 spin_lock(&vfsmount_lock); 2413 br_write_lock(vfsmount_lock);
2348 umount_tree(ns->root, 0, &umount_list); 2414 umount_tree(ns->root, 0, &umount_list);
2349 spin_unlock(&vfsmount_lock); 2415 br_write_unlock(vfsmount_lock);
2350 up_write(&namespace_sem); 2416 up_write(&namespace_sem);
2351 release_mounts(&umount_list); 2417 release_mounts(&umount_list);
2352 kfree(ns); 2418 kfree(ns);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 26a510a7be09..6c2aad49d731 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -63,7 +63,6 @@ config NFS_V3_ACL
63config NFS_V4 63config NFS_V4
64 bool "NFS client support for NFS version 4" 64 bool "NFS client support for NFS version 4"
65 depends on NFS_FS 65 depends on NFS_FS
66 select RPCSEC_GSS_KRB5
67 help 66 help
68 This option enables support for version 4 of the NFS protocol 67 This option enables support for version 4 of the NFS protocol
69 (RFC 3530) in the kernel's NFS client. 68 (RFC 3530) in the kernel's NFS client.
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 29539ceeb745..e257172d438c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -140,6 +140,13 @@ nfs_opendir(struct inode *inode, struct file *filp)
140 140
141 /* Call generic open code in order to cache credentials */ 141 /* Call generic open code in order to cache credentials */
142 res = nfs_open(inode, filp); 142 res = nfs_open(inode, filp);
143 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
144 /* This is a mountpoint, so d_revalidate will never
145 * have been called, so we need to refresh the
146 * inode (for close-open consistency) ourselves.
147 */
148 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
149 }
143 return res; 150 return res;
144} 151}
145 152
@@ -1103,7 +1110,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1103 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) 1110 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1104 goto no_open_dput; 1111 goto no_open_dput;
1105 /* We can't create new files, or truncate existing ones here */ 1112 /* We can't create new files, or truncate existing ones here */
1106 openflags &= ~(O_CREAT|O_TRUNC); 1113 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
1107 1114
1108 /* 1115 /*
1109 * Note: we're not holding inode->i_mutex and so may be racing with 1116 * Note: we're not holding inode->i_mutex and so may be racing with
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2d141a74ae82..eb51bd6201da 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -323,7 +323,7 @@ nfs_file_fsync(struct file *file, int datasync)
323 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 323 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
324 if (have_error) 324 if (have_error)
325 ret = xchg(&ctx->error, 0); 325 ret = xchg(&ctx->error, 0);
326 if (!ret) 326 if (!ret && status < 0)
327 ret = status; 327 ret = status;
328 return ret; 328 return ret;
329} 329}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7ffbb98ddec3..089da5b5d20a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2036,7 +2036,8 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2036 struct rpc_cred *cred; 2036 struct rpc_cred *cred;
2037 struct nfs4_state *state; 2037 struct nfs4_state *state;
2038 struct dentry *res; 2038 struct dentry *res;
2039 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); 2039 int open_flags = nd->intent.open.flags;
2040 fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
2040 2041
2041 if (nd->flags & LOOKUP_CREATE) { 2042 if (nd->flags & LOOKUP_CREATE) {
2042 attr.ia_mode = nd->intent.open.create_mode; 2043 attr.ia_mode = nd->intent.open.create_mode;
@@ -2044,8 +2045,9 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2044 if (!IS_POSIXACL(dir)) 2045 if (!IS_POSIXACL(dir))
2045 attr.ia_mode &= ~current_umask(); 2046 attr.ia_mode &= ~current_umask();
2046 } else { 2047 } else {
2048 open_flags &= ~O_EXCL;
2047 attr.ia_valid = 0; 2049 attr.ia_valid = 0;
2048 BUG_ON(nd->intent.open.flags & O_CREAT); 2050 BUG_ON(open_flags & O_CREAT);
2049 } 2051 }
2050 2052
2051 cred = rpc_lookup_cred(); 2053 cred = rpc_lookup_cred();
@@ -2054,7 +2056,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2054 parent = dentry->d_parent; 2056 parent = dentry->d_parent;
2055 /* Protect against concurrent sillydeletes */ 2057 /* Protect against concurrent sillydeletes */
2056 nfs_block_sillyrename(parent); 2058 nfs_block_sillyrename(parent);
2057 state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred); 2059 state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred);
2058 put_rpccred(cred); 2060 put_rpccred(cred);
2059 if (IS_ERR(state)) { 2061 if (IS_ERR(state)) {
2060 if (PTR_ERR(state) == -ENOENT) { 2062 if (PTR_ERR(state) == -ENOENT) {
@@ -2273,8 +2275,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct
2273out: 2275out:
2274 if (page) 2276 if (page)
2275 __free_page(page); 2277 __free_page(page);
2276 if (locations) 2278 kfree(locations);
2277 kfree(locations);
2278 return status; 2279 return status;
2279} 2280}
2280 2281
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ee26316ad1f4..ec3966e4706b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -655,6 +655,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
655 655
656 if (nfss->options & NFS_OPTION_FSCACHE) 656 if (nfss->options & NFS_OPTION_FSCACHE)
657 seq_printf(m, ",fsc"); 657 seq_printf(m, ",fsc");
658
659 if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
660 if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
661 seq_printf(m, ",lookupcache=none");
662 else
663 seq_printf(m, ",lookupcache=pos");
664 }
658} 665}
659 666
660/* 667/*
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 503b9da159a3..95932f523aef 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -69,7 +69,6 @@ config NFSD_V4
69 depends on NFSD && PROC_FS && EXPERIMENTAL 69 depends on NFSD && PROC_FS && EXPERIMENTAL
70 select NFSD_V3 70 select NFSD_V3
71 select FS_POSIX_ACL 71 select FS_POSIX_ACL
72 select RPCSEC_GSS_KRB5
73 help 72 help
74 This option enables support in your system's NFS server for 73 This option enables support in your system's NFS server for
75 version 4 of the NFS protocol (RFC 3530). 74 version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e7357104cfd..3dfef0623968 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2450,14 +2450,13 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
2450static __be32 2450static __be32
2451nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) 2451nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
2452{ 2452{
2453 u32 op_share_access, new_access; 2453 u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
2454 bool new_access;
2454 __be32 status; 2455 __be32 status;
2455 2456
2456 set_access(&new_access, stp->st_access_bmap); 2457 new_access = !test_bit(op_share_access, &stp->st_access_bmap);
2457 new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK;
2458
2459 if (new_access) { 2458 if (new_access) {
2460 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access); 2459 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access);
2461 if (status) 2460 if (status)
2462 return status; 2461 return status;
2463 } 2462 }
@@ -2470,7 +2469,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
2470 return status; 2469 return status;
2471 } 2470 }
2472 /* remember the open */ 2471 /* remember the open */
2473 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
2474 __set_bit(op_share_access, &stp->st_access_bmap); 2472 __set_bit(op_share_access, &stp->st_access_bmap);
2475 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 2473 __set_bit(open->op_share_deny, &stp->st_deny_bmap);
2476 2474
@@ -2983,7 +2981,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2983 *filpp = find_readable_file(stp->st_file); 2981 *filpp = find_readable_file(stp->st_file);
2984 else 2982 else
2985 *filpp = find_writeable_file(stp->st_file); 2983 *filpp = find_writeable_file(stp->st_file);
2986 BUG_ON(!*filpp); /* assured by check_openmode */
2987 } 2984 }
2988 } 2985 }
2989 status = nfs_ok; 2986 status = nfs_ok;
@@ -3561,7 +3558,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3561 struct nfs4_stateowner *open_sop = NULL; 3558 struct nfs4_stateowner *open_sop = NULL;
3562 struct nfs4_stateowner *lock_sop = NULL; 3559 struct nfs4_stateowner *lock_sop = NULL;
3563 struct nfs4_stateid *lock_stp; 3560 struct nfs4_stateid *lock_stp;
3564 struct file *filp; 3561 struct nfs4_file *fp;
3562 struct file *filp = NULL;
3565 struct file_lock file_lock; 3563 struct file_lock file_lock;
3566 struct file_lock conflock; 3564 struct file_lock conflock;
3567 __be32 status = 0; 3565 __be32 status = 0;
@@ -3591,7 +3589,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3591 * lock stateid. 3589 * lock stateid.
3592 */ 3590 */
3593 struct nfs4_stateid *open_stp = NULL; 3591 struct nfs4_stateid *open_stp = NULL;
3594 struct nfs4_file *fp;
3595 3592
3596 status = nfserr_stale_clientid; 3593 status = nfserr_stale_clientid;
3597 if (!nfsd4_has_session(cstate) && 3594 if (!nfsd4_has_session(cstate) &&
@@ -3634,6 +3631,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3634 if (status) 3631 if (status)
3635 goto out; 3632 goto out;
3636 lock_sop = lock->lk_replay_owner; 3633 lock_sop = lock->lk_replay_owner;
3634 fp = lock_stp->st_file;
3637 } 3635 }
3638 /* lock->lk_replay_owner and lock_stp have been created or found */ 3636 /* lock->lk_replay_owner and lock_stp have been created or found */
3639 3637
@@ -3648,13 +3646,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3648 switch (lock->lk_type) { 3646 switch (lock->lk_type) {
3649 case NFS4_READ_LT: 3647 case NFS4_READ_LT:
3650 case NFS4_READW_LT: 3648 case NFS4_READW_LT:
3651 filp = find_readable_file(lock_stp->st_file); 3649 if (find_readable_file(lock_stp->st_file)) {
3650 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
3651 filp = find_readable_file(lock_stp->st_file);
3652 }
3652 file_lock.fl_type = F_RDLCK; 3653 file_lock.fl_type = F_RDLCK;
3653 cmd = F_SETLK; 3654 cmd = F_SETLK;
3654 break; 3655 break;
3655 case NFS4_WRITE_LT: 3656 case NFS4_WRITE_LT:
3656 case NFS4_WRITEW_LT: 3657 case NFS4_WRITEW_LT:
3657 filp = find_writeable_file(lock_stp->st_file); 3658 if (find_writeable_file(lock_stp->st_file)) {
3659 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
3660 filp = find_writeable_file(lock_stp->st_file);
3661 }
3658 file_lock.fl_type = F_WRLCK; 3662 file_lock.fl_type = F_WRLCK;
3659 cmd = F_SETLK; 3663 cmd = F_SETLK;
3660 break; 3664 break;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 7731a75971dd..322518c88e4b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -363,23 +363,23 @@ struct nfs4_file {
363 * at all? */ 363 * at all? */
364static inline struct file *find_writeable_file(struct nfs4_file *f) 364static inline struct file *find_writeable_file(struct nfs4_file *f)
365{ 365{
366 if (f->fi_fds[O_RDWR]) 366 if (f->fi_fds[O_WRONLY])
367 return f->fi_fds[O_RDWR]; 367 return f->fi_fds[O_WRONLY];
368 return f->fi_fds[O_WRONLY]; 368 return f->fi_fds[O_RDWR];
369} 369}
370 370
371static inline struct file *find_readable_file(struct nfs4_file *f) 371static inline struct file *find_readable_file(struct nfs4_file *f)
372{ 372{
373 if (f->fi_fds[O_RDWR]) 373 if (f->fi_fds[O_RDONLY])
374 return f->fi_fds[O_RDWR]; 374 return f->fi_fds[O_RDONLY];
375 return f->fi_fds[O_RDONLY]; 375 return f->fi_fds[O_RDWR];
376} 376}
377 377
378static inline struct file *find_any_file(struct nfs4_file *f) 378static inline struct file *find_any_file(struct nfs4_file *f)
379{ 379{
380 if (f->fi_fds[O_RDWR]) 380 if (f->fi_fds[O_RDWR])
381 return f->fi_fds[O_RDWR]; 381 return f->fi_fds[O_RDWR];
382 else if (f->fi_fds[O_RDWR]) 382 else if (f->fi_fds[O_WRONLY])
383 return f->fi_fds[O_WRONLY]; 383 return f->fi_fds[O_WRONLY];
384 else 384 else
385 return f->fi_fds[O_RDONLY]; 385 return f->fi_fds[O_RDONLY];
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 96360a83cb91..661a6cf8e826 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2033,15 +2033,17 @@ out:
2033__be32 2033__be32
2034nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) 2034nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
2035{ 2035{
2036 struct path path = {
2037 .mnt = fhp->fh_export->ex_path.mnt,
2038 .dentry = fhp->fh_dentry,
2039 };
2040 __be32 err; 2036 __be32 err;
2041 2037
2042 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); 2038 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
2043 if (!err && vfs_statfs(&path, stat)) 2039 if (!err) {
2044 err = nfserr_io; 2040 struct path path = {
2041 .mnt = fhp->fh_export->ex_path.mnt,
2042 .dentry = fhp->fh_dentry,
2043 };
2044 if (vfs_statfs(&path, stat))
2045 err = nfserr_io;
2046 }
2045 return err; 2047 return err;
2046} 2048}
2047 2049
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1fa86b9df73b..922263393c76 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -175,24 +175,24 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
175{ 175{
176 struct the_nilfs *nilfs = sbi->s_nilfs; 176 struct the_nilfs *nilfs = sbi->s_nilfs;
177 int err; 177 int err;
178 int barrier_done = 0;
179 178
180 if (nilfs_test_opt(sbi, BARRIER)) {
181 set_buffer_ordered(nilfs->ns_sbh[0]);
182 barrier_done = 1;
183 }
184 retry: 179 retry:
185 set_buffer_dirty(nilfs->ns_sbh[0]); 180 set_buffer_dirty(nilfs->ns_sbh[0]);
186 err = sync_dirty_buffer(nilfs->ns_sbh[0]); 181
187 if (err == -EOPNOTSUPP && barrier_done) { 182 if (nilfs_test_opt(sbi, BARRIER)) {
188 nilfs_warning(sbi->s_super, __func__, 183 err = __sync_dirty_buffer(nilfs->ns_sbh[0],
189 "barrier-based sync failed. " 184 WRITE_SYNC | WRITE_BARRIER);
190 "disabling barriers\n"); 185 if (err == -EOPNOTSUPP) {
191 nilfs_clear_opt(sbi, BARRIER); 186 nilfs_warning(sbi->s_super, __func__,
192 barrier_done = 0; 187 "barrier-based sync failed. "
193 clear_buffer_ordered(nilfs->ns_sbh[0]); 188 "disabling barriers\n");
194 goto retry; 189 nilfs_clear_opt(sbi, BARRIER);
190 goto retry;
191 }
192 } else {
193 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
195 } 194 }
195
196 if (unlikely(err)) { 196 if (unlikely(err)) {
197 printk(KERN_ERR 197 printk(KERN_ERR
198 "NILFS: unable to write superblock (err=%d)\n", err); 198 "NILFS: unable to write superblock (err=%d)\n", err);
@@ -400,9 +400,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
400 list_add(&sbi->s_list, &nilfs->ns_supers); 400 list_add(&sbi->s_list, &nilfs->ns_supers);
401 up_write(&nilfs->ns_super_sem); 401 up_write(&nilfs->ns_super_sem);
402 402
403 err = -ENOMEM;
403 sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); 404 sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
404 if (!sbi->s_ifile) 405 if (!sbi->s_ifile)
405 return -ENOMEM; 406 goto delist;
406 407
407 down_read(&nilfs->ns_segctor_sem); 408 down_read(&nilfs->ns_segctor_sem);
408 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, 409 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -433,6 +434,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
433 nilfs_mdt_destroy(sbi->s_ifile); 434 nilfs_mdt_destroy(sbi->s_ifile);
434 sbi->s_ifile = NULL; 435 sbi->s_ifile = NULL;
435 436
437 delist:
436 down_write(&nilfs->ns_super_sem); 438 down_write(&nilfs->ns_super_sem);
437 list_del_init(&sbi->s_list); 439 list_del_init(&sbi->s_list);
438 up_write(&nilfs->ns_super_sem); 440 up_write(&nilfs->ns_super_sem);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 37de1f062d81..ba7c10c917fc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -446,6 +446,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
446 nilfs_mdt_destroy(nilfs->ns_cpfile); 446 nilfs_mdt_destroy(nilfs->ns_cpfile);
447 nilfs_mdt_destroy(nilfs->ns_sufile); 447 nilfs_mdt_destroy(nilfs->ns_sufile);
448 nilfs_mdt_destroy(nilfs->ns_dat); 448 nilfs_mdt_destroy(nilfs->ns_dat);
449 nilfs_mdt_destroy(nilfs->ns_gc_dat);
449 450
450 failed: 451 failed:
451 nilfs_clear_recovery_info(&ri); 452 nilfs_clear_recovery_info(&ri);
@@ -608,11 +609,11 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
608 return -EINVAL; 609 return -EINVAL;
609 } 610 }
610 611
611 if (swp) { 612 if (!valid[!swp])
612 printk(KERN_WARNING "NILFS warning: broken superblock. " 613 printk(KERN_WARNING "NILFS warning: broken superblock. "
613 "using spare superblock.\n"); 614 "using spare superblock.\n");
615 if (swp)
614 nilfs_swap_super_block(nilfs); 616 nilfs_swap_super_block(nilfs);
615 }
616 617
617 nilfs->ns_sbwcount = 0; 618 nilfs->ns_sbwcount = 0;
618 nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); 619 nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
@@ -775,6 +776,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
775 start * sects_per_block, 776 start * sects_per_block,
776 nblocks * sects_per_block, 777 nblocks * sects_per_block,
777 GFP_NOFS, 778 GFP_NOFS,
779 BLKDEV_IFL_WAIT |
778 BLKDEV_IFL_BARRIER); 780 BLKDEV_IFL_BARRIER);
779 if (ret < 0) 781 if (ret < 0)
780 return ret; 782 return ret;
@@ -785,7 +787,8 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
785 ret = blkdev_issue_discard(nilfs->ns_bdev, 787 ret = blkdev_issue_discard(nilfs->ns_bdev,
786 start * sects_per_block, 788 start * sects_per_block,
787 nblocks * sects_per_block, 789 nblocks * sects_per_block,
788 GFP_NOFS, BLKDEV_IFL_BARRIER); 790 GFP_NOFS,
791 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
789 return ret; 792 return ret;
790} 793}
791 794
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 756566fe8449..85366c78cc37 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -165,9 +165,6 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
165 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 165 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
166 inode_mark, vfsmnt_mark, event_mask, data, data_type); 166 inode_mark, vfsmnt_mark, event_mask, data, data_type);
167 167
168 pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n",
169 __func__, group, vfsmnt_mark, inode_mark, event_mask);
170
171 /* sorry, fanotify only gives a damn about files and dirs */ 168 /* sorry, fanotify only gives a damn about files and dirs */
172 if (!S_ISREG(to_tell->i_mode) && 169 if (!S_ISREG(to_tell->i_mode) &&
173 !S_ISDIR(to_tell->i_mode)) 170 !S_ISDIR(to_tell->i_mode))
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 032b837fcd11..5ed8e58d7bfc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -195,6 +195,14 @@ static int prepare_for_access_response(struct fsnotify_group *group,
195 re->fd = fd; 195 re->fd = fd;
196 196
197 mutex_lock(&group->fanotify_data.access_mutex); 197 mutex_lock(&group->fanotify_data.access_mutex);
198
199 if (group->fanotify_data.bypass_perm) {
200 mutex_unlock(&group->fanotify_data.access_mutex);
201 kmem_cache_free(fanotify_response_event_cache, re);
202 event->response = FAN_ALLOW;
203 return 0;
204 }
205
198 list_add_tail(&re->list, &group->fanotify_data.access_list); 206 list_add_tail(&re->list, &group->fanotify_data.access_list);
199 mutex_unlock(&group->fanotify_data.access_mutex); 207 mutex_unlock(&group->fanotify_data.access_mutex);
200 208
@@ -364,9 +372,28 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
364static int fanotify_release(struct inode *ignored, struct file *file) 372static int fanotify_release(struct inode *ignored, struct file *file)
365{ 373{
366 struct fsnotify_group *group = file->private_data; 374 struct fsnotify_group *group = file->private_data;
375 struct fanotify_response_event *re, *lre;
367 376
368 pr_debug("%s: file=%p group=%p\n", __func__, file, group); 377 pr_debug("%s: file=%p group=%p\n", __func__, file, group);
369 378
379#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
380 mutex_lock(&group->fanotify_data.access_mutex);
381
382 group->fanotify_data.bypass_perm = true;
383
384 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
385 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
386 re, re->event);
387
388 list_del_init(&re->list);
389 re->event->response = FAN_ALLOW;
390
391 kmem_cache_free(fanotify_response_event_cache, re);
392 }
393 mutex_unlock(&group->fanotify_data.access_mutex);
394
395 wake_up(&group->fanotify_data.access_waitq);
396#endif
370 /* matches the fanotify_init->fsnotify_alloc_group */ 397 /* matches the fanotify_init->fsnotify_alloc_group */
371 fsnotify_put_group(group); 398 fsnotify_put_group(group);
372 399
@@ -614,7 +641,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
614 __func__, flags, event_f_flags); 641 __func__, flags, event_f_flags);
615 642
616 if (!capable(CAP_SYS_ADMIN)) 643 if (!capable(CAP_SYS_ADMIN))
617 return -EACCES; 644 return -EPERM;
618 645
619 if (flags & ~FAN_ALL_INIT_FLAGS) 646 if (flags & ~FAN_ALL_INIT_FLAGS)
620 return -EINVAL; 647 return -EINVAL;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 3970392b2722..36802420d69a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -148,13 +148,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
148 const unsigned char *file_name, 148 const unsigned char *file_name,
149 struct fsnotify_event **event) 149 struct fsnotify_event **event)
150{ 150{
151 struct fsnotify_group *group = inode_mark->group; 151 struct fsnotify_group *group = NULL;
152 __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); 152 __u32 inode_test_mask = 0;
153 __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); 153 __u32 vfsmount_test_mask = 0;
154 154
155 pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p" 155 if (unlikely(!inode_mark && !vfsmount_mark)) {
156 " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell, 156 BUG();
157 mnt, inode_mark, mask, data, data_is, cookie, *event); 157 return 0;
158 }
158 159
159 /* clear ignored on inode modification */ 160 /* clear ignored on inode modification */
160 if (mask & FS_MODIFY) { 161 if (mask & FS_MODIFY) {
@@ -168,18 +169,29 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
168 169
169 /* does the inode mark tell us to do something? */ 170 /* does the inode mark tell us to do something? */
170 if (inode_mark) { 171 if (inode_mark) {
172 group = inode_mark->group;
173 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
171 inode_test_mask &= inode_mark->mask; 174 inode_test_mask &= inode_mark->mask;
172 inode_test_mask &= ~inode_mark->ignored_mask; 175 inode_test_mask &= ~inode_mark->ignored_mask;
173 } 176 }
174 177
175 /* does the vfsmount_mark tell us to do something? */ 178 /* does the vfsmount_mark tell us to do something? */
176 if (vfsmount_mark) { 179 if (vfsmount_mark) {
180 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
181 group = vfsmount_mark->group;
177 vfsmount_test_mask &= vfsmount_mark->mask; 182 vfsmount_test_mask &= vfsmount_mark->mask;
178 vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; 183 vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
179 if (inode_mark) 184 if (inode_mark)
180 vfsmount_test_mask &= ~inode_mark->ignored_mask; 185 vfsmount_test_mask &= ~inode_mark->ignored_mask;
181 } 186 }
182 187
188 pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
189 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
190 " data=%p data_is=%d cookie=%d event=%p\n",
191 __func__, group, to_tell, mnt, mask, inode_mark,
192 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
193 data_is, cookie, *event);
194
183 if (!inode_test_mask && !vfsmount_test_mask) 195 if (!inode_test_mask && !vfsmount_test_mask)
184 return 0; 196 return 0;
185 197
@@ -207,13 +219,12 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
207int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, 219int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
208 const unsigned char *file_name, u32 cookie) 220 const unsigned char *file_name, u32 cookie)
209{ 221{
210 struct hlist_node *inode_node, *vfsmount_node; 222 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
211 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; 223 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
212 struct fsnotify_group *inode_group, *vfsmount_group; 224 struct fsnotify_group *inode_group, *vfsmount_group;
213 struct fsnotify_event *event = NULL; 225 struct fsnotify_event *event = NULL;
214 struct vfsmount *mnt; 226 struct vfsmount *mnt;
215 int idx, ret = 0; 227 int idx, ret = 0;
216 bool used_inode = false, used_vfsmount = false;
217 /* global tests shouldn't care about events on child only the specific event */ 228 /* global tests shouldn't care about events on child only the specific event */
218 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); 229 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
219 230
@@ -238,57 +249,50 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
238 (test_mask & to_tell->i_fsnotify_mask)) 249 (test_mask & to_tell->i_fsnotify_mask))
239 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, 250 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
240 &fsnotify_mark_srcu); 251 &fsnotify_mark_srcu);
241 else
242 inode_node = NULL;
243 252
244 if (mnt) { 253 if (mnt && ((mask & FS_MODIFY) ||
245 if ((mask & FS_MODIFY) || 254 (test_mask & mnt->mnt_fsnotify_mask))) {
246 (test_mask & mnt->mnt_fsnotify_mask)) 255 vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
247 vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, 256 &fsnotify_mark_srcu);
248 &fsnotify_mark_srcu); 257 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
249 else 258 &fsnotify_mark_srcu);
250 vfsmount_node = NULL;
251 } else {
252 mnt = NULL;
253 vfsmount_node = NULL;
254 } 259 }
255 260
256 while (inode_node || vfsmount_node) { 261 while (inode_node || vfsmount_node) {
262 inode_group = vfsmount_group = NULL;
263
257 if (inode_node) { 264 if (inode_node) {
258 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), 265 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
259 struct fsnotify_mark, i.i_list); 266 struct fsnotify_mark, i.i_list);
260 inode_group = inode_mark->group; 267 inode_group = inode_mark->group;
261 } else 268 }
262 inode_group = (void *)-1;
263 269
264 if (vfsmount_node) { 270 if (vfsmount_node) {
265 vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), 271 vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
266 struct fsnotify_mark, m.m_list); 272 struct fsnotify_mark, m.m_list);
267 vfsmount_group = vfsmount_mark->group; 273 vfsmount_group = vfsmount_mark->group;
268 } else 274 }
269 vfsmount_group = (void *)-1;
270 275
271 if (inode_group < vfsmount_group) { 276 if (inode_group > vfsmount_group) {
272 /* handle inode */ 277 /* handle inode */
273 send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, 278 send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
274 data_is, cookie, file_name, &event); 279 data_is, cookie, file_name, &event);
275 used_inode = true; 280 /* we didn't use the vfsmount_mark */
276 } else if (vfsmount_group < inode_group) { 281 vfsmount_group = NULL;
282 } else if (vfsmount_group > inode_group) {
277 send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, 283 send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
278 data_is, cookie, file_name, &event); 284 data_is, cookie, file_name, &event);
279 used_vfsmount = true; 285 inode_group = NULL;
280 } else { 286 } else {
281 send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, 287 send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
282 mask, data, data_is, cookie, file_name, 288 mask, data, data_is, cookie, file_name,
283 &event); 289 &event);
284 used_vfsmount = true;
285 used_inode = true;
286 } 290 }
287 291
288 if (used_inode) 292 if (inode_group)
289 inode_node = srcu_dereference(inode_node->next, 293 inode_node = srcu_dereference(inode_node->next,
290 &fsnotify_mark_srcu); 294 &fsnotify_mark_srcu);
291 if (used_vfsmount) 295 if (vfsmount_group)
292 vfsmount_node = srcu_dereference(vfsmount_node->next, 296 vfsmount_node = srcu_dereference(vfsmount_node->next,
293 &fsnotify_mark_srcu); 297 &fsnotify_mark_srcu);
294 } 298 }
diff --git a/fs/open.c b/fs/open.c
index 630715f9f73d..d74e1983e8dc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -675,7 +675,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
675 f->f_path.mnt = mnt; 675 f->f_path.mnt = mnt;
676 f->f_pos = 0; 676 f->f_pos = 0;
677 f->f_op = fops_get(inode->i_fop); 677 f->f_op = fops_get(inode->i_fop);
678 file_move(f, &inode->i_sb->s_files); 678 file_sb_list_add(f, inode->i_sb);
679 679
680 error = security_dentry_open(f, cred); 680 error = security_dentry_open(f, cred);
681 if (error) 681 if (error)
@@ -721,7 +721,7 @@ cleanup_all:
721 mnt_drop_write(mnt); 721 mnt_drop_write(mnt);
722 } 722 }
723 } 723 }
724 file_kill(f); 724 file_sb_list_del(f);
725 f->f_path.dentry = NULL; 725 f->f_path.dentry = NULL;
726 f->f_path.mnt = NULL; 726 f->f_path.mnt = NULL;
727cleanup_file: 727cleanup_file:
diff --git a/fs/pnode.c b/fs/pnode.c
index 5cc564a83149..8066b8dd748f 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt)
126 return 0; 126 return 0;
127} 127}
128 128
129/*
130 * vfsmount lock must be held for write
131 */
129void change_mnt_propagation(struct vfsmount *mnt, int type) 132void change_mnt_propagation(struct vfsmount *mnt, int type)
130{ 133{
131 if (type == MS_SHARED) { 134 if (type == MS_SHARED) {
@@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
270 prev_src_mnt = child; 273 prev_src_mnt = child;
271 } 274 }
272out: 275out:
273 spin_lock(&vfsmount_lock); 276 br_write_lock(vfsmount_lock);
274 while (!list_empty(&tmp_list)) { 277 while (!list_empty(&tmp_list)) {
275 child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash); 278 child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
276 umount_tree(child, 0, &umount_list); 279 umount_tree(child, 0, &umount_list);
277 } 280 }
278 spin_unlock(&vfsmount_lock); 281 br_write_unlock(vfsmount_lock);
279 release_mounts(&umount_list); 282 release_mounts(&umount_list);
280 return ret; 283 return ret;
281} 284}
@@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
296 * other mounts its parent propagates to. 299 * other mounts its parent propagates to.
297 * Check if any of these mounts that **do not have submounts** 300 * Check if any of these mounts that **do not have submounts**
298 * have more references than 'refcnt'. If so return busy. 301 * have more references than 'refcnt'. If so return busy.
302 *
303 * vfsmount lock must be held for read or write
299 */ 304 */
300int propagate_mount_busy(struct vfsmount *mnt, int refcnt) 305int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
301{ 306{
@@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt)
353 * collect all mounts that receive propagation from the mount in @list, 358 * collect all mounts that receive propagation from the mount in @list,
354 * and return these additional mounts in the same list. 359 * and return these additional mounts in the same list.
355 * @list: the list of mounts to be unmounted. 360 * @list: the list of mounts to be unmounted.
361 *
362 * vfsmount lock must be held for write
356 */ 363 */
357int propagate_umount(struct list_head *list) 364int propagate_umount(struct list_head *list)
358{ 365{
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ae35413dcbe1..caa758377d66 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -83,6 +83,7 @@ void reiserfs_evict_inode(struct inode *inode)
83 dquot_drop(inode); 83 dquot_drop(inode);
84 inode->i_blocks = 0; 84 inode->i_blocks = 0;
85 reiserfs_write_unlock_once(inode->i_sb, depth); 85 reiserfs_write_unlock_once(inode->i_sb, depth);
86 return;
86 87
87no_delete: 88no_delete:
88 end_writeback(inode); 89 end_writeback(inode);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1ec952b1f036..812e2c05aa29 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2311,7 +2311,7 @@ static int journal_read_transaction(struct super_block *sb,
2311 /* flush out the real blocks */ 2311 /* flush out the real blocks */
2312 for (i = 0; i < get_desc_trans_len(desc); i++) { 2312 for (i = 0; i < get_desc_trans_len(desc); i++) {
2313 set_buffer_dirty(real_blocks[i]); 2313 set_buffer_dirty(real_blocks[i]);
2314 ll_rw_block(SWRITE, 1, real_blocks + i); 2314 write_dirty_buffer(real_blocks[i], WRITE);
2315 } 2315 }
2316 for (i = 0; i < get_desc_trans_len(desc); i++) { 2316 for (i = 0; i < get_desc_trans_len(desc); i++) {
2317 wait_on_buffer(real_blocks[i]); 2317 wait_on_buffer(real_blocks[i]);
diff --git a/fs/super.c b/fs/super.c
index 9674ab2c8718..8819e3a7ff20 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
54 s = NULL; 54 s = NULL;
55 goto out; 55 goto out;
56 } 56 }
57#ifdef CONFIG_SMP
58 s->s_files = alloc_percpu(struct list_head);
59 if (!s->s_files) {
60 security_sb_free(s);
61 kfree(s);
62 s = NULL;
63 goto out;
64 } else {
65 int i;
66
67 for_each_possible_cpu(i)
68 INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
69 }
70#else
57 INIT_LIST_HEAD(&s->s_files); 71 INIT_LIST_HEAD(&s->s_files);
72#endif
58 INIT_LIST_HEAD(&s->s_instances); 73 INIT_LIST_HEAD(&s->s_instances);
59 INIT_HLIST_HEAD(&s->s_anon); 74 INIT_HLIST_HEAD(&s->s_anon);
60 INIT_LIST_HEAD(&s->s_inodes); 75 INIT_LIST_HEAD(&s->s_inodes);
@@ -108,6 +123,9 @@ out:
108 */ 123 */
109static inline void destroy_super(struct super_block *s) 124static inline void destroy_super(struct super_block *s)
110{ 125{
126#ifdef CONFIG_SMP
127 free_percpu(s->s_files);
128#endif
111 security_sb_free(s); 129 security_sb_free(s);
112 kfree(s->s_subtype); 130 kfree(s->s_subtype);
113 kfree(s->s_options); 131 kfree(s->s_options);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1b27b5688f62..da3fefe91a8f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
340 char *p; 340 char *p;
341 341
342 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); 342 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
343 if (p) 343 if (!IS_ERR(p))
344 memmove(last_sysfs_file, p, strlen(p) + 1); 344 memmove(last_sysfs_file, p, strlen(p) + 1);
345 345
346 /* need attr_sd for attr and ops, its parent for kobj */ 346 /* need attr_sd for attr and ops, its parent for kobj */
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 048484fb10d2..46f7a807bbc1 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -114,10 +114,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
114 114
115 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 115 ubh_mark_buffer_dirty (USPI_UBH(uspi));
116 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 116 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
117 if (sb->s_flags & MS_SYNCHRONOUS) { 117 if (sb->s_flags & MS_SYNCHRONOUS)
118 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 118 ubh_sync_block(UCPI_UBH(ucpi));
119 ubh_wait_on_buffer (UCPI_UBH(ucpi));
120 }
121 sb->s_dirt = 1; 119 sb->s_dirt = 1;
122 120
123 unlock_super (sb); 121 unlock_super (sb);
@@ -207,10 +205,8 @@ do_more:
207 205
208 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 206 ubh_mark_buffer_dirty (USPI_UBH(uspi));
209 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 207 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
210 if (sb->s_flags & MS_SYNCHRONOUS) { 208 if (sb->s_flags & MS_SYNCHRONOUS)
211 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 209 ubh_sync_block(UCPI_UBH(ucpi));
212 ubh_wait_on_buffer (UCPI_UBH(ucpi));
213 }
214 210
215 if (overflow) { 211 if (overflow) {
216 fragment += count; 212 fragment += count;
@@ -558,10 +554,8 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
558 554
559 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 555 ubh_mark_buffer_dirty (USPI_UBH(uspi));
560 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 556 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
561 if (sb->s_flags & MS_SYNCHRONOUS) { 557 if (sb->s_flags & MS_SYNCHRONOUS)
562 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 558 ubh_sync_block(UCPI_UBH(ucpi));
563 ubh_wait_on_buffer (UCPI_UBH(ucpi));
564 }
565 sb->s_dirt = 1; 559 sb->s_dirt = 1;
566 560
567 UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment); 561 UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment);
@@ -680,10 +674,8 @@ cg_found:
680succed: 674succed:
681 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 675 ubh_mark_buffer_dirty (USPI_UBH(uspi));
682 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 676 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
683 if (sb->s_flags & MS_SYNCHRONOUS) { 677 if (sb->s_flags & MS_SYNCHRONOUS)
684 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 678 ubh_sync_block(UCPI_UBH(ucpi));
685 ubh_wait_on_buffer (UCPI_UBH(ucpi));
686 }
687 sb->s_dirt = 1; 679 sb->s_dirt = 1;
688 680
689 result += cgno * uspi->s_fpg; 681 result += cgno * uspi->s_fpg;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 428017e018fe..2eabf04af3de 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -113,10 +113,8 @@ void ufs_free_inode (struct inode * inode)
113 113
114 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 114 ubh_mark_buffer_dirty (USPI_UBH(uspi));
115 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 115 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
116 if (sb->s_flags & MS_SYNCHRONOUS) { 116 if (sb->s_flags & MS_SYNCHRONOUS)
117 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 117 ubh_sync_block(UCPI_UBH(ucpi));
118 ubh_wait_on_buffer (UCPI_UBH(ucpi));
119 }
120 118
121 sb->s_dirt = 1; 119 sb->s_dirt = 1;
122 unlock_super (sb); 120 unlock_super (sb);
@@ -156,10 +154,8 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
156 154
157 fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); 155 fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
158 ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); 156 ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
159 if (sb->s_flags & MS_SYNCHRONOUS) { 157 if (sb->s_flags & MS_SYNCHRONOUS)
160 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 158 ubh_sync_block(UCPI_UBH(ucpi));
161 ubh_wait_on_buffer(UCPI_UBH(ucpi));
162 }
163 159
164 UFSD("EXIT\n"); 160 UFSD("EXIT\n");
165} 161}
@@ -290,10 +286,8 @@ cg_found:
290 } 286 }
291 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 287 ubh_mark_buffer_dirty (USPI_UBH(uspi));
292 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 288 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
293 if (sb->s_flags & MS_SYNCHRONOUS) { 289 if (sb->s_flags & MS_SYNCHRONOUS)
294 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 290 ubh_sync_block(UCPI_UBH(ucpi));
295 ubh_wait_on_buffer (UCPI_UBH(ucpi));
296 }
297 sb->s_dirt = 1; 291 sb->s_dirt = 1;
298 292
299 inode->i_ino = cg * uspi->s_ipg + bit; 293 inode->i_ino = cg * uspi->s_ipg + bit;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 34d5cb135320..a58f9155fc9a 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -243,10 +243,8 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
243 ubh_bforget(ind_ubh); 243 ubh_bforget(ind_ubh);
244 ind_ubh = NULL; 244 ind_ubh = NULL;
245 } 245 }
246 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) { 246 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
247 ubh_ll_rw_block(SWRITE, ind_ubh); 247 ubh_sync_block(ind_ubh);
248 ubh_wait_on_buffer (ind_ubh);
249 }
250 ubh_brelse (ind_ubh); 248 ubh_brelse (ind_ubh);
251 249
252 UFSD("EXIT: ino %lu\n", inode->i_ino); 250 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -307,10 +305,8 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
307 ubh_bforget(dind_bh); 305 ubh_bforget(dind_bh);
308 dind_bh = NULL; 306 dind_bh = NULL;
309 } 307 }
310 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) { 308 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
311 ubh_ll_rw_block(SWRITE, dind_bh); 309 ubh_sync_block(dind_bh);
312 ubh_wait_on_buffer (dind_bh);
313 }
314 ubh_brelse (dind_bh); 310 ubh_brelse (dind_bh);
315 311
316 UFSD("EXIT: ino %lu\n", inode->i_ino); 312 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -367,10 +363,8 @@ static int ufs_trunc_tindirect(struct inode *inode)
367 ubh_bforget(tind_bh); 363 ubh_bforget(tind_bh);
368 tind_bh = NULL; 364 tind_bh = NULL;
369 } 365 }
370 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) { 366 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
371 ubh_ll_rw_block(SWRITE, tind_bh); 367 ubh_sync_block(tind_bh);
372 ubh_wait_on_buffer (tind_bh);
373 }
374 ubh_brelse (tind_bh); 368 ubh_brelse (tind_bh);
375 369
376 UFSD("EXIT: ino %lu\n", inode->i_ino); 370 UFSD("EXIT: ino %lu\n", inode->i_ino);
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 85a7fc9e4a4e..d2c36d53fe66 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -113,21 +113,17 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
113 } 113 }
114} 114}
115 115
116void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh) 116void ubh_sync_block(struct ufs_buffer_head *ubh)
117{ 117{
118 if (!ubh) 118 if (ubh) {
119 return; 119 unsigned i;
120 120
121 ll_rw_block(rw, ubh->count, ubh->bh); 121 for (i = 0; i < ubh->count; i++)
122} 122 write_dirty_buffer(ubh->bh[i], WRITE);
123 123
124void ubh_wait_on_buffer (struct ufs_buffer_head * ubh) 124 for (i = 0; i < ubh->count; i++)
125{ 125 wait_on_buffer(ubh->bh[i]);
126 unsigned i; 126 }
127 if (!ubh)
128 return;
129 for ( i = 0; i < ubh->count; i++ )
130 wait_on_buffer (ubh->bh[i]);
131} 127}
132 128
133void ubh_bforget (struct ufs_buffer_head * ubh) 129void ubh_bforget (struct ufs_buffer_head * ubh)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 0466036912f1..9f8775ce381c 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -269,8 +269,7 @@ extern void ubh_brelse (struct ufs_buffer_head *);
269extern void ubh_brelse_uspi (struct ufs_sb_private_info *); 269extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
270extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); 270extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
271extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); 271extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
272extern void ubh_ll_rw_block(int, struct ufs_buffer_head *); 272extern void ubh_sync_block(struct ufs_buffer_head *);
273extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
274extern void ubh_bforget (struct ufs_buffer_head *); 273extern void ubh_bforget (struct ufs_buffer_head *);
275extern int ubh_buffer_dirty (struct ufs_buffer_head *); 274extern int ubh_buffer_dirty (struct ufs_buffer_head *);
276#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) 275#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 15412fe15c3a..b552f816de15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -852,8 +852,8 @@ xfs_convert_page(
852 SetPageUptodate(page); 852 SetPageUptodate(page);
853 853
854 if (count) { 854 if (count) {
855 wbc->nr_to_write--; 855 if (--wbc->nr_to_write <= 0 &&
856 if (wbc->nr_to_write <= 0) 856 wbc->sync_mode == WB_SYNC_NONE)
857 done = 1; 857 done = 1;
858 } 858 }
859 xfs_start_page_writeback(page, !page_dirty, count); 859 xfs_start_page_writeback(page, !page_dirty, count);
@@ -1068,7 +1068,7 @@ xfs_vm_writepage(
1068 * by themselves. 1068 * by themselves.
1069 */ 1069 */
1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) 1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
1071 goto out_fail; 1071 goto redirty;
1072 1072
1073 /* 1073 /*
1074 * We need a transaction if there are delalloc or unwritten buffers 1074 * We need a transaction if there are delalloc or unwritten buffers
@@ -1080,7 +1080,7 @@ xfs_vm_writepage(
1080 */ 1080 */
1081 xfs_count_page_state(page, &delalloc, &unwritten); 1081 xfs_count_page_state(page, &delalloc, &unwritten);
1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) 1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
1083 goto out_fail; 1083 goto redirty;
1084 1084
1085 /* Is this page beyond the end of the file? */ 1085 /* Is this page beyond the end of the file? */
1086 offset = i_size_read(inode); 1086 offset = i_size_read(inode);
@@ -1245,12 +1245,15 @@ error:
1245 if (iohead) 1245 if (iohead)
1246 xfs_cancel_ioend(iohead); 1246 xfs_cancel_ioend(iohead);
1247 1247
1248 if (err == -EAGAIN)
1249 goto redirty;
1250
1248 xfs_aops_discard_page(page); 1251 xfs_aops_discard_page(page);
1249 ClearPageUptodate(page); 1252 ClearPageUptodate(page);
1250 unlock_page(page); 1253 unlock_page(page);
1251 return err; 1254 return err;
1252 1255
1253out_fail: 1256redirty:
1254 redirty_page_for_writepage(wbc, page); 1257 redirty_page_for_writepage(wbc, page);
1255 unlock_page(page); 1258 unlock_page(page);
1256 return 0; 1259 return 0;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ea79072f5210..d72cf2bb054a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -440,12 +440,7 @@ _xfs_buf_find(
440 ASSERT(btp == bp->b_target); 440 ASSERT(btp == bp->b_target);
441 if (bp->b_file_offset == range_base && 441 if (bp->b_file_offset == range_base &&
442 bp->b_buffer_length == range_length) { 442 bp->b_buffer_length == range_length) {
443 /*
444 * If we look at something, bring it to the
445 * front of the list for next time.
446 */
447 atomic_inc(&bp->b_hold); 443 atomic_inc(&bp->b_hold);
448 list_move(&bp->b_hash_list, &hash->bh_list);
449 goto found; 444 goto found;
450 } 445 }
451 } 446 }
@@ -1443,8 +1438,7 @@ xfs_alloc_bufhash(
1443{ 1438{
1444 unsigned int i; 1439 unsigned int i;
1445 1440
1446 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1441 btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */
1447 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1448 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * 1442 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1449 sizeof(xfs_bufhash_t)); 1443 sizeof(xfs_bufhash_t));
1450 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1444 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index d072e5ff923b..2a05614f0b92 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -137,7 +137,6 @@ typedef struct xfs_buftarg {
137 size_t bt_smask; 137 size_t bt_smask;
138 138
139 /* per device buffer hash table */ 139 /* per device buffer hash table */
140 uint bt_hashmask;
141 uint bt_hashshift; 140 uint bt_hashshift;
142 xfs_bufhash_t *bt_hash; 141 xfs_bufhash_t *bt_hash;
143 142
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 237f5ffb2ee8..4fec427b83ef 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -907,6 +907,13 @@ xfs_ioctl_setattr(
907 return XFS_ERROR(EIO); 907 return XFS_ERROR(EIO);
908 908
909 /* 909 /*
910 * Disallow 32bit project ids because on-disk structure
911 * is 16bit only.
912 */
913 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
914 return XFS_ERROR(EINVAL);
915
916 /*
910 * If disk quotas is on, we make sure that the dquots do exist on disk, 917 * If disk quotas is on, we make sure that the dquots do exist on disk,
911 * before we start any other transactions. Trying to do this later 918 * before we start any other transactions. Trying to do this later
912 * is messy. We don't care to take a readlock to look at the ids 919 * is messy. We don't care to take a readlock to look at the ids
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 68be25dcd301..b1fc2a6bfe83 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -664,7 +664,7 @@ xfs_vn_fiemap(
664 fieinfo->fi_extents_max + 1; 664 fieinfo->fi_extents_max + 1;
665 bm.bmv_count = min_t(__s32, bm.bmv_count, 665 bm.bmv_count = min_t(__s32, bm.bmv_count,
666 (PAGE_SIZE * 16 / sizeof(struct getbmapx))); 666 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
667 bm.bmv_iflags = BMV_IF_PREALLOC; 667 bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
668 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 668 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
669 bm.bmv_iflags |= BMV_IF_ATTRFORK; 669 bm.bmv_iflags |= BMV_IF_ATTRFORK;
670 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) 670 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 15c35b62ff14..a4e07974955b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1226,6 +1226,7 @@ xfs_fs_statfs(
1226 struct xfs_inode *ip = XFS_I(dentry->d_inode); 1226 struct xfs_inode *ip = XFS_I(dentry->d_inode);
1227 __uint64_t fakeinos, id; 1227 __uint64_t fakeinos, id;
1228 xfs_extlen_t lsize; 1228 xfs_extlen_t lsize;
1229 __int64_t ffree;
1229 1230
1230 statp->f_type = XFS_SB_MAGIC; 1231 statp->f_type = XFS_SB_MAGIC;
1231 statp->f_namelen = MAXNAMELEN - 1; 1232 statp->f_namelen = MAXNAMELEN - 1;
@@ -1249,7 +1250,11 @@ xfs_fs_statfs(
1249 statp->f_files = min_t(typeof(statp->f_files), 1250 statp->f_files = min_t(typeof(statp->f_files),
1250 statp->f_files, 1251 statp->f_files,
1251 mp->m_maxicount); 1252 mp->m_maxicount);
1252 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1253
1254 /* make sure statp->f_ffree does not underflow */
1255 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1256 statp->f_ffree = max_t(__int64_t, ffree, 0);
1257
1253 spin_unlock(&mp->m_sb_lock); 1258 spin_unlock(&mp->m_sb_lock);
1254 1259
1255 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || 1260 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
@@ -1402,7 +1407,7 @@ xfs_fs_freeze(
1402 1407
1403 xfs_save_resvblks(mp); 1408 xfs_save_resvblks(mp);
1404 xfs_quiesce_attr(mp); 1409 xfs_quiesce_attr(mp);
1405 return -xfs_fs_log_dummy(mp); 1410 return -xfs_fs_log_dummy(mp, SYNC_WAIT);
1406} 1411}
1407 1412
1408STATIC int 1413STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index dfcbd98d1599..d59c4a65d492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -34,6 +34,7 @@
34#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
35#include "xfs_quota.h" 35#include "xfs_quota.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
37 38
38#include <linux/kthread.h> 39#include <linux/kthread.h>
39#include <linux/freezer.h> 40#include <linux/freezer.h>
@@ -341,38 +342,6 @@ xfs_sync_attr(
341} 342}
342 343
343STATIC int 344STATIC int
344xfs_commit_dummy_trans(
345 struct xfs_mount *mp,
346 uint flags)
347{
348 struct xfs_inode *ip = mp->m_rootip;
349 struct xfs_trans *tp;
350 int error;
351
352 /*
353 * Put a dummy transaction in the log to tell recovery
354 * that all others are OK.
355 */
356 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
357 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
358 if (error) {
359 xfs_trans_cancel(tp, 0);
360 return error;
361 }
362
363 xfs_ilock(ip, XFS_ILOCK_EXCL);
364
365 xfs_trans_ijoin(tp, ip);
366 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
367 error = xfs_trans_commit(tp, 0);
368 xfs_iunlock(ip, XFS_ILOCK_EXCL);
369
370 /* the log force ensures this transaction is pushed to disk */
371 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
372 return error;
373}
374
375STATIC int
376xfs_sync_fsdata( 345xfs_sync_fsdata(
377 struct xfs_mount *mp) 346 struct xfs_mount *mp)
378{ 347{
@@ -432,7 +401,7 @@ xfs_quiesce_data(
432 401
433 /* mark the log as covered if needed */ 402 /* mark the log as covered if needed */
434 if (xfs_log_need_covered(mp)) 403 if (xfs_log_need_covered(mp))
435 error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); 404 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
436 405
437 /* flush data-only devices */ 406 /* flush data-only devices */
438 if (mp->m_rtdev_targp) 407 if (mp->m_rtdev_targp)
@@ -563,7 +532,7 @@ xfs_flush_inodes(
563/* 532/*
564 * Every sync period we need to unpin all items, reclaim inodes and sync 533 * Every sync period we need to unpin all items, reclaim inodes and sync
565 * disk quotas. We might need to cover the log to indicate that the 534 * disk quotas. We might need to cover the log to indicate that the
566 * filesystem is idle. 535 * filesystem is idle and not frozen.
567 */ 536 */
568STATIC void 537STATIC void
569xfs_sync_worker( 538xfs_sync_worker(
@@ -577,8 +546,9 @@ xfs_sync_worker(
577 xfs_reclaim_inodes(mp, 0); 546 xfs_reclaim_inodes(mp, 0);
578 /* dgc: errors ignored here */ 547 /* dgc: errors ignored here */
579 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 548 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
580 if (xfs_log_need_covered(mp)) 549 if (mp->m_super->s_frozen == SB_UNFROZEN &&
581 error = xfs_commit_dummy_trans(mp, 0); 550 xfs_log_need_covered(mp))
551 error = xfs_fs_log_dummy(mp, 0);
582 } 552 }
583 mp->m_sync_seq++; 553 mp->m_sync_seq++;
584 wake_up(&mp->m_wait_single_sync_task); 554 wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 23f14e595c18..f90dadd5a968 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5533,12 +5533,24 @@ xfs_getbmap(
5533 map[i].br_startblock)) 5533 map[i].br_startblock))
5534 goto out_free_map; 5534 goto out_free_map;
5535 5535
5536 nexleft--;
5537 bmv->bmv_offset = 5536 bmv->bmv_offset =
5538 out[cur_ext].bmv_offset + 5537 out[cur_ext].bmv_offset +
5539 out[cur_ext].bmv_length; 5538 out[cur_ext].bmv_length;
5540 bmv->bmv_length = 5539 bmv->bmv_length =
5541 max_t(__int64_t, 0, bmvend - bmv->bmv_offset); 5540 max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
5541
5542 /*
5543 * In case we don't want to return the hole,
5544 * don't increase cur_ext so that we can reuse
5545 * it in the next loop.
5546 */
5547 if ((iflags & BMV_IF_NO_HOLES) &&
5548 map[i].br_startblock == HOLESTARTBLOCK) {
5549 memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
5550 continue;
5551 }
5552
5553 nexleft--;
5542 bmv->bmv_entries++; 5554 bmv->bmv_entries++;
5543 cur_ext++; 5555 cur_ext++;
5544 } 5556 }
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5f..87c2e9d02288 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ 114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ 115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
116#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ 116#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
117#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
117#define BMV_IF_VALID \ 118#define BMV_IF_VALID \
118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) 119 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
120 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
119 121
120/* bmv_oflags values - returned for each non-header segment */ 122/* bmv_oflags values - returned for each non-header segment */
121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ 123#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dbca5f5c37ba..43b1d5699335 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -604,31 +604,36 @@ out:
604 return 0; 604 return 0;
605} 605}
606 606
607/*
608 * Dump a transaction into the log that contains no real change. This is needed
609 * to be able to make the log dirty or stamp the current tail LSN into the log
610 * during the covering operation.
611 *
612 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead.
615 */
607int 616int
608xfs_fs_log_dummy( 617xfs_fs_log_dummy(
609 xfs_mount_t *mp) 618 xfs_mount_t *mp,
619 int flags)
610{ 620{
611 xfs_trans_t *tp; 621 xfs_trans_t *tp;
612 xfs_inode_t *ip;
613 int error; 622 int error;
614 623
615 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); 624 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
616 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 625 error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
626 XFS_DEFAULT_LOG_COUNT);
617 if (error) { 627 if (error) {
618 xfs_trans_cancel(tp, 0); 628 xfs_trans_cancel(tp, 0);
619 return error; 629 return error;
620 } 630 }
621 631
622 ip = mp->m_rootip; 632 /* log the UUID because it is an unchanging field */
623 xfs_ilock(ip, XFS_ILOCK_EXCL); 633 xfs_mod_sb(tp, XFS_SB_UUID);
624 634 if (flags & SYNC_WAIT)
625 xfs_trans_ijoin(tp, ip); 635 xfs_trans_set_sync(tp);
626 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 636 return xfs_trans_commit(tp, 0);
627 xfs_trans_set_sync(tp);
628 error = xfs_trans_commit(tp, 0);
629
630 xfs_iunlock(ip, XFS_ILOCK_EXCL);
631 return error;
632} 637}
633 638
634int 639int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c9..a786c5212c1e 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp); 28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index abf80ae1e95b..5371d2dc360e 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1213,7 +1213,6 @@ xfs_imap_lookup(
1213 struct xfs_inobt_rec_incore rec; 1213 struct xfs_inobt_rec_incore rec;
1214 struct xfs_btree_cur *cur; 1214 struct xfs_btree_cur *cur;
1215 struct xfs_buf *agbp; 1215 struct xfs_buf *agbp;
1216 xfs_agino_t startino;
1217 int error; 1216 int error;
1218 int i; 1217 int i;
1219 1218
@@ -1227,13 +1226,13 @@ xfs_imap_lookup(
1227 } 1226 }
1228 1227
1229 /* 1228 /*
1230 * derive and lookup the exact inode record for the given agino. If the 1229 * Lookup the inode record for the given agino. If the record cannot be
1231 * record cannot be found, then it's an invalid inode number and we 1230 * found, then it's an invalid inode number and we should abort. Once
1232 * should abort. 1231 * we have a record, we need to ensure it contains the inode number
1232 * we are looking up.
1233 */ 1233 */
1234 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1234 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1235 startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); 1235 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
1236 error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
1237 if (!error) { 1236 if (!error) {
1238 if (i) 1237 if (i)
1239 error = xfs_inobt_get_rec(cur, &rec, &i); 1238 error = xfs_inobt_get_rec(cur, &rec, &i);
@@ -1246,6 +1245,11 @@ xfs_imap_lookup(
1246 if (error) 1245 if (error)
1247 return error; 1246 return error;
1248 1247
1248 /* check that the returned record contains the required inode */
1249 if (rec.ir_startino > agino ||
1250 rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
1251 return EINVAL;
1252
1249 /* for untrusted inodes check it is allocated first */ 1253 /* for untrusted inodes check it is allocated first */
1250 if ((flags & XFS_IGET_UNTRUSTED) && 1254 if ((flags & XFS_IGET_UNTRUSTED) &&
1251 (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) 1255 (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 68415cb4f23c..34798f391c49 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1914,6 +1914,11 @@ xfs_iunlink_remove(
1914 return 0; 1914 return 0;
1915} 1915}
1916 1916
1917/*
1918 * A big issue when freeing the inode cluster is is that we _cannot_ skip any
1919 * inodes that are in memory - they all must be marked stale and attached to
1920 * the cluster buffer.
1921 */
1917STATIC void 1922STATIC void
1918xfs_ifree_cluster( 1923xfs_ifree_cluster(
1919 xfs_inode_t *free_ip, 1924 xfs_inode_t *free_ip,
@@ -1945,8 +1950,6 @@ xfs_ifree_cluster(
1945 } 1950 }
1946 1951
1947 for (j = 0; j < nbufs; j++, inum += ninodes) { 1952 for (j = 0; j < nbufs; j++, inum += ninodes) {
1948 int found = 0;
1949
1950 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1953 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1951 XFS_INO_TO_AGBNO(mp, inum)); 1954 XFS_INO_TO_AGBNO(mp, inum));
1952 1955
@@ -1965,7 +1968,9 @@ xfs_ifree_cluster(
1965 /* 1968 /*
1966 * Walk the inodes already attached to the buffer and mark them 1969 * Walk the inodes already attached to the buffer and mark them
1967 * stale. These will all have the flush locks held, so an 1970 * stale. These will all have the flush locks held, so an
1968 * in-memory inode walk can't lock them. 1971 * in-memory inode walk can't lock them. By marking them all
1972 * stale first, we will not attempt to lock them in the loop
1973 * below as the XFS_ISTALE flag will be set.
1969 */ 1974 */
1970 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 1975 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1971 while (lip) { 1976 while (lip) {
@@ -1977,11 +1982,11 @@ xfs_ifree_cluster(
1977 &iip->ili_flush_lsn, 1982 &iip->ili_flush_lsn,
1978 &iip->ili_item.li_lsn); 1983 &iip->ili_item.li_lsn);
1979 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 1984 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1980 found++;
1981 } 1985 }
1982 lip = lip->li_bio_list; 1986 lip = lip->li_bio_list;
1983 } 1987 }
1984 1988
1989
1985 /* 1990 /*
1986 * For each inode in memory attempt to add it to the inode 1991 * For each inode in memory attempt to add it to the inode
1987 * buffer and set it up for being staled on buffer IO 1992 * buffer and set it up for being staled on buffer IO
@@ -1993,6 +1998,7 @@ xfs_ifree_cluster(
1993 * even trying to lock them. 1998 * even trying to lock them.
1994 */ 1999 */
1995 for (i = 0; i < ninodes; i++) { 2000 for (i = 0; i < ninodes; i++) {
2001retry:
1996 read_lock(&pag->pag_ici_lock); 2002 read_lock(&pag->pag_ici_lock);
1997 ip = radix_tree_lookup(&pag->pag_ici_root, 2003 ip = radix_tree_lookup(&pag->pag_ici_root,
1998 XFS_INO_TO_AGINO(mp, (inum + i))); 2004 XFS_INO_TO_AGINO(mp, (inum + i)));
@@ -2003,38 +2009,36 @@ xfs_ifree_cluster(
2003 continue; 2009 continue;
2004 } 2010 }
2005 2011
2006 /* don't try to lock/unlock the current inode */ 2012 /*
2013 * Don't try to lock/unlock the current inode, but we
2014 * _cannot_ skip the other inodes that we did not find
2015 * in the list attached to the buffer and are not
2016 * already marked stale. If we can't lock it, back off
2017 * and retry.
2018 */
2007 if (ip != free_ip && 2019 if (ip != free_ip &&
2008 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2020 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2009 read_unlock(&pag->pag_ici_lock); 2021 read_unlock(&pag->pag_ici_lock);
2010 continue; 2022 delay(1);
2023 goto retry;
2011 } 2024 }
2012 read_unlock(&pag->pag_ici_lock); 2025 read_unlock(&pag->pag_ici_lock);
2013 2026
2014 if (!xfs_iflock_nowait(ip)) { 2027 xfs_iflock(ip);
2015 if (ip != free_ip)
2016 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2017 continue;
2018 }
2019
2020 xfs_iflags_set(ip, XFS_ISTALE); 2028 xfs_iflags_set(ip, XFS_ISTALE);
2021 if (xfs_inode_clean(ip)) {
2022 ASSERT(ip != free_ip);
2023 xfs_ifunlock(ip);
2024 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2025 continue;
2026 }
2027 2029
2030 /*
2031 * we don't need to attach clean inodes or those only
2032 * with unlogged changes (which we throw away, anyway).
2033 */
2028 iip = ip->i_itemp; 2034 iip = ip->i_itemp;
2029 if (!iip) { 2035 if (!iip || xfs_inode_clean(ip)) {
2030 /* inode with unlogged changes only */
2031 ASSERT(ip != free_ip); 2036 ASSERT(ip != free_ip);
2032 ip->i_update_core = 0; 2037 ip->i_update_core = 0;
2033 xfs_ifunlock(ip); 2038 xfs_ifunlock(ip);
2034 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2039 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2035 continue; 2040 continue;
2036 } 2041 }
2037 found++;
2038 2042
2039 iip->ili_last_fields = iip->ili_format.ilf_fields; 2043 iip->ili_last_fields = iip->ili_format.ilf_fields;
2040 iip->ili_format.ilf_fields = 0; 2044 iip->ili_format.ilf_fields = 0;
@@ -2049,8 +2053,7 @@ xfs_ifree_cluster(
2049 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2053 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2050 } 2054 }
2051 2055
2052 if (found) 2056 xfs_trans_stale_inode_buf(tp, bp);
2053 xfs_trans_stale_inode_buf(tp, bp);
2054 xfs_trans_binval(tp, bp); 2057 xfs_trans_binval(tp, bp);
2055 } 2058 }
2056 2059
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 925d572bf0f4..33f718f92a48 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3015,7 +3015,8 @@ _xfs_log_force(
3015 3015
3016 XFS_STATS_INC(xs_log_force); 3016 XFS_STATS_INC(xs_log_force);
3017 3017
3018 xlog_cil_push(log, 1); 3018 if (log->l_cilp)
3019 xlog_cil_force(log);
3019 3020
3020 spin_lock(&log->l_icloglock); 3021 spin_lock(&log->l_icloglock);
3021 3022
@@ -3167,7 +3168,7 @@ _xfs_log_force_lsn(
3167 XFS_STATS_INC(xs_log_force); 3168 XFS_STATS_INC(xs_log_force);
3168 3169
3169 if (log->l_cilp) { 3170 if (log->l_cilp) {
3170 lsn = xlog_cil_push_lsn(log, lsn); 3171 lsn = xlog_cil_force_lsn(log, lsn);
3171 if (lsn == NULLCOMMITLSN) 3172 if (lsn == NULLCOMMITLSN)
3172 return 0; 3173 return 0;
3173 } 3174 }
@@ -3724,7 +3725,7 @@ xfs_log_force_umount(
3724 * call below. 3725 * call below.
3725 */ 3726 */
3726 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) 3727 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3727 xlog_cil_push(log, 1); 3728 xlog_cil_force(log);
3728 3729
3729 /* 3730 /*
3730 * We must hold both the GRANT lock and the LOG lock, 3731 * We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 31e4ea2d19ac..ed575fb4b495 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -68,6 +68,7 @@ xlog_cil_init(
68 ctx->sequence = 1; 68 ctx->sequence = 1;
69 ctx->cil = cil; 69 ctx->cil = cil;
70 cil->xc_ctx = ctx; 70 cil->xc_ctx = ctx;
71 cil->xc_current_sequence = ctx->sequence;
71 72
72 cil->xc_log = log; 73 cil->xc_log = log;
73 log->l_cilp = cil; 74 log->l_cilp = cil;
@@ -269,15 +270,10 @@ xlog_cil_insert(
269static void 270static void
270xlog_cil_format_items( 271xlog_cil_format_items(
271 struct log *log, 272 struct log *log,
272 struct xfs_log_vec *log_vector, 273 struct xfs_log_vec *log_vector)
273 struct xlog_ticket *ticket,
274 xfs_lsn_t *start_lsn)
275{ 274{
276 struct xfs_log_vec *lv; 275 struct xfs_log_vec *lv;
277 276
278 if (start_lsn)
279 *start_lsn = log->l_cilp->xc_ctx->sequence;
280
281 ASSERT(log_vector); 277 ASSERT(log_vector);
282 for (lv = log_vector; lv; lv = lv->lv_next) { 278 for (lv = log_vector; lv; lv = lv->lv_next) {
283 void *ptr; 279 void *ptr;
@@ -301,9 +297,24 @@ xlog_cil_format_items(
301 ptr += vec->i_len; 297 ptr += vec->i_len;
302 } 298 }
303 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); 299 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
300 }
301}
302
303static void
304xlog_cil_insert_items(
305 struct log *log,
306 struct xfs_log_vec *log_vector,
307 struct xlog_ticket *ticket,
308 xfs_lsn_t *start_lsn)
309{
310 struct xfs_log_vec *lv;
311
312 if (start_lsn)
313 *start_lsn = log->l_cilp->xc_ctx->sequence;
304 314
315 ASSERT(log_vector);
316 for (lv = log_vector; lv; lv = lv->lv_next)
305 xlog_cil_insert(log, ticket, lv->lv_item, lv); 317 xlog_cil_insert(log, ticket, lv->lv_item, lv);
306 }
307} 318}
308 319
309static void 320static void
@@ -321,80 +332,6 @@ xlog_cil_free_logvec(
321} 332}
322 333
323/* 334/*
324 * Commit a transaction with the given vector to the Committed Item List.
325 *
326 * To do this, we need to format the item, pin it in memory if required and
327 * account for the space used by the transaction. Once we have done that we
328 * need to release the unused reservation for the transaction, attach the
329 * transaction to the checkpoint context so we carry the busy extents through
330 * to checkpoint completion, and then unlock all the items in the transaction.
331 *
332 * For more specific information about the order of operations in
333 * xfs_log_commit_cil() please refer to the comments in
334 * xfs_trans_commit_iclog().
335 *
336 * Called with the context lock already held in read mode to lock out
337 * background commit, returns without it held once background commits are
338 * allowed again.
339 */
340int
341xfs_log_commit_cil(
342 struct xfs_mount *mp,
343 struct xfs_trans *tp,
344 struct xfs_log_vec *log_vector,
345 xfs_lsn_t *commit_lsn,
346 int flags)
347{
348 struct log *log = mp->m_log;
349 int log_flags = 0;
350 int push = 0;
351
352 if (flags & XFS_TRANS_RELEASE_LOG_RES)
353 log_flags = XFS_LOG_REL_PERM_RESERV;
354
355 if (XLOG_FORCED_SHUTDOWN(log)) {
356 xlog_cil_free_logvec(log_vector);
357 return XFS_ERROR(EIO);
358 }
359
360 /* lock out background commit */
361 down_read(&log->l_cilp->xc_ctx_lock);
362 xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
363
364 /* check we didn't blow the reservation */
365 if (tp->t_ticket->t_curr_res < 0)
366 xlog_print_tic_res(log->l_mp, tp->t_ticket);
367
368 /* attach the transaction to the CIL if it has any busy extents */
369 if (!list_empty(&tp->t_busy)) {
370 spin_lock(&log->l_cilp->xc_cil_lock);
371 list_splice_init(&tp->t_busy,
372 &log->l_cilp->xc_ctx->busy_extents);
373 spin_unlock(&log->l_cilp->xc_cil_lock);
374 }
375
376 tp->t_commit_lsn = *commit_lsn;
377 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
378 xfs_trans_unreserve_and_mod_sb(tp);
379
380 /* check for background commit before unlock */
381 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
382 push = 1;
383 up_read(&log->l_cilp->xc_ctx_lock);
384
385 /*
386 * We need to push CIL every so often so we don't cache more than we
387 * can fit in the log. The limit really is that a checkpoint can't be
388 * more than half the log (the current checkpoint is not allowed to
389 * overwrite the previous checkpoint), but commit latency and memory
390 * usage limit this to a smaller size in most cases.
391 */
392 if (push)
393 xlog_cil_push(log, 0);
394 return 0;
395}
396
397/*
398 * Mark all items committed and clear busy extents. We free the log vector 335 * Mark all items committed and clear busy extents. We free the log vector
399 * chains in a separate pass so that we unpin the log items as quickly as 336 * chains in a separate pass so that we unpin the log items as quickly as
400 * possible. 337 * possible.
@@ -427,13 +364,23 @@ xlog_cil_committed(
427} 364}
428 365
429/* 366/*
430 * Push the Committed Item List to the log. If the push_now flag is not set, 367 * Push the Committed Item List to the log. If @push_seq flag is zero, then it
431 * then it is a background flush and so we can chose to ignore it. 368 * is a background flush and so we can chose to ignore it. Otherwise, if the
369 * current sequence is the same as @push_seq we need to do a flush. If
370 * @push_seq is less than the current sequence, then it has already been
371 * flushed and we don't need to do anything - the caller will wait for it to
372 * complete if necessary.
373 *
374 * @push_seq is a value rather than a flag because that allows us to do an
375 * unlocked check of the sequence number for a match. Hence we can allows log
376 * forces to run racily and not issue pushes for the same sequence twice. If we
377 * get a race between multiple pushes for the same sequence they will block on
378 * the first one and then abort, hence avoiding needless pushes.
432 */ 379 */
433int 380STATIC int
434xlog_cil_push( 381xlog_cil_push(
435 struct log *log, 382 struct log *log,
436 int push_now) 383 xfs_lsn_t push_seq)
437{ 384{
438 struct xfs_cil *cil = log->l_cilp; 385 struct xfs_cil *cil = log->l_cilp;
439 struct xfs_log_vec *lv; 386 struct xfs_log_vec *lv;
@@ -453,12 +400,14 @@ xlog_cil_push(
453 if (!cil) 400 if (!cil)
454 return 0; 401 return 0;
455 402
403 ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
404
456 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 405 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
457 new_ctx->ticket = xlog_cil_ticket_alloc(log); 406 new_ctx->ticket = xlog_cil_ticket_alloc(log);
458 407
459 /* lock out transaction commit, but don't block on background push */ 408 /* lock out transaction commit, but don't block on background push */
460 if (!down_write_trylock(&cil->xc_ctx_lock)) { 409 if (!down_write_trylock(&cil->xc_ctx_lock)) {
461 if (!push_now) 410 if (!push_seq)
462 goto out_free_ticket; 411 goto out_free_ticket;
463 down_write(&cil->xc_ctx_lock); 412 down_write(&cil->xc_ctx_lock);
464 } 413 }
@@ -469,7 +418,11 @@ xlog_cil_push(
469 goto out_skip; 418 goto out_skip;
470 419
471 /* check for spurious background flush */ 420 /* check for spurious background flush */
472 if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) 421 if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
422 goto out_skip;
423
424 /* check for a previously pushed seqeunce */
425 if (push_seq < cil->xc_ctx->sequence)
473 goto out_skip; 426 goto out_skip;
474 427
475 /* 428 /*
@@ -515,6 +468,13 @@ xlog_cil_push(
515 cil->xc_ctx = new_ctx; 468 cil->xc_ctx = new_ctx;
516 469
517 /* 470 /*
471 * mirror the new sequence into the cil structure so that we can do
472 * unlocked checks against the current sequence in log forces without
473 * risking deferencing a freed context pointer.
474 */
475 cil->xc_current_sequence = new_ctx->sequence;
476
477 /*
518 * The switch is now done, so we can drop the context lock and move out 478 * The switch is now done, so we can drop the context lock and move out
519 * of a shared context. We can't just go straight to the commit record, 479 * of a shared context. We can't just go straight to the commit record,
520 * though - we need to synchronise with previous and future commits so 480 * though - we need to synchronise with previous and future commits so
@@ -626,6 +586,102 @@ out_abort:
626} 586}
627 587
628/* 588/*
589 * Commit a transaction with the given vector to the Committed Item List.
590 *
591 * To do this, we need to format the item, pin it in memory if required and
592 * account for the space used by the transaction. Once we have done that we
593 * need to release the unused reservation for the transaction, attach the
594 * transaction to the checkpoint context so we carry the busy extents through
595 * to checkpoint completion, and then unlock all the items in the transaction.
596 *
597 * For more specific information about the order of operations in
598 * xfs_log_commit_cil() please refer to the comments in
599 * xfs_trans_commit_iclog().
600 *
601 * Called with the context lock already held in read mode to lock out
602 * background commit, returns without it held once background commits are
603 * allowed again.
604 */
605int
606xfs_log_commit_cil(
607 struct xfs_mount *mp,
608 struct xfs_trans *tp,
609 struct xfs_log_vec *log_vector,
610 xfs_lsn_t *commit_lsn,
611 int flags)
612{
613 struct log *log = mp->m_log;
614 int log_flags = 0;
615 int push = 0;
616
617 if (flags & XFS_TRANS_RELEASE_LOG_RES)
618 log_flags = XFS_LOG_REL_PERM_RESERV;
619
620 if (XLOG_FORCED_SHUTDOWN(log)) {
621 xlog_cil_free_logvec(log_vector);
622 return XFS_ERROR(EIO);
623 }
624
625 /*
626 * do all the hard work of formatting items (including memory
627 * allocation) outside the CIL context lock. This prevents stalling CIL
628 * pushes when we are low on memory and a transaction commit spends a
629 * lot of time in memory reclaim.
630 */
631 xlog_cil_format_items(log, log_vector);
632
633 /* lock out background commit */
634 down_read(&log->l_cilp->xc_ctx_lock);
635 xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
636
637 /* check we didn't blow the reservation */
638 if (tp->t_ticket->t_curr_res < 0)
639 xlog_print_tic_res(log->l_mp, tp->t_ticket);
640
641 /* attach the transaction to the CIL if it has any busy extents */
642 if (!list_empty(&tp->t_busy)) {
643 spin_lock(&log->l_cilp->xc_cil_lock);
644 list_splice_init(&tp->t_busy,
645 &log->l_cilp->xc_ctx->busy_extents);
646 spin_unlock(&log->l_cilp->xc_cil_lock);
647 }
648
649 tp->t_commit_lsn = *commit_lsn;
650 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
651 xfs_trans_unreserve_and_mod_sb(tp);
652
653 /*
654 * Once all the items of the transaction have been copied to the CIL,
655 * the items can be unlocked and freed.
656 *
657 * This needs to be done before we drop the CIL context lock because we
658 * have to update state in the log items and unlock them before they go
659 * to disk. If we don't, then the CIL checkpoint can race with us and
660 * we can run checkpoint completion before we've updated and unlocked
661 * the log items. This affects (at least) processing of stale buffers,
662 * inodes and EFIs.
663 */
664 xfs_trans_free_items(tp, *commit_lsn, 0);
665
666 /* check for background commit before unlock */
667 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
668 push = 1;
669
670 up_read(&log->l_cilp->xc_ctx_lock);
671
672 /*
673 * We need to push CIL every so often so we don't cache more than we
674 * can fit in the log. The limit really is that a checkpoint can't be
675 * more than half the log (the current checkpoint is not allowed to
676 * overwrite the previous checkpoint), but commit latency and memory
677 * usage limit this to a smaller size in most cases.
678 */
679 if (push)
680 xlog_cil_push(log, 0);
681 return 0;
682}
683
684/*
629 * Conditionally push the CIL based on the sequence passed in. 685 * Conditionally push the CIL based on the sequence passed in.
630 * 686 *
631 * We only need to push if we haven't already pushed the sequence 687 * We only need to push if we haven't already pushed the sequence
@@ -639,39 +695,34 @@ out_abort:
639 * commit lsn is there. It'll be empty, so this is broken for now. 695 * commit lsn is there. It'll be empty, so this is broken for now.
640 */ 696 */
641xfs_lsn_t 697xfs_lsn_t
642xlog_cil_push_lsn( 698xlog_cil_force_lsn(
643 struct log *log, 699 struct log *log,
644 xfs_lsn_t push_seq) 700 xfs_lsn_t sequence)
645{ 701{
646 struct xfs_cil *cil = log->l_cilp; 702 struct xfs_cil *cil = log->l_cilp;
647 struct xfs_cil_ctx *ctx; 703 struct xfs_cil_ctx *ctx;
648 xfs_lsn_t commit_lsn = NULLCOMMITLSN; 704 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
649 705
650restart: 706 ASSERT(sequence <= cil->xc_current_sequence);
651 down_write(&cil->xc_ctx_lock); 707
652 ASSERT(push_seq <= cil->xc_ctx->sequence); 708 /*
653 709 * check to see if we need to force out the current context.
654 /* check to see if we need to force out the current context */ 710 * xlog_cil_push() handles racing pushes for the same sequence,
655 if (push_seq == cil->xc_ctx->sequence) { 711 * so no need to deal with it here.
656 up_write(&cil->xc_ctx_lock); 712 */
657 xlog_cil_push(log, 1); 713 if (sequence == cil->xc_current_sequence)
658 goto restart; 714 xlog_cil_push(log, sequence);
659 }
660 715
661 /* 716 /*
662 * See if we can find a previous sequence still committing. 717 * See if we can find a previous sequence still committing.
663 * We can drop the flush lock as soon as we have the cil lock
664 * because we are now only comparing contexts protected by
665 * the cil lock.
666 *
667 * We need to wait for all previous sequence commits to complete 718 * We need to wait for all previous sequence commits to complete
668 * before allowing the force of push_seq to go ahead. Hence block 719 * before allowing the force of push_seq to go ahead. Hence block
669 * on commits for those as well. 720 * on commits for those as well.
670 */ 721 */
722restart:
671 spin_lock(&cil->xc_cil_lock); 723 spin_lock(&cil->xc_cil_lock);
672 up_write(&cil->xc_ctx_lock);
673 list_for_each_entry(ctx, &cil->xc_committing, committing) { 724 list_for_each_entry(ctx, &cil->xc_committing, committing) {
674 if (ctx->sequence > push_seq) 725 if (ctx->sequence > sequence)
675 continue; 726 continue;
676 if (!ctx->commit_lsn) { 727 if (!ctx->commit_lsn) {
677 /* 728 /*
@@ -681,7 +732,7 @@ restart:
681 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 732 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
682 goto restart; 733 goto restart;
683 } 734 }
684 if (ctx->sequence != push_seq) 735 if (ctx->sequence != sequence)
685 continue; 736 continue;
686 /* found it! */ 737 /* found it! */
687 commit_lsn = ctx->commit_lsn; 738 commit_lsn = ctx->commit_lsn;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965c..ced52b98b322 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,6 +422,7 @@ struct xfs_cil {
422 struct rw_semaphore xc_ctx_lock; 422 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 423 struct list_head xc_committing;
424 sv_t xc_commit_wait; 424 sv_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence;
425}; 426};
426 427
427/* 428/*
@@ -562,8 +563,16 @@ int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log); 563void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log); 564void xlog_cil_destroy(struct log *log);
564 565
565int xlog_cil_push(struct log *log, int push_now); 566/*
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); 567 * CIL force routines
568 */
569xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
570
571static inline void
572xlog_cil_force(struct log *log)
573{
574 xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
575}
567 576
568/* 577/*
569 * Unmount record type is used as a pseudo transaction type for the ticket. 578 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdca7416c754..1c47edaea0d2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1167,7 +1167,7 @@ xfs_trans_del_item(
1167 * Unlock all of the items of a transaction and free all the descriptors 1167 * Unlock all of the items of a transaction and free all the descriptors
1168 * of that transaction. 1168 * of that transaction.
1169 */ 1169 */
1170STATIC void 1170void
1171xfs_trans_free_items( 1171xfs_trans_free_items(
1172 struct xfs_trans *tp, 1172 struct xfs_trans *tp,
1173 xfs_lsn_t commit_lsn, 1173 xfs_lsn_t commit_lsn,
@@ -1653,9 +1653,6 @@ xfs_trans_commit_cil(
1653 return error; 1653 return error;
1654 1654
1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1656
1657 /* xfs_trans_free_items() unlocks them first */
1658 xfs_trans_free_items(tp, *commit_lsn, 0);
1659 xfs_trans_free(tp); 1656 xfs_trans_free(tp);
1660 return 0; 1657 return 0;
1661} 1658}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index e2d93d8ead7b..62da86c90de5 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,7 +25,8 @@ struct xfs_trans;
25 25
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 27void xfs_trans_del_item(struct xfs_log_item *);
28 28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags);
29void xfs_trans_item_committed(struct xfs_log_item *lip, 30void xfs_trans_item_committed(struct xfs_log_item *lip,
30 xfs_lsn_t commit_lsn, int aborted); 31 xfs_lsn_t commit_lsn, int aborted);
31void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 66d585c6917c..4c7c7bfb2b2f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2299,15 +2299,22 @@ xfs_alloc_file_space(
2299 e = allocatesize_fsb; 2299 e = allocatesize_fsb;
2300 } 2300 }
2301 2301
2302 /*
2303 * The transaction reservation is limited to a 32-bit block
2304 * count, hence we need to limit the number of blocks we are
2305 * trying to reserve to avoid an overflow. We can't allocate
2306 * more than @nimaps extents, and an extent is limited on disk
2307 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
2308 */
2309 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
2302 if (unlikely(rt)) { 2310 if (unlikely(rt)) {
2303 resrtextents = qblocks = (uint)(e - s); 2311 resrtextents = qblocks = resblks;
2304 resrtextents /= mp->m_sb.sb_rextsize; 2312 resrtextents /= mp->m_sb.sb_rextsize;
2305 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 2313 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
2306 quota_flag = XFS_QMOPT_RES_RTBLKS; 2314 quota_flag = XFS_QMOPT_RES_RTBLKS;
2307 } else { 2315 } else {
2308 resrtextents = 0; 2316 resrtextents = 0;
2309 resblks = qblocks = \ 2317 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
2310 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
2311 quota_flag = XFS_QMOPT_RES_REGBLKS; 2318 quota_flag = XFS_QMOPT_RES_REGBLKS;
2312 } 2319 }
2313 2320