aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-04-07 06:05:21 -0400
committerIngo Molnar <mingo@elte.hu>2009-04-07 06:05:25 -0400
commit6c009ecef8cca28c7c09eb16d0802e37915a76e1 (patch)
tree11c773f780186fdb9fbc9c80a73fb7c8426b1fba /fs
parent98c2aaf8be5baf7193be37fb28bce8e7327158bc (diff)
parentd508afb437daee7cf07da085b635c44a4ebf9b38 (diff)
Merge branch 'linus' into perfcounters/core
Merge reason: need the upstream facility added by: 7f1e2ca: hrtimer: fix rq->lock inversion (again) Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--fs/befs/debug.c1
-rw-r--r--fs/buffer.c22
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/ext3/Kconfig19
-rw-r--r--fs/ext3/super.c8
-rw-r--r--fs/jbd/commit.c7
-rw-r--r--fs/jbd2/commit.c13
-rw-r--r--fs/jffs2/acl.c4
-rw-r--r--fs/jffs2/malloc.c6
-rw-r--r--fs/libfs.c16
-rw-r--r--fs/lockd/svclock.c13
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/nfs3proc.c10
-rw-r--r--fs/nfsd/nfs4callback.c47
-rw-r--r--fs/nfsd/nfs4proc.c246
-rw-r--r--fs/nfsd/nfs4recover.c74
-rw-r--r--fs/nfsd/nfs4state.c1196
-rw-r--r--fs/nfsd/nfs4xdr.c633
-rw-r--r--fs/nfsd/nfsctl.c38
-rw-r--r--fs/nfsd/nfsproc.c3
-rw-r--r--fs/nfsd/nfssvc.c88
-rw-r--r--fs/nfsd/vfs.c37
-rw-r--r--fs/romfs/Kconfig48
-rw-r--r--fs/romfs/Makefile9
-rw-r--r--fs/romfs/inode.c665
-rw-r--r--fs/romfs/internal.h47
-rw-r--r--fs/romfs/mmap-nommu.c75
-rw-r--r--fs/romfs/storage.c261
-rw-r--r--fs/romfs/super.c648
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/ubifs/budget.c37
-rw-r--r--fs/ubifs/debug.c6
-rw-r--r--fs/ubifs/file.c16
-rw-r--r--fs/ubifs/find.c12
-rw-r--r--fs/ubifs/gc.c428
-rw-r--r--fs/ubifs/journal.c7
-rw-r--r--fs/ubifs/key.h6
-rw-r--r--fs/ubifs/log.c5
-rw-r--r--fs/ubifs/lpt_commit.c34
-rw-r--r--fs/ubifs/recovery.c70
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/ubifs/sb.c36
-rw-r--r--fs/ubifs/shrinker.c6
-rw-r--r--fs/ubifs/super.c37
-rw-r--r--fs/ubifs/tnc.c2
-rw-r--r--fs/ubifs/ubifs-media.h30
-rw-r--r--fs/ubifs/ubifs.h13
48 files changed, 3656 insertions, 1331 deletions
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661e..622e73775c83 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20 21
21#endif /* __KERNEL__ */ 22#endif /* __KERNEL__ */
22 23
diff --git a/fs/buffer.c b/fs/buffer.c
index 5d55a896ff78..6e35762b6169 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -737,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
737{ 737{
738 struct buffer_head *bh; 738 struct buffer_head *bh;
739 struct list_head tmp; 739 struct list_head tmp;
740 struct address_space *mapping; 740 struct address_space *mapping, *prev_mapping = NULL;
741 int err = 0, err2; 741 int err = 0, err2;
742 742
743 INIT_LIST_HEAD(&tmp); 743 INIT_LIST_HEAD(&tmp);
@@ -762,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
762 * contents - it is a noop if I/O is still in 762 * contents - it is a noop if I/O is still in
763 * flight on potentially older contents. 763 * flight on potentially older contents.
764 */ 764 */
765 ll_rw_block(SWRITE_SYNC, 1, &bh); 765 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
766
767 /*
768 * Kick off IO for the previous mapping. Note
769 * that we will not run the very last mapping,
770 * wait_on_buffer() will do that for us
771 * through sync_buffer().
772 */
773 if (prev_mapping && prev_mapping != mapping)
774 blk_run_address_space(prev_mapping);
775 prev_mapping = mapping;
776
766 brelse(bh); 777 brelse(bh);
767 spin_lock(lock); 778 spin_lock(lock);
768 } 779 }
@@ -2957,12 +2968,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2957 for (i = 0; i < nr; i++) { 2968 for (i = 0; i < nr; i++) {
2958 struct buffer_head *bh = bhs[i]; 2969 struct buffer_head *bh = bhs[i];
2959 2970
2960 if (rw == SWRITE || rw == SWRITE_SYNC) 2971 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
2961 lock_buffer(bh); 2972 lock_buffer(bh);
2962 else if (!trylock_buffer(bh)) 2973 else if (!trylock_buffer(bh))
2963 continue; 2974 continue;
2964 2975
2965 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 2976 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
2977 rw == SWRITE_SYNC_PLUG) {
2966 if (test_clear_buffer_dirty(bh)) { 2978 if (test_clear_buffer_dirty(bh)) {
2967 bh->b_end_io = end_buffer_write_sync; 2979 bh->b_end_io = end_buffer_write_sync;
2968 get_bh(bh); 2980 get_bh(bh);
@@ -2998,7 +3010,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
2998 if (test_clear_buffer_dirty(bh)) { 3010 if (test_clear_buffer_dirty(bh)) {
2999 get_bh(bh); 3011 get_bh(bh);
3000 bh->b_end_io = end_buffer_write_sync; 3012 bh->b_end_io = end_buffer_write_sync;
3001 ret = submit_bh(WRITE, bh); 3013 ret = submit_bh(WRITE_SYNC, bh);
3002 wait_on_buffer(bh); 3014 wait_on_buffer(bh);
3003 if (buffer_eopnotsupp(bh)) { 3015 if (buffer_eopnotsupp(bh)) {
3004 clear_buffer_eopnotsupp(bh); 3016 clear_buffer_eopnotsupp(bh);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7a..da258e7249cc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1126 int acquire_i_mutex = 0; 1126 int acquire_i_mutex = 0;
1127 1127
1128 if (rw & WRITE) 1128 if (rw & WRITE)
1129 rw = WRITE_SYNC; 1129 rw = WRITE_ODIRECT;
1130 1130
1131 if (bdev) 1131 if (bdev)
1132 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); 1132 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0fc..fb3c1a21b135 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
28 To compile this file system support as a module, choose M here: the 28 To compile this file system support as a module, choose M here: the
29 module will be called ext3. 29 module will be called ext3.
30 30
31config EXT3_DEFAULTS_TO_ORDERED
32 bool "Default to 'data=ordered' in ext3 (legacy option)"
33 depends on EXT3_FS
34 help
35 If a filesystem does not explicitly specify a data ordering
36 mode, and the journal capability allowed it, ext3 used to
37 historically default to 'data=ordered'.
38
39 That was a rather unfortunate choice, because it leads to all
40 kinds of latency problems, and the 'data=writeback' mode is more
41 appropriate these days.
42
43 You should probably always answer 'n' here, and if you really
44 want to use 'data=ordered' mode, set it in the filesystem itself
45 with 'tune2fs -o journal_data_ordered'.
46
47 But if you really want to enable the legacy default, you can do
48 so by answering 'y' to this question.
49
31config EXT3_FS_XATTR 50config EXT3_FS_XATTR
32 bool "Ext3 extended attributes" 51 bool "Ext3 extended attributes"
33 depends on EXT3_FS 52 depends on EXT3_FS
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e387e1e..599dbfe504c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
44#include "acl.h" 44#include "acl.h"
45#include "namei.h" 45#include "namei.h"
46 46
47#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
48 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
49#else
50 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
51#endif
52
47static int ext3_load_journal(struct super_block *, struct ext3_super_block *, 53static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
48 unsigned long journal_devnum); 54 unsigned long journal_devnum);
49static int ext3_create_journal(struct super_block *, struct ext3_super_block *, 55static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1919 cope, else JOURNAL_DATA */ 1925 cope, else JOURNAL_DATA */
1920 if (journal_check_available_features 1926 if (journal_check_available_features
1921 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) 1927 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
1922 set_opt(sbi->s_mount_opt, ORDERED_DATA); 1928 set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
1923 else 1929 else
1924 set_opt(sbi->s_mount_opt, JOURNAL_DATA); 1930 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1925 break; 1931 break;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index f8077b9c8981..a8e8513a78a9 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -351,8 +351,13 @@ void journal_commit_transaction(journal_t *journal)
351 spin_lock(&journal->j_state_lock); 351 spin_lock(&journal->j_state_lock);
352 commit_transaction->t_state = T_LOCKED; 352 commit_transaction->t_state = T_LOCKED;
353 353
354 /*
355 * Use plugged writes here, since we want to submit several before
356 * we unplug the device. We don't do explicit unplugging in here,
357 * instead we rely on sync_buffer() doing the unplug for us.
358 */
354 if (commit_transaction->t_synchronous_commit) 359 if (commit_transaction->t_synchronous_commit)
355 write_op = WRITE_SYNC; 360 write_op = WRITE_SYNC_PLUG;
356 spin_lock(&commit_transaction->t_handle_lock); 361 spin_lock(&commit_transaction->t_handle_lock);
357 while (commit_transaction->t_updates) { 362 while (commit_transaction->t_updates) {
358 DEFINE_WAIT(wait); 363 DEFINE_WAIT(wait);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4ea72377c7a2..073c8c3df7cd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
138 set_buffer_ordered(bh); 138 set_buffer_ordered(bh);
139 barrier_done = 1; 139 barrier_done = 1;
140 } 140 }
141 ret = submit_bh(WRITE_SYNC, bh); 141 ret = submit_bh(WRITE_SYNC_PLUG, bh);
142 if (barrier_done) 142 if (barrier_done)
143 clear_buffer_ordered(bh); 143 clear_buffer_ordered(bh);
144 144
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
159 lock_buffer(bh); 159 lock_buffer(bh);
160 set_buffer_uptodate(bh); 160 set_buffer_uptodate(bh);
161 clear_buffer_dirty(bh); 161 clear_buffer_dirty(bh);
162 ret = submit_bh(WRITE_SYNC, bh); 162 ret = submit_bh(WRITE_SYNC_PLUG, bh);
163 } 163 }
164 *cbh = bh; 164 *cbh = bh;
165 return ret; 165 return ret;
@@ -190,7 +190,7 @@ retry:
190 set_buffer_uptodate(bh); 190 set_buffer_uptodate(bh);
191 bh->b_end_io = journal_end_buffer_io_sync; 191 bh->b_end_io = journal_end_buffer_io_sync;
192 192
193 ret = submit_bh(WRITE_SYNC, bh); 193 ret = submit_bh(WRITE_SYNC_PLUG, bh);
194 if (ret) { 194 if (ret) {
195 unlock_buffer(bh); 195 unlock_buffer(bh);
196 return ret; 196 return ret;
@@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
402 spin_lock(&journal->j_state_lock); 402 spin_lock(&journal->j_state_lock);
403 commit_transaction->t_state = T_LOCKED; 403 commit_transaction->t_state = T_LOCKED;
404 404
405 /*
406 * Use plugged writes here, since we want to submit several before
407 * we unplug the device. We don't do explicit unplugging in here,
408 * instead we rely on sync_buffer() doing the unplug for us.
409 */
405 if (commit_transaction->t_synchronous_commit) 410 if (commit_transaction->t_synchronous_commit)
406 write_op = WRITE_SYNC; 411 write_op = WRITE_SYNC_PLUG;
407 stats.u.run.rs_wait = commit_transaction->t_max_wait; 412 stats.u.run.rs_wait = commit_transaction->t_max_wait;
408 stats.u.run.rs_locked = jiffies; 413 stats.u.run.rs_locked = jiffies;
409 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 414 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 77ccf8cb0823..043740dde20c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
38 size_t s; 38 size_t s;
39 39
40 size -= sizeof(struct jffs2_acl_header); 40 size -= sizeof(struct jffs2_acl_header);
41 s = size - 4 * sizeof(struct jffs2_acl_entry_short); 41 if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
42 if (s < 0) {
43 if (size % sizeof(struct jffs2_acl_entry_short)) 42 if (size % sizeof(struct jffs2_acl_entry_short))
44 return -1; 43 return -1;
45 return size / sizeof(struct jffs2_acl_entry_short); 44 return size / sizeof(struct jffs2_acl_entry_short);
46 } else { 45 } else {
46 s = size - 4 * sizeof(struct jffs2_acl_entry_short);
47 if (s % sizeof(struct jffs2_acl_entry)) 47 if (s % sizeof(struct jffs2_acl_entry))
48 return -1; 48 return -1;
49 return s / sizeof(struct jffs2_acl_entry) + 4; 49 return s / sizeof(struct jffs2_acl_entry) + 4;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f1..9eff2bdae8a7 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
284struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) 284struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
285{ 285{
286 struct jffs2_xattr_datum *xd; 286 struct jffs2_xattr_datum *xd;
287 xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL); 287 xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
288 dbg_memalloc("%p\n", xd); 288 dbg_memalloc("%p\n", xd);
289 289
290 memset(xd, 0, sizeof(struct jffs2_xattr_datum));
291 xd->class = RAWNODE_CLASS_XATTR_DATUM; 290 xd->class = RAWNODE_CLASS_XATTR_DATUM;
292 xd->node = (void *)xd; 291 xd->node = (void *)xd;
293 INIT_LIST_HEAD(&xd->xindex); 292 INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
303struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) 302struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
304{ 303{
305 struct jffs2_xattr_ref *ref; 304 struct jffs2_xattr_ref *ref;
306 ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL); 305 ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
307 dbg_memalloc("%p\n", ref); 306 dbg_memalloc("%p\n", ref);
308 307
309 memset(ref, 0, sizeof(struct jffs2_xattr_ref));
310 ref->class = RAWNODE_CLASS_XATTR_REF; 308 ref->class = RAWNODE_CLASS_XATTR_REF;
311 ref->node = (void *)ref; 309 ref->node = (void *)ref;
312 return ref; 310 return ref;
diff --git a/fs/libfs.c b/fs/libfs.c
index 4910a36f516e..cd223190c4e9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
575 * possibly a read which collects the result - which is stored in a 575 * possibly a read which collects the result - which is stored in a
576 * file-local buffer. 576 * file-local buffer.
577 */ 577 */
578
579void simple_transaction_set(struct file *file, size_t n)
580{
581 struct simple_transaction_argresp *ar = file->private_data;
582
583 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
584
585 /*
586 * The barrier ensures that ar->size will really remain zero until
587 * ar->data is ready for reading.
588 */
589 smp_mb();
590 ar->size = n;
591}
592
578char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) 593char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
579{ 594{
580 struct simple_transaction_argresp *ar; 595 struct simple_transaction_argresp *ar;
@@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file);
820EXPORT_SYMBOL(simple_unlink); 835EXPORT_SYMBOL(simple_unlink);
821EXPORT_SYMBOL(simple_read_from_buffer); 836EXPORT_SYMBOL(simple_read_from_buffer);
822EXPORT_SYMBOL(memory_read_from_buffer); 837EXPORT_SYMBOL(memory_read_from_buffer);
838EXPORT_SYMBOL(simple_transaction_set);
823EXPORT_SYMBOL(simple_transaction_get); 839EXPORT_SYMBOL(simple_transaction_get);
824EXPORT_SYMBOL(simple_transaction_read); 840EXPORT_SYMBOL(simple_transaction_read);
825EXPORT_SYMBOL(simple_transaction_release); 841EXPORT_SYMBOL(simple_transaction_release);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9de..83ee34203bd7 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
426 ret = nlm_granted; 426 ret = nlm_granted;
427 goto out; 427 goto out;
428 case -EAGAIN: 428 case -EAGAIN:
429 /*
430 * If this is a blocking request for an
431 * already pending lock request then we need
432 * to put it back on lockd's block list
433 */
434 if (wait)
435 break;
429 ret = nlm_lck_denied; 436 ret = nlm_lck_denied;
430 break; 437 goto out;
431 case FILE_LOCK_DEFERRED: 438 case FILE_LOCK_DEFERRED:
432 if (wait) 439 if (wait)
433 break; 440 break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
443 goto out; 450 goto out;
444 } 451 }
445 452
446 ret = nlm_lck_denied;
447 if (!wait)
448 goto out;
449
450 ret = nlm_lck_blocked; 453 ret = nlm_lck_blocked;
451 454
452 /* Append to list of blocked */ 455 /* Append to list of blocked */
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 82eaadbff408..6717200923fe 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1228,7 +1228,6 @@ static int nfs_parse_mount_options(char *raw,
1228 goto out_nomem; 1228 goto out_nomem;
1229 token = match_token(string, 1229 token = match_token(string,
1230 nfs_xprt_protocol_tokens, args); 1230 nfs_xprt_protocol_tokens, args);
1231 kfree(string);
1232 1231
1233 switch (token) { 1232 switch (token) {
1234 case Opt_xprt_udp: 1233 case Opt_xprt_udp:
@@ -1258,6 +1257,7 @@ static int nfs_parse_mount_options(char *raw,
1258 goto out_nomem; 1257 goto out_nomem;
1259 token = match_token(string, 1258 token = match_token(string,
1260 nfs_xprt_protocol_tokens, args); 1259 nfs_xprt_protocol_tokens, args);
1260 kfree(string);
1261 1261
1262 switch (token) { 1262 switch (token) {
1263 case Opt_xprt_udp: 1263 case Opt_xprt_udp:
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab95..503b9da159a3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
1config NFSD 1config NFSD
2 tristate "NFS server support" 2 tristate "NFS server support"
3 depends on INET 3 depends on INET
4 depends on FILE_LOCKING
4 select LOCKD 5 select LOCKD
5 select SUNRPC 6 select SUNRPC
6 select EXPORTFS 7 select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb91281..7c9fe838f038 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
18#include <linux/unistd.h> 18#include <linux/unistd.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/major.h> 20#include <linux/major.h>
21#include <linux/magic.h>
21 22
22#include <linux/sunrpc/svc.h> 23#include <linux/sunrpc/svc.h>
23#include <linux/nfsd/nfsd.h> 24#include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
202 struct nfsd3_writeres *resp) 203 struct nfsd3_writeres *resp)
203{ 204{
204 __be32 nfserr; 205 __be32 nfserr;
206 unsigned long cnt = argp->len;
205 207
206 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", 208 dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
207 SVCFH_fmt(&argp->fh), 209 SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
214 nfserr = nfsd_write(rqstp, &resp->fh, NULL, 216 nfserr = nfsd_write(rqstp, &resp->fh, NULL,
215 argp->offset, 217 argp->offset,
216 rqstp->rq_vec, argp->vlen, 218 rqstp->rq_vec, argp->vlen,
217 argp->len, 219 &cnt,
218 &resp->committed); 220 &resp->committed);
219 resp->count = argp->count; 221 resp->count = cnt;
220 RETURN_STATUS(nfserr); 222 RETURN_STATUS(nfserr);
221} 223}
222 224
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
569 struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; 571 struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
570 572
571 /* Note that we don't care for remote fs's here */ 573 /* Note that we don't care for remote fs's here */
572 if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) { 574 if (sb->s_magic == MSDOS_SUPER_MAGIC) {
573 resp->f_properties = NFS3_FSF_BILLYBOY; 575 resp->f_properties = NFS3_FSF_BILLYBOY;
574 } 576 }
575 resp->f_maxfilesize = sb->s_maxbytes; 577 resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
610 resp->p_link_max = EXT2_LINK_MAX; 612 resp->p_link_max = EXT2_LINK_MAX;
611 resp->p_name_max = EXT2_NAME_LEN; 613 resp->p_name_max = EXT2_NAME_LEN;
612 break; 614 break;
613 case 0x4d44: /* MSDOS_SUPER_MAGIC */ 615 case MSDOS_SUPER_MAGIC:
614 resp->p_case_insensitive = 1; 616 resp->p_case_insensitive = 1;
615 resp->p_case_preserving = 0; 617 resp->p_case_preserving = 0;
616 break; 618 break;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b5994..290289bd44f7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) 218encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
219{ 219{
220 __be32 *p; 220 __be32 *p;
221 int len = cb_rec->cbr_fhlen; 221 int len = cb_rec->cbr_fh.fh_size;
222 222
223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 223 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
224 WRITE32(OP_CB_RECALL); 224 WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); 226 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
227 WRITE32(cb_rec->cbr_trunc); 227 WRITE32(cb_rec->cbr_trunc);
228 WRITE32(len); 228 WRITE32(len);
229 WRITEMEM(cb_rec->cbr_fhval, len); 229 WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
230 return 0; 230 return 0;
231} 231}
232 232
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
361/* Reference counting, callback cleanup, etc., all look racy as heck. 361/* Reference counting, callback cleanup, etc., all look racy as heck.
362 * And why is cb_set an atomic? */ 362 * And why is cb_set an atomic? */
363 363
364static int do_probe_callback(void *data) 364static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
365{ 365{
366 struct nfs4_client *clp = data;
367 struct sockaddr_in addr; 366 struct sockaddr_in addr;
368 struct nfs4_callback *cb = &clp->cl_callback; 367 struct nfs4_callback *cb = &clp->cl_callback;
369 struct rpc_timeout timeparms = { 368 struct rpc_timeout timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 383 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
385 .client_name = clp->cl_principal, 384 .client_name = clp->cl_principal,
386 }; 385 };
387 struct rpc_message msg = {
388 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
389 .rpc_argp = clp,
390 };
391 struct rpc_clnt *client; 386 struct rpc_clnt *client;
392 int status;
393 387
394 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) { 388 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
395 status = nfserr_cb_path_down; 389 return ERR_PTR(-EINVAL);
396 goto out_err;
397 }
398 390
399 /* Initialize address */ 391 /* Initialize address */
400 memset(&addr, 0, sizeof(addr)); 392 memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
404 396
405 /* Create RPC client */ 397 /* Create RPC client */
406 client = rpc_create(&args); 398 client = rpc_create(&args);
399 if (IS_ERR(client))
400 dprintk("NFSD: couldn't create callback client: %ld\n",
401 PTR_ERR(client));
402 return client;
403
404}
405
406static int do_probe_callback(void *data)
407{
408 struct nfs4_client *clp = data;
409 struct nfs4_callback *cb = &clp->cl_callback;
410 struct rpc_message msg = {
411 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
412 .rpc_argp = clp,
413 };
414 struct rpc_clnt *client;
415 int status;
416
417 client = setup_callback_client(clp);
407 if (IS_ERR(client)) { 418 if (IS_ERR(client)) {
408 dprintk("NFSD: couldn't create callback client\n");
409 status = PTR_ERR(client); 419 status = PTR_ERR(client);
420 dprintk("NFSD: couldn't create callback client: %d\n",
421 status);
410 goto out_err; 422 goto out_err;
411 } 423 }
412 424
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
422out_release_client: 434out_release_client:
423 rpc_shutdown_client(client); 435 rpc_shutdown_client(client);
424out_err: 436out_err:
425 dprintk("NFSD: warning: no callback path to client %.*s\n", 437 dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
426 (int)clp->cl_name.len, clp->cl_name.data); 438 (int)clp->cl_name.len, clp->cl_name.data, status);
427 put_nfs4_client(clp); 439 put_nfs4_client(clp);
428 return status; 440 return 0;
429} 441}
430 442
431/* 443/*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
451 463
452/* 464/*
453 * called with dp->dl_count inc'ed. 465 * called with dp->dl_count inc'ed.
454 * nfs4_lock_state() may or may not have been called.
455 */ 466 */
456void 467void
457nfsd4_cb_recall(struct nfs4_delegation *dp) 468nfsd4_cb_recall(struct nfs4_delegation *dp)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48c..b2883e9c6381 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
93 open->op_truncate = 0; 93 open->op_truncate = 0;
94 94
95 if (open->op_create) { 95 if (open->op_create) {
96 /* FIXME: check session persistence and pnfs flags.
97 * The nfsv4.1 spec requires the following semantics:
98 *
99 * Persistent | pNFS | Server REQUIRED | Client Allowed
100 * Reply Cache | server | |
101 * -------------+--------+-----------------+--------------------
102 * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1
103 * | | | (SHOULD)
104 * | | and EXCLUSIVE4 | or EXCLUSIVE4
105 * | | | (SHOULD NOT)
106 * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1
107 * yes | no | GUARDED4 | GUARDED4
108 * yes | yes | GUARDED4 | GUARDED4
109 */
110
96 /* 111 /*
97 * Note: create modes (UNCHECKED,GUARDED...) are the same 112 * Note: create modes (UNCHECKED,GUARDED...) are the same
98 * in NFSv4 as in v3. 113 * in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
103 (u32 *)open->op_verf.data, 118 (u32 *)open->op_verf.data,
104 &open->op_truncate, &created); 119 &open->op_truncate, &created);
105 120
106 /* If we ever decide to use different attrs to store the 121 /*
107 * verifier in nfsd_create_v3, then we'll need to change this 122 * Following rfc 3530 14.2.16, use the returned bitmask
123 * to indicate which attributes we used to store the
124 * verifier:
108 */ 125 */
109 if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) 126 if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
110 open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | 127 open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
111 FATTR4_WORD1_TIME_MODIFY); 128 FATTR4_WORD1_TIME_MODIFY);
112 } else { 129 } else {
113 status = nfsd_lookup(rqstp, current_fh, 130 status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
118 goto out; 135 goto out;
119 136
120 set_change_info(&open->op_cinfo, current_fh); 137 set_change_info(&open->op_cinfo, current_fh);
121
122 /* set reply cache */
123 fh_dup2(current_fh, &resfh); 138 fh_dup2(current_fh, &resfh);
124 open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
125 memcpy(open->op_stateowner->so_replay.rp_openfh,
126 &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
127 139
140 /* set reply cache */
141 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
142 &resfh.fh_handle);
128 if (!created) 143 if (!created)
129 status = do_open_permission(rqstp, current_fh, open, 144 status = do_open_permission(rqstp, current_fh, open,
130 NFSD_MAY_NOP); 145 NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
150 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); 165 memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
151 166
152 /* set replay cache */ 167 /* set replay cache */
153 open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size; 168 fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
154 memcpy(open->op_stateowner->so_replay.rp_openfh, 169 &current_fh->fh_handle);
155 &current_fh->fh_handle.fh_base,
156 current_fh->fh_handle.fh_size);
157 170
158 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && 171 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
159 (open->op_iattr.ia_size == 0); 172 (open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
164 return status; 177 return status;
165} 178}
166 179
180static void
181copy_clientid(clientid_t *clid, struct nfsd4_session *session)
182{
183 struct nfsd4_sessionid *sid =
184 (struct nfsd4_sessionid *)session->se_sessionid.data;
185
186 clid->cl_boot = sid->clientid.cl_boot;
187 clid->cl_id = sid->clientid.cl_id;
188}
167 189
168static __be32 190static __be32
169nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 191nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
170 struct nfsd4_open *open) 192 struct nfsd4_open *open)
171{ 193{
172 __be32 status; 194 __be32 status;
195 struct nfsd4_compoundres *resp;
196
173 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", 197 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
174 (int)open->op_fname.len, open->op_fname.data, 198 (int)open->op_fname.len, open->op_fname.data,
175 open->op_stateowner); 199 open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
178 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) 202 if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
179 return nfserr_inval; 203 return nfserr_inval;
180 204
205 if (nfsd4_has_session(cstate))
206 copy_clientid(&open->op_clientid, cstate->session);
207
181 nfs4_lock_state(); 208 nfs4_lock_state();
182 209
183 /* check seqid for replay. set nfs4_owner */ 210 /* check seqid for replay. set nfs4_owner */
184 status = nfsd4_process_open1(open); 211 resp = rqstp->rq_resp;
212 status = nfsd4_process_open1(&resp->cstate, open);
185 if (status == nfserr_replay_me) { 213 if (status == nfserr_replay_me) {
186 struct nfs4_replay *rp = &open->op_stateowner->so_replay; 214 struct nfs4_replay *rp = &open->op_stateowner->so_replay;
187 fh_put(&cstate->current_fh); 215 fh_put(&cstate->current_fh);
188 cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; 216 fh_copy_shallow(&cstate->current_fh.fh_handle,
189 memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, 217 &rp->rp_openfh);
190 rp->rp_openfh_len);
191 status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); 218 status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
192 if (status) 219 if (status)
193 dprintk("nfsd4_open: replay failed" 220 dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
209 236
210 switch (open->op_claim_type) { 237 switch (open->op_claim_type) {
211 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 238 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
212 status = nfserr_inval;
213 if (open->op_create)
214 goto out;
215 /* fall through */
216 case NFS4_OPEN_CLAIM_NULL: 239 case NFS4_OPEN_CLAIM_NULL:
217 /* 240 /*
218 * (1) set CURRENT_FH to the file being opened, 241 * (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
455 if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) 478 if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
456 return nfserr_inval; 479 return nfserr_inval;
457 480
458 getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; 481 getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
459 getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; 482 getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
483 getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
460 484
461 getattr->ga_fhp = &cstate->current_fh; 485 getattr->ga_fhp = &cstate->current_fh;
462 return nfs_ok; 486 return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
520 544
521 nfs4_lock_state(); 545 nfs4_lock_state();
522 /* check stateid */ 546 /* check stateid */
523 if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, 547 if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
524 &read->rd_stateid, 548 RD_STATE, &read->rd_filp))) {
525 CHECK_FH | RD_STATE, &read->rd_filp))) {
526 dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); 549 dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
527 goto out; 550 goto out;
528 } 551 }
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
548 if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) 571 if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
549 return nfserr_inval; 572 return nfserr_inval;
550 573
551 readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; 574 readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
552 readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; 575 readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
576 readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
553 577
554 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || 578 if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
555 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) 579 (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
653 677
654 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { 678 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
655 nfs4_lock_state(); 679 nfs4_lock_state();
656 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 680 status = nfs4_preprocess_stateid_op(cstate,
657 &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); 681 &setattr->sa_stateid, WR_STATE, NULL);
658 nfs4_unlock_state(); 682 nfs4_unlock_state();
659 if (status) { 683 if (status) {
660 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); 684 dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
685 struct file *filp = NULL; 709 struct file *filp = NULL;
686 u32 *p; 710 u32 *p;
687 __be32 status = nfs_ok; 711 __be32 status = nfs_ok;
712 unsigned long cnt;
688 713
689 /* no need to check permission - this will be done in nfsd_write() */ 714 /* no need to check permission - this will be done in nfsd_write() */
690 715
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
692 return nfserr_inval; 717 return nfserr_inval;
693 718
694 nfs4_lock_state(); 719 nfs4_lock_state();
695 status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, 720 status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
696 CHECK_FH | WR_STATE, &filp);
697 if (filp) 721 if (filp)
698 get_file(filp); 722 get_file(filp);
699 nfs4_unlock_state(); 723 nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
703 return status; 727 return status;
704 } 728 }
705 729
706 write->wr_bytes_written = write->wr_buflen; 730 cnt = write->wr_buflen;
707 write->wr_how_written = write->wr_stable_how; 731 write->wr_how_written = write->wr_stable_how;
708 p = (u32 *)write->wr_verifier.data; 732 p = (u32 *)write->wr_verifier.data;
709 *p++ = nfssvc_boot.tv_sec; 733 *p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
711 735
712 status = nfsd_write(rqstp, &cstate->current_fh, filp, 736 status = nfsd_write(rqstp, &cstate->current_fh, filp,
713 write->wr_offset, rqstp->rq_vec, write->wr_vlen, 737 write->wr_offset, rqstp->rq_vec, write->wr_vlen,
714 write->wr_buflen, &write->wr_how_written); 738 &cnt, &write->wr_how_written);
715 if (filp) 739 if (filp)
716 fput(filp); 740 fput(filp);
717 741
742 write->wr_bytes_written = cnt;
743
718 if (status == nfserr_symlink) 744 if (status == nfserr_symlink)
719 status = nfserr_inval; 745 status = nfserr_inval;
720 return status; 746 return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
737 if (status) 763 if (status)
738 return status; 764 return status;
739 765
740 if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) 766 if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
741 || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 767 || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
768 || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
742 return nfserr_attrnotsupp; 769 return nfserr_attrnotsupp;
743 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) 770 if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
744 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) 771 || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
766 if (status) 793 if (status)
767 goto out_kfree; 794 goto out_kfree;
768 795
769 p = buf + 3; 796 /* skip bitmap */
797 p = buf + 1 + ntohl(buf[0]);
770 status = nfserr_not_same; 798 status = nfserr_not_same;
771 if (ntohl(*p++) != verify->ve_attrlen) 799 if (ntohl(*p++) != verify->ve_attrlen)
772 goto out_kfree; 800 goto out_kfree;
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
813 nfsdstats.nfs4_opcount[opnum]++; 841 nfsdstats.nfs4_opcount[opnum]++;
814} 842}
815 843
816static void cstate_free(struct nfsd4_compound_state *cstate)
817{
818 if (cstate == NULL)
819 return;
820 fh_put(&cstate->current_fh);
821 fh_put(&cstate->save_fh);
822 BUG_ON(cstate->replay_owner);
823 kfree(cstate);
824}
825
826static struct nfsd4_compound_state *cstate_alloc(void)
827{
828 struct nfsd4_compound_state *cstate;
829
830 cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
831 if (cstate == NULL)
832 return NULL;
833 fh_init(&cstate->current_fh, NFS4_FHSIZE);
834 fh_init(&cstate->save_fh, NFS4_FHSIZE);
835 cstate->replay_owner = NULL;
836 return cstate;
837}
838
839typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, 844typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
840 void *); 845 void *);
846enum nfsd4_op_flags {
847 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
848 ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */
849 ALLOWED_AS_FIRST_OP = 3 << 0, /* ops reqired first in compound */
850};
841 851
842struct nfsd4_operation { 852struct nfsd4_operation {
843 nfsd4op_func op_func; 853 nfsd4op_func op_func;
844 u32 op_flags; 854 u32 op_flags;
845/* Most ops require a valid current filehandle; a few don't: */
846#define ALLOWED_WITHOUT_FH 1
847/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
848#define ALLOWED_ON_ABSENT_FS 2
849 char *op_name; 855 char *op_name;
850}; 856};
851 857
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
854static const char *nfsd4_op_name(unsigned opnum); 860static const char *nfsd4_op_name(unsigned opnum);
855 861
856/* 862/*
863 * This is a replay of a compound for which no cache entry pages
864 * were used. Encode the sequence operation, and if cachethis is FALSE
865 * encode the uncache rep error on the next operation.
866 */
867static __be32
868nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
869 struct nfsd4_compoundres *resp)
870{
871 struct nfsd4_op *op;
872
873 dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
874 resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
875
876 /* Encode the replayed sequence operation */
877 BUG_ON(resp->opcnt != 1);
878 op = &args->ops[resp->opcnt - 1];
879 nfsd4_encode_operation(resp, op);
880
881 /*return nfserr_retry_uncached_rep in next operation. */
882 if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
883 op = &args->ops[resp->opcnt++];
884 op->status = nfserr_retry_uncached_rep;
885 nfsd4_encode_operation(resp, op);
886 }
887 return op->status;
888}
889
890/*
891 * Enforce NFSv4.1 COMPOUND ordering rules.
892 *
893 * TODO:
894 * - enforce NFS4ERR_NOT_ONLY_OP,
895 * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
896 */
897static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
898{
899 if (args->minorversion && args->opcnt > 0) {
900 struct nfsd4_op *op = &args->ops[0];
901 return (op->status == nfserr_op_illegal) ||
902 (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
903 }
904 return true;
905}
906
907/*
857 * COMPOUND call. 908 * COMPOUND call.
858 */ 909 */
859static __be32 910static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
863{ 914{
864 struct nfsd4_op *op; 915 struct nfsd4_op *op;
865 struct nfsd4_operation *opdesc; 916 struct nfsd4_operation *opdesc;
866 struct nfsd4_compound_state *cstate = NULL; 917 struct nfsd4_compound_state *cstate = &resp->cstate;
867 int slack_bytes; 918 int slack_bytes;
868 __be32 status; 919 __be32 status;
869 920
870 resp->xbuf = &rqstp->rq_res; 921 resp->xbuf = &rqstp->rq_res;
871 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; 922 resp->p = rqstp->rq_res.head[0].iov_base +
923 rqstp->rq_res.head[0].iov_len;
872 resp->tagp = resp->p; 924 resp->tagp = resp->p;
873 /* reserve space for: taglen, tag, and opcnt */ 925 /* reserve space for: taglen, tag, and opcnt */
874 resp->p += 2 + XDR_QUADLEN(args->taglen); 926 resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
877 resp->tag = args->tag; 929 resp->tag = args->tag;
878 resp->opcnt = 0; 930 resp->opcnt = 0;
879 resp->rqstp = rqstp; 931 resp->rqstp = rqstp;
932 resp->cstate.minorversion = args->minorversion;
933 resp->cstate.replay_owner = NULL;
934 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
935 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
936 /* Use the deferral mechanism only for NFSv4.0 compounds */
937 rqstp->rq_usedeferral = (args->minorversion == 0);
880 938
881 /* 939 /*
882 * According to RFC3010, this takes precedence over all other errors. 940 * According to RFC3010, this takes precedence over all other errors.
883 */ 941 */
884 status = nfserr_minor_vers_mismatch; 942 status = nfserr_minor_vers_mismatch;
885 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) 943 if (args->minorversion > nfsd_supported_minorversion)
886 goto out; 944 goto out;
887 945
888 status = nfserr_resource; 946 if (!nfs41_op_ordering_ok(args)) {
889 cstate = cstate_alloc(); 947 op = &args->ops[0];
890 if (cstate == NULL) 948 op->status = nfserr_sequence_pos;
891 goto out; 949 goto encode_op;
950 }
892 951
893 status = nfs_ok; 952 status = nfs_ok;
894 while (!status && resp->opcnt < args->opcnt) { 953 while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
897 dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", 956 dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
898 resp->opcnt, args->opcnt, op->opnum, 957 resp->opcnt, args->opcnt, op->opnum,
899 nfsd4_op_name(op->opnum)); 958 nfsd4_op_name(op->opnum));
900
901 /* 959 /*
902 * The XDR decode routines may have pre-set op->status; 960 * The XDR decode routines may have pre-set op->status;
903 * for example, if there is a miscellaneous XDR error 961 * for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
938 BUG_ON(op->status == nfs_ok); 996 BUG_ON(op->status == nfs_ok);
939 997
940encode_op: 998encode_op:
999 /* Only from SEQUENCE or CREATE_SESSION */
1000 if (resp->cstate.status == nfserr_replay_cache) {
1001 dprintk("%s NFS4.1 replay from cache\n", __func__);
1002 if (nfsd4_not_cached(resp))
1003 status = nfsd4_enc_uncached_replay(args, resp);
1004 else
1005 status = op->status;
1006 goto out;
1007 }
941 if (op->status == nfserr_replay_me) { 1008 if (op->status == nfserr_replay_me) {
942 op->replay = &cstate->replay_owner->so_replay; 1009 op->replay = &cstate->replay_owner->so_replay;
943 nfsd4_encode_replay(resp, op); 1010 nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
961 1028
962 nfsd4_increment_op_stats(op->opnum); 1029 nfsd4_increment_op_stats(op->opnum);
963 } 1030 }
1031 if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
1032 dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
1033 status = nfserr_jukebox;
1034 }
964 1035
965 cstate_free(cstate); 1036 resp->cstate.status = status;
1037 fh_put(&resp->cstate.current_fh);
1038 fh_put(&resp->cstate.save_fh);
1039 BUG_ON(resp->cstate.replay_owner);
966out: 1040out:
967 nfsd4_release_compoundargs(args); 1041 nfsd4_release_compoundargs(args);
1042 /* Reset deferral mechanism for RPC deferrals */
1043 rqstp->rq_usedeferral = 1;
968 dprintk("nfsv4 compound returned %d\n", ntohl(status)); 1044 dprintk("nfsv4 compound returned %d\n", ntohl(status));
969 return status; 1045 return status;
970} 1046}
971 1047
972static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { 1048static struct nfsd4_operation nfsd4_ops[] = {
973 [OP_ACCESS] = { 1049 [OP_ACCESS] = {
974 .op_func = (nfsd4op_func)nfsd4_access, 1050 .op_func = (nfsd4op_func)nfsd4_access,
975 .op_name = "OP_ACCESS", 1051 .op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1045 .op_name = "OP_PUTFH", 1121 .op_name = "OP_PUTFH",
1046 }, 1122 },
1047 [OP_PUTPUBFH] = { 1123 [OP_PUTPUBFH] = {
1048 /* unsupported, just for future reference: */ 1124 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1049 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1125 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1050 .op_name = "OP_PUTPUBFH", 1126 .op_name = "OP_PUTPUBFH",
1051 }, 1127 },
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1119 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1195 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1120 .op_name = "OP_RELEASE_LOCKOWNER", 1196 .op_name = "OP_RELEASE_LOCKOWNER",
1121 }, 1197 },
1198
1199 /* NFSv4.1 operations */
1200 [OP_EXCHANGE_ID] = {
1201 .op_func = (nfsd4op_func)nfsd4_exchange_id,
1202 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1203 .op_name = "OP_EXCHANGE_ID",
1204 },
1205 [OP_CREATE_SESSION] = {
1206 .op_func = (nfsd4op_func)nfsd4_create_session,
1207 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1208 .op_name = "OP_CREATE_SESSION",
1209 },
1210 [OP_DESTROY_SESSION] = {
1211 .op_func = (nfsd4op_func)nfsd4_destroy_session,
1212 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1213 .op_name = "OP_DESTROY_SESSION",
1214 },
1215 [OP_SEQUENCE] = {
1216 .op_func = (nfsd4op_func)nfsd4_sequence,
1217 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1218 .op_name = "OP_SEQUENCE",
1219 },
1122}; 1220};
1123 1221
1124static const char *nfsd4_op_name(unsigned opnum) 1222static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567fd..3444c0052a87 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
182 182
183typedef int (recdir_func)(struct dentry *, struct dentry *); 183typedef int (recdir_func)(struct dentry *, struct dentry *);
184 184
185struct dentry_list { 185struct name_list {
186 struct dentry *dentry; 186 char name[HEXDIR_LEN];
187 struct list_head list; 187 struct list_head list;
188}; 188};
189 189
190struct dentry_list_arg {
191 struct list_head dentries;
192 struct dentry *parent;
193};
194
195static int 190static int
196nfsd4_build_dentrylist(void *arg, const char *name, int namlen, 191nfsd4_build_namelist(void *arg, const char *name, int namlen,
197 loff_t offset, u64 ino, unsigned int d_type) 192 loff_t offset, u64 ino, unsigned int d_type)
198{ 193{
199 struct dentry_list_arg *dla = arg; 194 struct list_head *names = arg;
200 struct list_head *dentries = &dla->dentries; 195 struct name_list *entry;
201 struct dentry *parent = dla->parent;
202 struct dentry *dentry;
203 struct dentry_list *child;
204 196
205 if (name && isdotent(name, namlen)) 197 if (namlen != HEXDIR_LEN - 1)
206 return 0; 198 return 0;
207 dentry = lookup_one_len(name, parent, namlen); 199 entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
208 if (IS_ERR(dentry)) 200 if (entry == NULL)
209 return PTR_ERR(dentry);
210 child = kmalloc(sizeof(*child), GFP_KERNEL);
211 if (child == NULL)
212 return -ENOMEM; 201 return -ENOMEM;
213 child->dentry = dentry; 202 memcpy(entry->name, name, HEXDIR_LEN - 1);
214 list_add(&child->list, dentries); 203 entry->name[HEXDIR_LEN - 1] = '\0';
204 list_add(&entry->list, names);
215 return 0; 205 return 0;
216} 206}
217 207
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
220{ 210{
221 const struct cred *original_cred; 211 const struct cred *original_cred;
222 struct file *filp; 212 struct file *filp;
223 struct dentry_list_arg dla = { 213 LIST_HEAD(names);
224 .parent = dir, 214 struct name_list *entry;
225 }; 215 struct dentry *dentry;
226 struct list_head *dentries = &dla.dentries;
227 struct dentry_list *child;
228 int status; 216 int status;
229 217
230 if (!rec_dir_init) 218 if (!rec_dir_init)
@@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
233 status = nfs4_save_creds(&original_cred); 221 status = nfs4_save_creds(&original_cred);
234 if (status < 0) 222 if (status < 0)
235 return status; 223 return status;
236 INIT_LIST_HEAD(dentries);
237 224
238 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, 225 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
239 current_cred()); 226 current_cred());
240 status = PTR_ERR(filp); 227 status = PTR_ERR(filp);
241 if (IS_ERR(filp)) 228 if (IS_ERR(filp))
242 goto out; 229 goto out;
243 INIT_LIST_HEAD(dentries); 230 status = vfs_readdir(filp, nfsd4_build_namelist, &names);
244 status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
245 fput(filp); 231 fput(filp);
246 while (!list_empty(dentries)) { 232 while (!list_empty(&names)) {
247 child = list_entry(dentries->next, struct dentry_list, list); 233 entry = list_entry(names.next, struct name_list, list);
248 status = f(dir, child->dentry); 234
235 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
236 if (IS_ERR(dentry)) {
237 status = PTR_ERR(dentry);
238 goto out;
239 }
240 status = f(dir, dentry);
241 dput(dentry);
249 if (status) 242 if (status)
250 goto out; 243 goto out;
251 list_del(&child->list); 244 list_del(&entry->list);
252 dput(child->dentry); 245 kfree(entry);
253 kfree(child);
254 } 246 }
255out: 247out:
256 while (!list_empty(dentries)) { 248 while (!list_empty(&names)) {
257 child = list_entry(dentries->next, struct dentry_list, list); 249 entry = list_entry(names.next, struct name_list, list);
258 list_del(&child->list); 250 list_del(&entry->list);
259 dput(child->dentry); 251 kfree(entry);
260 kfree(child);
261 } 252 }
262 nfs4_reset_creds(original_cred); 253 nfs4_reset_creds(original_cred);
263 return status; 254 return status;
@@ -353,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child)
353{ 344{
354 int status; 345 int status;
355 346
356 if (nfs4_has_reclaimed_state(child->d_name.name)) 347 /* note: we currently use this path only for minorversion 0 */
348 if (nfs4_has_reclaimed_state(child->d_name.name, false))
357 return 0; 349 return 0;
358 350
359 status = nfsd4_clear_clid_dir(parent, child); 351 status = nfsd4_clear_clid_dir(parent, child);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..c65a27b76a9d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
68static u32 nfs4_init; 68static u32 nfs4_init;
69static stateid_t zerostateid; /* bits all 0 */ 69static stateid_t zerostateid; /* bits all 0 */
70static stateid_t onestateid; /* bits all 1 */ 70static stateid_t onestateid; /* bits all 1 */
71static u64 current_sessionid = 1;
71 72
72#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) 73#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
73#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) 74#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */
75/* forward declarations */ 76/* forward declarations */
76static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); 77static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
77static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); 78static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
78static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; 79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
80static void nfs4_set_recdir(char *recdir); 80static void nfs4_set_recdir(char *recdir);
81 81
82/* Locking: 82/* Locking: */
83 * 83
84 * client_mutex: 84/* Currently used for almost all code touching nfsv4 state: */
85 * protects clientid_hashtbl[], clientstr_hashtbl[],
86 * unconfstr_hashtbl[], uncofid_hashtbl[].
87 */
88static DEFINE_MUTEX(client_mutex); 85static DEFINE_MUTEX(client_mutex);
89 86
87/*
88 * Currently used for the del_recall_lru and file hash table. In an
89 * effort to decrease the scope of the client_mutex, this spinlock may
90 * eventually cover more:
91 */
92static DEFINE_SPINLOCK(recall_lock);
93
90static struct kmem_cache *stateowner_slab = NULL; 94static struct kmem_cache *stateowner_slab = NULL;
91static struct kmem_cache *file_slab = NULL; 95static struct kmem_cache *file_slab = NULL;
92static struct kmem_cache *stateid_slab = NULL; 96static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
117 return x; 121 return x;
118} 122}
119 123
120/* forward declarations */
121static void release_stateowner(struct nfs4_stateowner *sop);
122static void release_stateid(struct nfs4_stateid *stp, int flags);
123
124/*
125 * Delegation state
126 */
127
128/* recall_lock protects the del_recall_lru */
129static DEFINE_SPINLOCK(recall_lock);
130static struct list_head del_recall_lru; 124static struct list_head del_recall_lru;
131 125
132static void
133free_nfs4_file(struct kref *kref)
134{
135 struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
136 list_del(&fp->fi_hash);
137 iput(fp->fi_inode);
138 kmem_cache_free(file_slab, fp);
139}
140
141static inline void 126static inline void
142put_nfs4_file(struct nfs4_file *fi) 127put_nfs4_file(struct nfs4_file *fi)
143{ 128{
144 kref_put(&fi->fi_ref, free_nfs4_file); 129 if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
130 list_del(&fi->fi_hash);
131 spin_unlock(&recall_lock);
132 iput(fi->fi_inode);
133 kmem_cache_free(file_slab, fi);
134 }
145} 135}
146 136
147static inline void 137static inline void
148get_nfs4_file(struct nfs4_file *fi) 138get_nfs4_file(struct nfs4_file *fi)
149{ 139{
150 kref_get(&fi->fi_ref); 140 atomic_inc(&fi->fi_ref);
151} 141}
152 142
153static int num_delegations; 143static int num_delegations;
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
220 dp->dl_stateid.si_stateownerid = current_delegid++; 210 dp->dl_stateid.si_stateownerid = current_delegid++;
221 dp->dl_stateid.si_fileid = 0; 211 dp->dl_stateid.si_fileid = 0;
222 dp->dl_stateid.si_generation = 0; 212 dp->dl_stateid.si_generation = 0;
223 dp->dl_fhlen = current_fh->fh_handle.fh_size; 213 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
224 memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
225 current_fh->fh_handle.fh_size);
226 dp->dl_time = 0; 214 dp->dl_time = 0;
227 atomic_set(&dp->dl_count, 1); 215 atomic_set(&dp->dl_count, 1);
228 list_add(&dp->dl_perfile, &fp->fi_delegations); 216 list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,291 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
311static struct list_head client_lru; 299static struct list_head client_lru;
312static struct list_head close_lru; 300static struct list_head close_lru;
313 301
302static void unhash_generic_stateid(struct nfs4_stateid *stp)
303{
304 list_del(&stp->st_hash);
305 list_del(&stp->st_perfile);
306 list_del(&stp->st_perstateowner);
307}
308
309static void free_generic_stateid(struct nfs4_stateid *stp)
310{
311 put_nfs4_file(stp->st_file);
312 kmem_cache_free(stateid_slab, stp);
313}
314
315static void release_lock_stateid(struct nfs4_stateid *stp)
316{
317 unhash_generic_stateid(stp);
318 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
319 free_generic_stateid(stp);
320}
321
322static void unhash_lockowner(struct nfs4_stateowner *sop)
323{
324 struct nfs4_stateid *stp;
325
326 list_del(&sop->so_idhash);
327 list_del(&sop->so_strhash);
328 list_del(&sop->so_perstateid);
329 while (!list_empty(&sop->so_stateids)) {
330 stp = list_first_entry(&sop->so_stateids,
331 struct nfs4_stateid, st_perstateowner);
332 release_lock_stateid(stp);
333 }
334}
335
336static void release_lockowner(struct nfs4_stateowner *sop)
337{
338 unhash_lockowner(sop);
339 nfs4_put_stateowner(sop);
340}
341
342static void
343release_stateid_lockowners(struct nfs4_stateid *open_stp)
344{
345 struct nfs4_stateowner *lock_sop;
346
347 while (!list_empty(&open_stp->st_lockowners)) {
348 lock_sop = list_entry(open_stp->st_lockowners.next,
349 struct nfs4_stateowner, so_perstateid);
350 /* list_del(&open_stp->st_lockowners); */
351 BUG_ON(lock_sop->so_is_open_owner);
352 release_lockowner(lock_sop);
353 }
354}
355
356static void release_open_stateid(struct nfs4_stateid *stp)
357{
358 unhash_generic_stateid(stp);
359 release_stateid_lockowners(stp);
360 nfsd_close(stp->st_vfs_file);
361 free_generic_stateid(stp);
362}
363
364static void unhash_openowner(struct nfs4_stateowner *sop)
365{
366 struct nfs4_stateid *stp;
367
368 list_del(&sop->so_idhash);
369 list_del(&sop->so_strhash);
370 list_del(&sop->so_perclient);
371 list_del(&sop->so_perstateid); /* XXX: necessary? */
372 while (!list_empty(&sop->so_stateids)) {
373 stp = list_first_entry(&sop->so_stateids,
374 struct nfs4_stateid, st_perstateowner);
375 release_open_stateid(stp);
376 }
377}
378
379static void release_openowner(struct nfs4_stateowner *sop)
380{
381 unhash_openowner(sop);
382 list_del(&sop->so_close_lru);
383 nfs4_put_stateowner(sop);
384}
385
386static DEFINE_SPINLOCK(sessionid_lock);
387#define SESSION_HASH_SIZE 512
388static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
389
390static inline int
391hash_sessionid(struct nfs4_sessionid *sessionid)
392{
393 struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
394
395 return sid->sequence % SESSION_HASH_SIZE;
396}
397
398static inline void
399dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
400{
401 u32 *ptr = (u32 *)(&sessionid->data[0]);
402 dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
403}
404
405static void
406gen_sessionid(struct nfsd4_session *ses)
407{
408 struct nfs4_client *clp = ses->se_client;
409 struct nfsd4_sessionid *sid;
410
411 sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
412 sid->clientid = clp->cl_clientid;
413 sid->sequence = current_sessionid++;
414 sid->reserved = 0;
415}
416
417/*
418 * Give the client the number of slots it requests bound by
419 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
420 *
421 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
422 * should (up to a point) re-negotiate active sessions and reduce their
423 * slot usage to make rooom for new connections. For now we just fail the
424 * create session.
425 */
426static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
427{
428 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
429
430 spin_lock(&nfsd_serv->sv_lock);
431 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
432 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
433 nfsd_serv->sv_drc_pages_used += np;
434 spin_unlock(&nfsd_serv->sv_lock);
435
436 if (np <= 0) {
437 status = nfserr_resource;
438 fchan->maxreqs = 0;
439 } else
440 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
441
442 return status;
443}
444
445/*
446 * fchan holds the client values on input, and the server values on output
447 */
448static int init_forechannel_attrs(struct svc_rqst *rqstp,
449 struct nfsd4_session *session,
450 struct nfsd4_channel_attrs *fchan)
451{
452 int status = 0;
453 __u32 maxcount = svc_max_payload(rqstp);
454
455 /* headerpadsz set to zero in encode routine */
456
457 /* Use the client's max request and max response size if possible */
458 if (fchan->maxreq_sz > maxcount)
459 fchan->maxreq_sz = maxcount;
460 session->se_fmaxreq_sz = fchan->maxreq_sz;
461
462 if (fchan->maxresp_sz > maxcount)
463 fchan->maxresp_sz = maxcount;
464 session->se_fmaxresp_sz = fchan->maxresp_sz;
465
466 /* Set the max response cached size our default which is
467 * a multiple of PAGE_SIZE and small */
468 session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
469 fchan->maxresp_cached = session->se_fmaxresp_cached;
470
471 /* Use the client's maxops if possible */
472 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
473 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
474 session->se_fmaxops = fchan->maxops;
475
476 /* try to use the client requested number of slots */
477 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
478 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
479
480 /* FIXME: Error means no more DRC pages so the server should
481 * recover pages from existing sessions. For now fail session
482 * creation.
483 */
484 status = set_forechannel_maxreqs(fchan);
485
486 session->se_fnumslots = fchan->maxreqs;
487 return status;
488}
489
490static int
491alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
492 struct nfsd4_create_session *cses)
493{
494 struct nfsd4_session *new, tmp;
495 int idx, status = nfserr_resource, slotsize;
496
497 memset(&tmp, 0, sizeof(tmp));
498
499 /* FIXME: For now, we just accept the client back channel attributes. */
500 status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
501 if (status)
502 goto out;
503
504 /* allocate struct nfsd4_session and slot table in one piece */
505 slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
506 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
507 if (!new)
508 goto out;
509
510 memcpy(new, &tmp, sizeof(*new));
511
512 new->se_client = clp;
513 gen_sessionid(new);
514 idx = hash_sessionid(&new->se_sessionid);
515 memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
516 NFS4_MAX_SESSIONID_LEN);
517
518 new->se_flags = cses->flags;
519 kref_init(&new->se_ref);
520 spin_lock(&sessionid_lock);
521 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
522 list_add(&new->se_perclnt, &clp->cl_sessions);
523 spin_unlock(&sessionid_lock);
524
525 status = nfs_ok;
526out:
527 return status;
528}
529
530/* caller must hold sessionid_lock */
531static struct nfsd4_session *
532find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
533{
534 struct nfsd4_session *elem;
535 int idx;
536
537 dump_sessionid(__func__, sessionid);
538 idx = hash_sessionid(sessionid);
539 dprintk("%s: idx is %d\n", __func__, idx);
540 /* Search in the appropriate list */
541 list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
542 dump_sessionid("list traversal", &elem->se_sessionid);
543 if (!memcmp(elem->se_sessionid.data, sessionid->data,
544 NFS4_MAX_SESSIONID_LEN)) {
545 return elem;
546 }
547 }
548
549 dprintk("%s: session not found\n", __func__);
550 return NULL;
551}
552
553/* caller must hold sessionid_lock */
554static void
555unhash_session(struct nfsd4_session *ses)
556{
557 list_del(&ses->se_hash);
558 list_del(&ses->se_perclnt);
559}
560
561static void
562release_session(struct nfsd4_session *ses)
563{
564 spin_lock(&sessionid_lock);
565 unhash_session(ses);
566 spin_unlock(&sessionid_lock);
567 nfsd4_put_session(ses);
568}
569
570static void nfsd4_release_respages(struct page **respages, short resused);
571
572void
573free_session(struct kref *kref)
574{
575 struct nfsd4_session *ses;
576 int i;
577
578 ses = container_of(kref, struct nfsd4_session, se_ref);
579 for (i = 0; i < ses->se_fnumslots; i++) {
580 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
581 nfsd4_release_respages(e->ce_respages, e->ce_resused);
582 }
583 kfree(ses->se_slots);
584 kfree(ses);
585}
586
314static inline void 587static inline void
315renew_client(struct nfs4_client *clp) 588renew_client(struct nfs4_client *clp)
316{ 589{
@@ -330,8 +603,8 @@ STALE_CLIENTID(clientid_t *clid)
330{ 603{
331 if (clid->cl_boot == boot_time) 604 if (clid->cl_boot == boot_time)
332 return 0; 605 return 0;
333 dprintk("NFSD stale clientid (%08x/%08x)\n", 606 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
334 clid->cl_boot, clid->cl_id); 607 clid->cl_boot, clid->cl_id, boot_time);
335 return 1; 608 return 1;
336} 609}
337 610
@@ -376,6 +649,8 @@ static inline void
376free_client(struct nfs4_client *clp) 649free_client(struct nfs4_client *clp)
377{ 650{
378 shutdown_callback_client(clp); 651 shutdown_callback_client(clp);
652 nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
653 clp->cl_slot.sl_cache_entry.ce_resused);
379 if (clp->cl_cred.cr_group_info) 654 if (clp->cl_cred.cr_group_info)
380 put_group_info(clp->cl_cred.cr_group_info); 655 put_group_info(clp->cl_cred.cr_group_info);
381 kfree(clp->cl_principal); 656 kfree(clp->cl_principal);
@@ -420,7 +695,13 @@ expire_client(struct nfs4_client *clp)
420 list_del(&clp->cl_lru); 695 list_del(&clp->cl_lru);
421 while (!list_empty(&clp->cl_openowners)) { 696 while (!list_empty(&clp->cl_openowners)) {
422 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 697 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
423 release_stateowner(sop); 698 release_openowner(sop);
699 }
700 while (!list_empty(&clp->cl_sessions)) {
701 struct nfsd4_session *ses;
702 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
703 se_perclnt);
704 release_session(ses);
424 } 705 }
425 put_nfs4_client(clp); 706 put_nfs4_client(clp);
426} 707}
@@ -439,6 +720,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
439 INIT_LIST_HEAD(&clp->cl_strhash); 720 INIT_LIST_HEAD(&clp->cl_strhash);
440 INIT_LIST_HEAD(&clp->cl_openowners); 721 INIT_LIST_HEAD(&clp->cl_openowners);
441 INIT_LIST_HEAD(&clp->cl_delegations); 722 INIT_LIST_HEAD(&clp->cl_delegations);
723 INIT_LIST_HEAD(&clp->cl_sessions);
442 INIT_LIST_HEAD(&clp->cl_lru); 724 INIT_LIST_HEAD(&clp->cl_lru);
443 return clp; 725 return clp;
444} 726}
@@ -568,25 +850,45 @@ find_unconfirmed_client(clientid_t *clid)
568 return NULL; 850 return NULL;
569} 851}
570 852
853/*
854 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
855 * parameter. Matching is based on the fact the at least one of the
856 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
857 *
858 * FIXME: we need to unify the clientid namespaces for nfsv4.x
859 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
860 * and SET_CLIENTID{,_CONFIRM}
861 */
862static inline int
863match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
864{
865 bool has_exchange_flags = (clp->cl_exchange_flags != 0);
866 return use_exchange_id == has_exchange_flags;
867}
868
571static struct nfs4_client * 869static struct nfs4_client *
572find_confirmed_client_by_str(const char *dname, unsigned int hashval) 870find_confirmed_client_by_str(const char *dname, unsigned int hashval,
871 bool use_exchange_id)
573{ 872{
574 struct nfs4_client *clp; 873 struct nfs4_client *clp;
575 874
576 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 875 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
577 if (same_name(clp->cl_recdir, dname)) 876 if (same_name(clp->cl_recdir, dname) &&
877 match_clientid_establishment(clp, use_exchange_id))
578 return clp; 878 return clp;
579 } 879 }
580 return NULL; 880 return NULL;
581} 881}
582 882
583static struct nfs4_client * 883static struct nfs4_client *
584find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) 884find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
885 bool use_exchange_id)
585{ 886{
586 struct nfs4_client *clp; 887 struct nfs4_client *clp;
587 888
588 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 889 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
589 if (same_name(clp->cl_recdir, dname)) 890 if (same_name(clp->cl_recdir, dname) &&
891 match_clientid_establishment(clp, use_exchange_id))
590 return clp; 892 return clp;
591 } 893 }
592 return NULL; 894 return NULL;
@@ -685,6 +987,534 @@ out_err:
685 return; 987 return;
686} 988}
687 989
990void
991nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
992{
993 struct nfsd4_compoundres *resp = rqstp->rq_resp;
994
995 resp->cstate.statp = statp;
996}
997
998/*
999 * Dereference the result pages.
1000 */
1001static void
1002nfsd4_release_respages(struct page **respages, short resused)
1003{
1004 int i;
1005
1006 dprintk("--> %s\n", __func__);
1007 for (i = 0; i < resused; i++) {
1008 if (!respages[i])
1009 continue;
1010 put_page(respages[i]);
1011 respages[i] = NULL;
1012 }
1013}
1014
/* Copy page pointers, taking a reference on each non-NULL page. */
static void
nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
{
	int i;

	for (i = 0; i < count; i++) {
		struct page *page = frompages[i];

		topages[i] = page;
		if (page)
			get_page(page);
	}
}
1027
/*
 * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
 * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
 * length of the XDR response is less than se_fmaxresp_cached
 * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used for a
 * larger part of the reply (e.g. readdir).
 *
 * Store the base and length of the rq_req.head[0] page
 * of the NFSv4.1 data, just past the rpc header.
 */
void
nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
{
	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
	struct svc_rqst *rqstp = resp->rqstp;
	struct nfsd4_compoundargs *args = rqstp->rq_argp;
	struct nfsd4_op *op = &args->ops[resp->opcnt];
	struct kvec *resv = &rqstp->rq_res.head[0];

	dprintk("--> %s entry %p\n", __func__, entry);

	/* Don't cache a failed OP_SEQUENCE. */
	if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
		return;

	/* Drop any pages cached from a previous use of this slot. */
	nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
	entry->ce_opcnt = resp->opcnt;
	entry->ce_status = resp->cstate.status;

	/*
	 * Don't need a page to cache just the sequence operation - the slot
	 * does this for us!
	 */

	if (nfsd4_not_cached(resp)) {
		entry->ce_resused = 0;
		entry->ce_rpchdrlen = 0;
		dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
			resp->cstate.slot->sl_cache_entry.ce_cachethis);
		return;
	}
	/* Cap at NFSD_PAGES_PER_SLOT + 1 pages (see comment above). */
	entry->ce_resused = rqstp->rq_resused;
	if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
		entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
	nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
			 entry->ce_resused);
	/* NFS data starts at statp, just past the rpc header in head[0]. */
	entry->ce_datav.iov_base = resp->cstate.statp;
	entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
				(char *)page_address(rqstp->rq_respages[0]));
	/* Current request rpc header length*/
	entry->ce_rpchdrlen = (char *)resp->cstate.statp -
				(char *)page_address(rqstp->rq_respages[0]);
}
1081
/*
 * We keep the rpc header, but take the nfs reply from the replycache.
 *
 * Returns 1 on success; 0 when the cached reply does not fit after the
 * current request's rpc header (caller then falls back to the cached pages).
 */
static int
nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
			struct nfsd4_cache_entry *entry)
{
	struct svc_rqst *rqstp = resp->rqstp;
	struct kvec *resv = &resp->rqstp->rq_res.head[0];
	int len;

	/* Current request rpc header length*/
	len = (char *)resp->cstate.statp -
			(char *)page_address(rqstp->rq_respages[0]);
	if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
		dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
			entry->ce_datav.iov_len);
		return 0;
	}
	/* copy the cached reply nfsd data past the current rpc header */
	memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
		entry->ce_datav.iov_len);
	resv->iov_len = len + entry->ce_datav.iov_len;
	return 1;
}
1107
/*
 * Keep the first page of the replay. Copy the NFSv4.1 data from the first
 * cached page. Replace any further replay pages from the cache.
 */
__be32
nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
			 struct nfsd4_sequence *seq)
{
	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
	__be32 status;

	dprintk("--> %s entry %p\n", __func__, entry);

	/*
	 * If this is just the sequence operation, we did not keep
	 * a page in the cache entry because we can just use the
	 * slot info stored in struct nfsd4_sequence that was checked
	 * against the slot in nfsd4_sequence().
	 *
	 * This occurs when seq->cachethis is FALSE, or when the client
	 * session inactivity timer fires and a solo sequence operation
	 * is sent (lease renewal).
	 */
	if (seq && nfsd4_not_cached(resp)) {
		seq->maxslots = resp->cstate.session->se_fnumslots;
		return nfs_ok;
	}

	if (!nfsd41_copy_replay_data(resp, entry)) {
		/*
		 * Not enough room to use the replay rpc header, send the
		 * cached header. Release all the allocated result pages.
		 */
		svc_free_res_pages(resp->rqstp);
		nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
			entry->ce_resused);
	} else {
		/* Release all but the first allocated result page */

		resp->rqstp->rq_resused--;
		svc_free_res_pages(resp->rqstp);

		nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
				 &entry->ce_respages[1],
				 entry->ce_resused - 1);
	}

	/* Restore the cached compound state into the current response. */
	resp->rqstp->rq_resused = entry->ce_resused;
	resp->opcnt = entry->ce_opcnt;
	resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
	status = entry->ce_status;

	return status;
}
1162
1163/*
1164 * Set the exchange_id flags returned by the server.
1165 */
1166static void
1167nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
1168{
1169 /* pNFS is not supported */
1170 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
1171
1172 /* Referrals are supported, Migration is not. */
1173 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
1174
1175 /* set the wire flags to return to client. */
1176 clid->flags = new->cl_exchange_flags;
1177}
1178
/*
 * EXCHANGE_ID (RFC 5661, section 18.35): establish or update a v4.1 client
 * record. The numbered comments below refer to the request-processing cases
 * of section 18.35.4.
 */
__be32
nfsd4_exchange_id(struct svc_rqst *rqstp,
		  struct nfsd4_compound_state *cstate,
		  struct nfsd4_exchange_id *exid)
{
	struct nfs4_client *unconf, *conf, *new;
	int status;
	unsigned int strhashval;
	char dname[HEXDIR_LEN];
	nfs4_verifier verf = exid->verifier;
	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;

	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
		" ip_addr=%u flags %x, spa_how %d\n",
		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
		ip_addr, exid->flags, exid->spa_how);

	if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
		return nfserr_inval;

	/* Currently only support SP4_NONE */
	switch (exid->spa_how) {
	case SP4_NONE:
		break;
	case SP4_SSV:
		return nfserr_encr_alg_unsupp;
	default:
		BUG();				/* checked by xdr code */
	case SP4_MACH_CRED:
		return nfserr_serverfault;	/* no excuse :-/ */
	}

	status = nfs4_make_rec_clidname(dname, &exid->clname);

	if (status)
		goto error;

	strhashval = clientstr_hashval(dname);

	nfs4_lock_state();
	status = nfs_ok;

	conf = find_confirmed_client_by_str(dname, strhashval, true);
	if (conf) {
		if (!same_verf(&verf, &conf->cl_verifier)) {
			/* 18.35.4 case 8 */
			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
				status = nfserr_not_same;
				goto out;
			}
			/* Client reboot: destroy old state */
			expire_client(conf);
			goto out_new;
		}
		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
			/* 18.35.4 case 9 */
			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
				status = nfserr_perm;
				goto out;
			}
			expire_client(conf);
			goto out_new;
		}
		if (ip_addr != conf->cl_addr &&
		    !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
			/* Client collision. 18.35.4 case 3 */
			status = nfserr_clid_inuse;
			goto out;
		}
		/*
		 * Set bit when the owner id and verifier map to an already
		 * confirmed client id (18.35.3).
		 */
		exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;

		/*
		 * Falling into 18.35.4 case 2, possible router replay.
		 * Leave confirmed record intact and return same result.
		 */
		copy_verf(conf, &verf);
		new = conf;
		goto out_copy;
	} else {
		/* 18.35.4 case 7 */
		if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
			status = nfserr_noent;
			goto out;
		}
	}

	unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
	if (unconf) {
		/*
		 * Possible retry or client restart. Per 18.35.4 case 4,
		 * a new unconfirmed record should be generated regardless
		 * of whether any properties have changed.
		 */
		expire_client(unconf);
	}

out_new:
	/* Normal case */
	new = create_client(exid->clname, dname);
	if (new == NULL) {
		status = nfserr_resource;
		goto out;
	}

	copy_verf(new, &verf);
	copy_cred(&new->cl_cred, &rqstp->rq_cred);
	new->cl_addr = ip_addr;
	gen_clid(new);
	gen_confirm(new);
	add_to_unconfirmed(new, strhashval);
out_copy:
	/* Return the (new or existing) clientid to the client. */
	exid->clientid.cl_boot = new->cl_clientid.cl_boot;
	exid->clientid.cl_id = new->cl_clientid.cl_id;

	/* Reset the create_session slot; client starts at seqid 1. */
	new->cl_slot.sl_seqid = 0;
	exid->seqid = 1;
	nfsd4_set_ex_flags(new, exid);

	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
		new->cl_slot.sl_seqid, new->cl_exchange_flags);
	status = nfs_ok;

out:
	nfs4_unlock_state();
error:
	dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
	return status;
}
1311
1312static int
1313check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
1314{
1315 dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
1316 slot->sl_seqid);
1317
1318 /* The slot is in use, and no response has been sent. */
1319 if (slot->sl_inuse) {
1320 if (seqid == slot->sl_seqid)
1321 return nfserr_jukebox;
1322 else
1323 return nfserr_seq_misordered;
1324 }
1325 /* Normal */
1326 if (likely(seqid == slot->sl_seqid + 1))
1327 return nfs_ok;
1328 /* Replay */
1329 if (seqid == slot->sl_seqid)
1330 return nfserr_replay_cache;
1331 /* Wraparound */
1332 if (seqid == 1 && (slot->sl_seqid + 1) == 0)
1333 return nfs_ok;
1334 /* Misordered replay or misordered new request */
1335 return nfserr_seq_misordered;
1336}
1337
/*
 * CREATE_SESSION (RFC 5661, section 18.36): build a session for a client
 * previously established via EXCHANGE_ID, replaying the cached reply when
 * the client retransmits with the same sequence id.
 */
__be32
nfsd4_create_session(struct svc_rqst *rqstp,
		     struct nfsd4_compound_state *cstate,
		     struct nfsd4_create_session *cr_ses)
{
	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
	struct nfsd4_compoundres *resp = rqstp->rq_resp;
	struct nfs4_client *conf, *unconf;
	struct nfsd4_slot *slot = NULL;
	int status = 0;

	nfs4_lock_state();
	unconf = find_unconfirmed_client(&cr_ses->clientid);
	conf = find_confirmed_client(&cr_ses->clientid);

	if (conf) {
		/* Confirmed client: check the single create_session slot. */
		slot = &conf->cl_slot;
		status = check_slot_seqid(cr_ses->seqid, slot);
		if (status == nfserr_replay_cache) {
			dprintk("Got a create_session replay! seqid= %d\n",
				slot->sl_seqid);
			cstate->slot = slot;
			cstate->status = status;
			/* Return the cached reply status */
			status = nfsd4_replay_cache_entry(resp, NULL);
			goto out;
		} else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
			status = nfserr_seq_misordered;
			dprintk("Sequence misordered!\n");
			dprintk("Expected seqid= %d but got seqid= %d\n",
				slot->sl_seqid, cr_ses->seqid);
			goto out;
		}
		conf->cl_slot.sl_seqid++;
	} else if (unconf) {
		/* Unconfirmed client: creds and address must match. */
		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
		    (ip_addr != unconf->cl_addr)) {
			status = nfserr_clid_inuse;
			goto out;
		}

		slot = &unconf->cl_slot;
		status = check_slot_seqid(cr_ses->seqid, slot);
		if (status) {
			/* an unconfirmed replay returns misordered */
			status = nfserr_seq_misordered;
			goto out;
		}

		slot->sl_seqid++; /* from 0 to 1 */
		move_to_confirmed(unconf);

		/*
		 * We do not support RDMA or persistent sessions
		 */
		cr_ses->flags &= ~SESSION4_PERSIST;
		cr_ses->flags &= ~SESSION4_RDMA;

		conf = unconf;
	} else {
		status = nfserr_stale_clientid;
		goto out;
	}

	status = alloc_init_session(rqstp, conf, cr_ses);
	if (status)
		goto out;

	/* Return the new session id and the seqid we just accepted. */
	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
	       NFS4_MAX_SESSIONID_LEN);
	cr_ses->seqid = slot->sl_seqid;

	slot->sl_inuse = true;
	cstate->slot = slot;
	/* Ensure a page is used for the cache */
	slot->sl_cache_entry.ce_cachethis = 1;
out:
	nfs4_unlock_state();
	dprintk("%s returns %d\n", __func__, ntohl(status));
	return status;
}
1419
/*
 * DESTROY_SESSION (RFC 5661, section 18.37): unhash the session, shut down
 * its callback client, and drop the hash table's reference.
 */
__be32
nfsd4_destroy_session(struct svc_rqst *r,
		      struct nfsd4_compound_state *cstate,
		      struct nfsd4_destroy_session *sessionid)
{
	struct nfsd4_session *ses;
	u32 status = nfserr_badsession;

	/* Notes:
	 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessionid
	 * - Should we return nfserr_back_chan_busy if waiting for
	 *   callbacks on to-be-destroyed session?
	 * - Do we need to clear any callback info from previous session?
	 */

	dump_sessionid(__func__, &sessionid->sessionid);
	spin_lock(&sessionid_lock);
	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
	if (!ses) {
		spin_unlock(&sessionid_lock);
		goto out;
	}

	unhash_session(ses);
	spin_unlock(&sessionid_lock);

	/* wait for callbacks */
	shutdown_callback_client(ses->se_client);
	nfsd4_put_session(ses);
	status = nfs_ok;
out:
	dprintk("%s returns %d\n", __func__, ntohl(status));
	return status;
}
1454
/*
 * SEQUENCE (RFC 5661, section 18.46): validate the session and slot,
 * detect replays via the slot sequence id, and bind the slot/session to
 * the compound state for the remainder of processing.
 */
__be32
nfsd4_sequence(struct svc_rqst *rqstp,
	       struct nfsd4_compound_state *cstate,
	       struct nfsd4_sequence *seq)
{
	struct nfsd4_compoundres *resp = rqstp->rq_resp;
	struct nfsd4_session *session;
	struct nfsd4_slot *slot;
	int status;

	/* SEQUENCE must be the first operation of the compound. */
	if (resp->opcnt != 1)
		return nfserr_sequence_pos;

	spin_lock(&sessionid_lock);
	status = nfserr_badsession;
	session = find_in_sessionid_hashtbl(&seq->sessionid);
	if (!session)
		goto out;

	status = nfserr_badslot;
	if (seq->slotid >= session->se_fnumslots)
		goto out;

	slot = &session->se_slots[seq->slotid];
	dprintk("%s: slotid %d\n", __func__, seq->slotid);

	status = check_slot_seqid(seq->seqid, slot);
	if (status == nfserr_replay_cache) {
		cstate->slot = slot;
		cstate->session = session;
		/* Return the cached reply status and set cstate->status
		 * for nfsd4_svc_encode_compoundres processing */
		status = nfsd4_replay_cache_entry(resp, seq);
		cstate->status = nfserr_replay_cache;
		goto replay_cache;
	}
	if (status)
		goto out;

	/* Success! bump slot seqid */
	slot->sl_inuse = true;
	slot->sl_seqid = seq->seqid;
	slot->sl_cache_entry.ce_cachethis = seq->cachethis;
	/* Always set the cache entry cachethis for solo sequence */
	if (nfsd4_is_solo_sequence(resp))
		slot->sl_cache_entry.ce_cachethis = 1;

	cstate->slot = slot;
	cstate->session = session;

replay_cache:
	/* Renew the clientid on success and on replay.
	 * Hold a session reference until done processing the compound:
	 * nfsd4_put_session called only if the cstate slot is set.
	 */
	renew_client(session->se_client);
	nfsd4_get_session(session);
out:
	spin_unlock(&sessionid_lock);
	dprintk("%s: return %d\n", __func__, ntohl(status));
	return status;
}
1517
688__be32 1518__be32
689nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1519nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
690 struct nfsd4_setclientid *setclid) 1520 struct nfsd4_setclientid *setclid)
@@ -716,14 +1546,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
716 strhashval = clientstr_hashval(dname); 1546 strhashval = clientstr_hashval(dname);
717 1547
718 nfs4_lock_state(); 1548 nfs4_lock_state();
719 conf = find_confirmed_client_by_str(dname, strhashval); 1549 conf = find_confirmed_client_by_str(dname, strhashval, false);
720 if (conf) { 1550 if (conf) {
721 /* RFC 3530 14.2.33 CASE 0: */ 1551 /* RFC 3530 14.2.33 CASE 0: */
722 status = nfserr_clid_inuse; 1552 status = nfserr_clid_inuse;
723 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) 1553 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
724 || conf->cl_addr != sin->sin_addr.s_addr) { 1554 dprintk("NFSD: setclientid: string in use by client"
725 dprintk("NFSD: setclientid: string in use by clientat %pI4\n", 1555 " at %pI4\n", &conf->cl_addr);
726 &conf->cl_addr);
727 goto out; 1556 goto out;
728 } 1557 }
729 } 1558 }
@@ -732,7 +1561,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
732 * has a description of SETCLIENTID request processing consisting 1561 * has a description of SETCLIENTID request processing consisting
733 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1562 * of 5 bullet points, labeled as CASE0 - CASE4 below.
734 */ 1563 */
735 unconf = find_unconfirmed_client_by_str(dname, strhashval); 1564 unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
736 status = nfserr_resource; 1565 status = nfserr_resource;
737 if (!conf) { 1566 if (!conf) {
738 /* 1567 /*
@@ -887,7 +1716,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
887 unsigned int hash = 1716 unsigned int hash =
888 clientstr_hashval(unconf->cl_recdir); 1717 clientstr_hashval(unconf->cl_recdir);
889 conf = find_confirmed_client_by_str(unconf->cl_recdir, 1718 conf = find_confirmed_client_by_str(unconf->cl_recdir,
890 hash); 1719 hash, false);
891 if (conf) { 1720 if (conf) {
892 nfsd4_remove_clid_dir(conf); 1721 nfsd4_remove_clid_dir(conf);
893 expire_client(conf); 1722 expire_client(conf);
@@ -923,11 +1752,13 @@ alloc_init_file(struct inode *ino)
923 1752
924 fp = kmem_cache_alloc(file_slab, GFP_KERNEL); 1753 fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
925 if (fp) { 1754 if (fp) {
926 kref_init(&fp->fi_ref); 1755 atomic_set(&fp->fi_ref, 1);
927 INIT_LIST_HEAD(&fp->fi_hash); 1756 INIT_LIST_HEAD(&fp->fi_hash);
928 INIT_LIST_HEAD(&fp->fi_stateids); 1757 INIT_LIST_HEAD(&fp->fi_stateids);
929 INIT_LIST_HEAD(&fp->fi_delegations); 1758 INIT_LIST_HEAD(&fp->fi_delegations);
1759 spin_lock(&recall_lock);
930 list_add(&fp->fi_hash, &file_hashtbl[hashval]); 1760 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1761 spin_unlock(&recall_lock);
931 fp->fi_inode = igrab(ino); 1762 fp->fi_inode = igrab(ino);
932 fp->fi_id = current_fileid++; 1763 fp->fi_id = current_fileid++;
933 fp->fi_had_conflict = false; 1764 fp->fi_had_conflict = false;
@@ -1037,48 +1868,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
1037 return sop; 1868 return sop;
1038} 1869}
1039 1870
1040static void
1041release_stateid_lockowners(struct nfs4_stateid *open_stp)
1042{
1043 struct nfs4_stateowner *lock_sop;
1044
1045 while (!list_empty(&open_stp->st_lockowners)) {
1046 lock_sop = list_entry(open_stp->st_lockowners.next,
1047 struct nfs4_stateowner, so_perstateid);
1048 /* list_del(&open_stp->st_lockowners); */
1049 BUG_ON(lock_sop->so_is_open_owner);
1050 release_stateowner(lock_sop);
1051 }
1052}
1053
1054static void
1055unhash_stateowner(struct nfs4_stateowner *sop)
1056{
1057 struct nfs4_stateid *stp;
1058
1059 list_del(&sop->so_idhash);
1060 list_del(&sop->so_strhash);
1061 if (sop->so_is_open_owner)
1062 list_del(&sop->so_perclient);
1063 list_del(&sop->so_perstateid);
1064 while (!list_empty(&sop->so_stateids)) {
1065 stp = list_entry(sop->so_stateids.next,
1066 struct nfs4_stateid, st_perstateowner);
1067 if (sop->so_is_open_owner)
1068 release_stateid(stp, OPEN_STATE);
1069 else
1070 release_stateid(stp, LOCK_STATE);
1071 }
1072}
1073
1074static void
1075release_stateowner(struct nfs4_stateowner *sop)
1076{
1077 unhash_stateowner(sop);
1078 list_del(&sop->so_close_lru);
1079 nfs4_put_stateowner(sop);
1080}
1081
1082static inline void 1871static inline void
1083init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { 1872init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
1084 struct nfs4_stateowner *sop = open->op_stateowner; 1873 struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1889,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
1100 stp->st_stateid.si_generation = 0; 1889 stp->st_stateid.si_generation = 0;
1101 stp->st_access_bmap = 0; 1890 stp->st_access_bmap = 0;
1102 stp->st_deny_bmap = 0; 1891 stp->st_deny_bmap = 0;
1103 __set_bit(open->op_share_access, &stp->st_access_bmap); 1892 __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
1893 &stp->st_access_bmap);
1104 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 1894 __set_bit(open->op_share_deny, &stp->st_deny_bmap);
1105 stp->st_openstp = NULL; 1895 stp->st_openstp = NULL;
1106} 1896}
1107 1897
1108static void 1898static void
1109release_stateid(struct nfs4_stateid *stp, int flags)
1110{
1111 struct file *filp = stp->st_vfs_file;
1112
1113 list_del(&stp->st_hash);
1114 list_del(&stp->st_perfile);
1115 list_del(&stp->st_perstateowner);
1116 if (flags & OPEN_STATE) {
1117 release_stateid_lockowners(stp);
1118 stp->st_vfs_file = NULL;
1119 nfsd_close(filp);
1120 } else if (flags & LOCK_STATE)
1121 locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
1122 put_nfs4_file(stp->st_file);
1123 kmem_cache_free(stateid_slab, stp);
1124}
1125
1126static void
1127move_to_close_lru(struct nfs4_stateowner *sop) 1899move_to_close_lru(struct nfs4_stateowner *sop)
1128{ 1900{
1129 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); 1901 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1932,33 @@ find_file(struct inode *ino)
1160 unsigned int hashval = file_hashval(ino); 1932 unsigned int hashval = file_hashval(ino);
1161 struct nfs4_file *fp; 1933 struct nfs4_file *fp;
1162 1934
1935 spin_lock(&recall_lock);
1163 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { 1936 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
1164 if (fp->fi_inode == ino) { 1937 if (fp->fi_inode == ino) {
1165 get_nfs4_file(fp); 1938 get_nfs4_file(fp);
1939 spin_unlock(&recall_lock);
1166 return fp; 1940 return fp;
1167 } 1941 }
1168 } 1942 }
1943 spin_unlock(&recall_lock);
1169 return NULL; 1944 return NULL;
1170} 1945}
1171 1946
1172static inline int access_valid(u32 x) 1947static inline int access_valid(u32 x, u32 minorversion)
1173{ 1948{
1174 if (x < NFS4_SHARE_ACCESS_READ) 1949 if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
1175 return 0; 1950 return 0;
1176 if (x > NFS4_SHARE_ACCESS_BOTH) 1951 if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
1952 return 0;
1953 x &= ~NFS4_SHARE_ACCESS_MASK;
1954 if (minorversion && x) {
1955 if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
1956 return 0;
1957 if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
1958 return 0;
1959 x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
1960 }
1961 if (x)
1177 return 0; 1962 return 0;
1178 return 1; 1963 return 1;
1179} 1964}
@@ -1409,7 +2194,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
1409 2194
1410 2195
1411__be32 2196__be32
1412nfsd4_process_open1(struct nfsd4_open *open) 2197nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2198 struct nfsd4_open *open)
1413{ 2199{
1414 clientid_t *clientid = &open->op_clientid; 2200 clientid_t *clientid = &open->op_clientid;
1415 struct nfs4_client *clp = NULL; 2201 struct nfs4_client *clp = NULL;
@@ -1432,10 +2218,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
1432 return nfserr_expired; 2218 return nfserr_expired;
1433 goto renew; 2219 goto renew;
1434 } 2220 }
2221 /* When sessions are used, skip open sequenceid processing */
2222 if (nfsd4_has_session(cstate))
2223 goto renew;
1435 if (!sop->so_confirmed) { 2224 if (!sop->so_confirmed) {
1436 /* Replace unconfirmed owners without checking for replay. */ 2225 /* Replace unconfirmed owners without checking for replay. */
1437 clp = sop->so_client; 2226 clp = sop->so_client;
1438 release_stateowner(sop); 2227 release_openowner(sop);
1439 open->op_stateowner = NULL; 2228 open->op_stateowner = NULL;
1440 goto renew; 2229 goto renew;
1441 } 2230 }
@@ -1709,6 +2498,7 @@ out:
1709__be32 2498__be32
1710nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 2499nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
1711{ 2500{
2501 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1712 struct nfs4_file *fp = NULL; 2502 struct nfs4_file *fp = NULL;
1713 struct inode *ino = current_fh->fh_dentry->d_inode; 2503 struct inode *ino = current_fh->fh_dentry->d_inode;
1714 struct nfs4_stateid *stp = NULL; 2504 struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1716 __be32 status; 2506 __be32 status;
1717 2507
1718 status = nfserr_inval; 2508 status = nfserr_inval;
1719 if (!access_valid(open->op_share_access) 2509 if (!access_valid(open->op_share_access, resp->cstate.minorversion)
1720 || !deny_valid(open->op_share_deny)) 2510 || !deny_valid(open->op_share_deny))
1721 goto out; 2511 goto out;
1722 /* 2512 /*
@@ -1764,12 +2554,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1764 init_stateid(stp, fp, open); 2554 init_stateid(stp, fp, open);
1765 status = nfsd4_truncate(rqstp, current_fh, open); 2555 status = nfsd4_truncate(rqstp, current_fh, open);
1766 if (status) { 2556 if (status) {
1767 release_stateid(stp, OPEN_STATE); 2557 release_open_stateid(stp);
1768 goto out; 2558 goto out;
1769 } 2559 }
2560 if (nfsd4_has_session(&resp->cstate))
2561 update_stateid(&stp->st_stateid);
1770 } 2562 }
1771 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2563 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
1772 2564
2565 if (nfsd4_has_session(&resp->cstate))
2566 open->op_stateowner->so_confirmed = 1;
2567
1773 /* 2568 /*
1774 * Attempt to hand out a delegation. No error return, because the 2569 * Attempt to hand out a delegation. No error return, because the
1775 * OPEN succeeds even if we fail. 2570 * OPEN succeeds even if we fail.
@@ -1790,7 +2585,8 @@ out:
1790 * To finish the open response, we just need to set the rflags. 2585 * To finish the open response, we just need to set the rflags.
1791 */ 2586 */
1792 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; 2587 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
1793 if (!open->op_stateowner->so_confirmed) 2588 if (!open->op_stateowner->so_confirmed &&
2589 !nfsd4_has_session(&resp->cstate))
1794 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; 2590 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
1795 2591
1796 return status; 2592 return status;
@@ -1898,7 +2694,7 @@ nfs4_laundromat(void)
1898 } 2694 }
1899 dprintk("NFSD: purging unused open stateowner (so_id %d)\n", 2695 dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
1900 sop->so_id); 2696 sop->so_id);
1901 release_stateowner(sop); 2697 release_openowner(sop);
1902 } 2698 }
1903 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) 2699 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
1904 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; 2700 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2779,7 @@ out:
1983static inline __be32 2779static inline __be32
1984check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) 2780check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1985{ 2781{
1986 /* Trying to call delegreturn with a special stateid? Yuch: */ 2782 if (ONE_STATEID(stateid) && (flags & RD_STATE))
1987 if (!(flags & (RD_STATE | WR_STATE)))
1988 return nfserr_bad_stateid;
1989 else if (ONE_STATEID(stateid) && (flags & RD_STATE))
1990 return nfs_ok; 2783 return nfs_ok;
1991 else if (locks_in_grace()) { 2784 else if (locks_in_grace()) {
1992 /* Answer in remaining cases depends on existance of 2785 /* Answer in remaining cases depends on existance of
@@ -2005,14 +2798,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
2005 * that are not able to provide mandatory locking. 2798 * that are not able to provide mandatory locking.
2006 */ 2799 */
2007static inline int 2800static inline int
2008io_during_grace_disallowed(struct inode *inode, int flags) 2801grace_disallows_io(struct inode *inode)
2009{ 2802{
2010 return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) 2803 return locks_in_grace() && mandatory_lock(inode);
2011 && mandatory_lock(inode);
2012} 2804}
2013 2805
2014static int check_stateid_generation(stateid_t *in, stateid_t *ref) 2806static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
2015{ 2807{
2808 /*
2809 * When sessions are used the stateid generation number is ignored
2810 * when it is zero.
2811 */
2812 if ((flags & HAS_SESSION) && in->si_generation == 0)
2813 goto out;
2814
2016 /* If the client sends us a stateid from the future, it's buggy: */ 2815 /* If the client sends us a stateid from the future, it's buggy: */
2017 if (in->si_generation > ref->si_generation) 2816 if (in->si_generation > ref->si_generation)
2018 return nfserr_bad_stateid; 2817 return nfserr_bad_stateid;
@@ -2028,74 +2827,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
2028 */ 2827 */
2029 if (in->si_generation < ref->si_generation) 2828 if (in->si_generation < ref->si_generation)
2030 return nfserr_old_stateid; 2829 return nfserr_old_stateid;
2830out:
2031 return nfs_ok; 2831 return nfs_ok;
2032} 2832}
2033 2833
2834static int is_delegation_stateid(stateid_t *stateid)
2835{
2836 return stateid->si_fileid == 0;
2837}
2838
2034/* 2839/*
2035* Checks for stateid operations 2840* Checks for stateid operations
2036*/ 2841*/
2037__be32 2842__be32
2038nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) 2843nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2844 stateid_t *stateid, int flags, struct file **filpp)
2039{ 2845{
2040 struct nfs4_stateid *stp = NULL; 2846 struct nfs4_stateid *stp = NULL;
2041 struct nfs4_delegation *dp = NULL; 2847 struct nfs4_delegation *dp = NULL;
2042 stateid_t *stidp; 2848 struct svc_fh *current_fh = &cstate->current_fh;
2043 struct inode *ino = current_fh->fh_dentry->d_inode; 2849 struct inode *ino = current_fh->fh_dentry->d_inode;
2044 __be32 status; 2850 __be32 status;
2045 2851
2046 dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
2047 stateid->si_boot, stateid->si_stateownerid,
2048 stateid->si_fileid, stateid->si_generation);
2049 if (filpp) 2852 if (filpp)
2050 *filpp = NULL; 2853 *filpp = NULL;
2051 2854
2052 if (io_during_grace_disallowed(ino, flags)) 2855 if (grace_disallows_io(ino))
2053 return nfserr_grace; 2856 return nfserr_grace;
2054 2857
2858 if (nfsd4_has_session(cstate))
2859 flags |= HAS_SESSION;
2860
2055 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 2861 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2056 return check_special_stateids(current_fh, stateid, flags); 2862 return check_special_stateids(current_fh, stateid, flags);
2057 2863
2058 /* STALE STATEID */
2059 status = nfserr_stale_stateid; 2864 status = nfserr_stale_stateid;
2060 if (STALE_STATEID(stateid)) 2865 if (STALE_STATEID(stateid))
2061 goto out; 2866 goto out;
2062 2867
2063 /* BAD STATEID */
2064 status = nfserr_bad_stateid; 2868 status = nfserr_bad_stateid;
2065 if (!stateid->si_fileid) { /* delegation stateid */ 2869 if (is_delegation_stateid(stateid)) {
2066 if(!(dp = find_delegation_stateid(ino, stateid))) { 2870 dp = find_delegation_stateid(ino, stateid);
2067 dprintk("NFSD: delegation stateid not found\n"); 2871 if (!dp)
2068 goto out; 2872 goto out;
2069 } 2873 status = check_stateid_generation(stateid, &dp->dl_stateid,
2070 stidp = &dp->dl_stateid; 2874 flags);
2875 if (status)
2876 goto out;
2877 status = nfs4_check_delegmode(dp, flags);
2878 if (status)
2879 goto out;
2880 renew_client(dp->dl_client);
2881 if (filpp)
2882 *filpp = dp->dl_vfs_file;
2071 } else { /* open or lock stateid */ 2883 } else { /* open or lock stateid */
2072 if (!(stp = find_stateid(stateid, flags))) { 2884 stp = find_stateid(stateid, flags);
2073 dprintk("NFSD: open or lock stateid not found\n"); 2885 if (!stp)
2074 goto out; 2886 goto out;
2075 } 2887 if (nfs4_check_fh(current_fh, stp))
2076 if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
2077 goto out; 2888 goto out;
2078 if (!stp->st_stateowner->so_confirmed) 2889 if (!stp->st_stateowner->so_confirmed)
2079 goto out; 2890 goto out;
2080 stidp = &stp->st_stateid; 2891 status = check_stateid_generation(stateid, &stp->st_stateid,
2081 } 2892 flags);
2082 status = check_stateid_generation(stateid, stidp); 2893 if (status)
2083 if (status) 2894 goto out;
2084 goto out; 2895 status = nfs4_check_openmode(stp, flags);
2085 if (stp) { 2896 if (status)
2086 if ((status = nfs4_check_openmode(stp,flags)))
2087 goto out; 2897 goto out;
2088 renew_client(stp->st_stateowner->so_client); 2898 renew_client(stp->st_stateowner->so_client);
2089 if (filpp) 2899 if (filpp)
2090 *filpp = stp->st_vfs_file; 2900 *filpp = stp->st_vfs_file;
2091 } else {
2092 if ((status = nfs4_check_delegmode(dp, flags)))
2093 goto out;
2094 renew_client(dp->dl_client);
2095 if (flags & DELEG_RET)
2096 unhash_delegation(dp);
2097 if (filpp)
2098 *filpp = dp->dl_vfs_file;
2099 } 2901 }
2100 status = nfs_ok; 2902 status = nfs_ok;
2101out: 2903out:
@@ -2113,10 +2915,14 @@ setlkflg (int type)
2113 * Checks for sequence id mutating operations. 2915 * Checks for sequence id mutating operations.
2114 */ 2916 */
2115static __be32 2917static __be32
2116nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) 2918nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2919 stateid_t *stateid, int flags,
2920 struct nfs4_stateowner **sopp,
2921 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
2117{ 2922{
2118 struct nfs4_stateid *stp; 2923 struct nfs4_stateid *stp;
2119 struct nfs4_stateowner *sop; 2924 struct nfs4_stateowner *sop;
2925 struct svc_fh *current_fh = &cstate->current_fh;
2120 __be32 status; 2926 __be32 status;
2121 2927
2122 dprintk("NFSD: preprocess_seqid_op: seqid=%d " 2928 dprintk("NFSD: preprocess_seqid_op: seqid=%d "
@@ -2134,6 +2940,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2134 2940
2135 if (STALE_STATEID(stateid)) 2941 if (STALE_STATEID(stateid))
2136 return nfserr_stale_stateid; 2942 return nfserr_stale_stateid;
2943
2944 if (nfsd4_has_session(cstate))
2945 flags |= HAS_SESSION;
2946
2137 /* 2947 /*
2138 * We return BAD_STATEID if filehandle doesn't match stateid, 2948 * We return BAD_STATEID if filehandle doesn't match stateid,
2139 * the confirmed flag is incorrecly set, or the generation 2949 * the confirmed flag is incorrecly set, or the generation
@@ -2166,8 +2976,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2166 if (lock->lk_is_new) { 2976 if (lock->lk_is_new) {
2167 if (!sop->so_is_open_owner) 2977 if (!sop->so_is_open_owner)
2168 return nfserr_bad_stateid; 2978 return nfserr_bad_stateid;
2169 if (!same_clid(&clp->cl_clientid, lockclid)) 2979 if (!(flags & HAS_SESSION) &&
2170 return nfserr_bad_stateid; 2980 !same_clid(&clp->cl_clientid, lockclid))
2981 return nfserr_bad_stateid;
2171 /* stp is the open stateid */ 2982 /* stp is the open stateid */
2172 status = nfs4_check_openmode(stp, lkflg); 2983 status = nfs4_check_openmode(stp, lkflg);
2173 if (status) 2984 if (status)
@@ -2190,7 +3001,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2190 * For the moment, we ignore the possibility of 3001 * For the moment, we ignore the possibility of
2191 * generation number wraparound. 3002 * generation number wraparound.
2192 */ 3003 */
2193 if (seqid != sop->so_seqid) 3004 if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
2194 goto check_replay; 3005 goto check_replay;
2195 3006
2196 if (sop->so_confirmed && flags & CONFIRM) { 3007 if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3014,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2203 " confirmed yet!\n"); 3014 " confirmed yet!\n");
2204 return nfserr_bad_stateid; 3015 return nfserr_bad_stateid;
2205 } 3016 }
2206 status = check_stateid_generation(stateid, &stp->st_stateid); 3017 status = check_stateid_generation(stateid, &stp->st_stateid, flags);
2207 if (status) 3018 if (status)
2208 return status; 3019 return status;
2209 renew_client(sop->so_client); 3020 renew_client(sop->so_client);
@@ -2239,7 +3050,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2239 3050
2240 nfs4_lock_state(); 3051 nfs4_lock_state();
2241 3052
2242 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3053 if ((status = nfs4_preprocess_seqid_op(cstate,
2243 oc->oc_seqid, &oc->oc_req_stateid, 3054 oc->oc_seqid, &oc->oc_req_stateid,
2244 CONFIRM | OPEN_STATE, 3055 CONFIRM | OPEN_STATE,
2245 &oc->oc_stateowner, &stp, NULL))) 3056 &oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3115,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
2304 (int)cstate->current_fh.fh_dentry->d_name.len, 3115 (int)cstate->current_fh.fh_dentry->d_name.len,
2305 cstate->current_fh.fh_dentry->d_name.name); 3116 cstate->current_fh.fh_dentry->d_name.name);
2306 3117
2307 if (!access_valid(od->od_share_access) 3118 if (!access_valid(od->od_share_access, cstate->minorversion)
2308 || !deny_valid(od->od_share_deny)) 3119 || !deny_valid(od->od_share_deny))
2309 return nfserr_inval; 3120 return nfserr_inval;
2310 3121
2311 nfs4_lock_state(); 3122 nfs4_lock_state();
2312 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3123 if ((status = nfs4_preprocess_seqid_op(cstate,
2313 od->od_seqid, 3124 od->od_seqid,
2314 &od->od_stateid, 3125 &od->od_stateid,
2315 OPEN_STATE, 3126 OPEN_STATE,
@@ -2362,7 +3173,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2362 3173
2363 nfs4_lock_state(); 3174 nfs4_lock_state();
2364 /* check close_lru for replay */ 3175 /* check close_lru for replay */
2365 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3176 if ((status = nfs4_preprocess_seqid_op(cstate,
2366 close->cl_seqid, 3177 close->cl_seqid,
2367 &close->cl_stateid, 3178 &close->cl_stateid,
2368 OPEN_STATE | CLOSE_STATE, 3179 OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3184,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2373 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); 3184 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
2374 3185
2375 /* release_stateid() calls nfsd_close() if needed */ 3186 /* release_stateid() calls nfsd_close() if needed */
2376 release_stateid(stp, OPEN_STATE); 3187 release_open_stateid(stp);
2377 3188
2378 /* place unused nfs4_stateowners on so_close_lru list to be 3189 /* place unused nfs4_stateowners on so_close_lru list to be
2379 * released by the laundromat service after the lease period 3190 * released by the laundromat service after the lease period
@@ -2394,16 +3205,40 @@ __be32
2394nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3205nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2395 struct nfsd4_delegreturn *dr) 3206 struct nfsd4_delegreturn *dr)
2396{ 3207{
3208 struct nfs4_delegation *dp;
3209 stateid_t *stateid = &dr->dr_stateid;
3210 struct inode *inode;
2397 __be32 status; 3211 __be32 status;
3212 int flags = 0;
2398 3213
2399 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 3214 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
2400 goto out; 3215 return status;
3216 inode = cstate->current_fh.fh_dentry->d_inode;
2401 3217
3218 if (nfsd4_has_session(cstate))
3219 flags |= HAS_SESSION;
2402 nfs4_lock_state(); 3220 nfs4_lock_state();
2403 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 3221 status = nfserr_bad_stateid;
2404 &dr->dr_stateid, DELEG_RET, NULL); 3222 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2405 nfs4_unlock_state(); 3223 goto out;
3224 status = nfserr_stale_stateid;
3225 if (STALE_STATEID(stateid))
3226 goto out;
3227 status = nfserr_bad_stateid;
3228 if (!is_delegation_stateid(stateid))
3229 goto out;
3230 dp = find_delegation_stateid(inode, stateid);
3231 if (!dp)
3232 goto out;
3233 status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
3234 if (status)
3235 goto out;
3236 renew_client(dp->dl_client);
3237
3238 unhash_delegation(dp);
2406out: 3239out:
3240 nfs4_unlock_state();
3241
2407 return status; 3242 return status;
2408} 3243}
2409 3244
@@ -2684,11 +3519,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2684 struct nfs4_file *fp; 3519 struct nfs4_file *fp;
2685 3520
2686 status = nfserr_stale_clientid; 3521 status = nfserr_stale_clientid;
2687 if (STALE_CLIENTID(&lock->lk_new_clientid)) 3522 if (!nfsd4_has_session(cstate) &&
3523 STALE_CLIENTID(&lock->lk_new_clientid))
2688 goto out; 3524 goto out;
2689 3525
2690 /* validate and update open stateid and open seqid */ 3526 /* validate and update open stateid and open seqid */
2691 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3527 status = nfs4_preprocess_seqid_op(cstate,
2692 lock->lk_new_open_seqid, 3528 lock->lk_new_open_seqid,
2693 &lock->lk_new_open_stateid, 3529 &lock->lk_new_open_stateid,
2694 OPEN_STATE, 3530 OPEN_STATE,
@@ -2715,7 +3551,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2715 goto out; 3551 goto out;
2716 } else { 3552 } else {
2717 /* lock (lock owner + lock stateid) already exists */ 3553 /* lock (lock owner + lock stateid) already exists */
2718 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3554 status = nfs4_preprocess_seqid_op(cstate,
2719 lock->lk_old_lock_seqid, 3555 lock->lk_old_lock_seqid,
2720 &lock->lk_old_lock_stateid, 3556 &lock->lk_old_lock_stateid,
2721 LOCK_STATE, 3557 LOCK_STATE,
@@ -2788,7 +3624,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2788 } 3624 }
2789out: 3625out:
2790 if (status && lock->lk_is_new && lock_sop) 3626 if (status && lock->lk_is_new && lock_sop)
2791 release_stateowner(lock_sop); 3627 release_lockowner(lock_sop);
2792 if (lock->lk_replay_owner) { 3628 if (lock->lk_replay_owner) {
2793 nfs4_get_stateowner(lock->lk_replay_owner); 3629 nfs4_get_stateowner(lock->lk_replay_owner);
2794 cstate->replay_owner = lock->lk_replay_owner; 3630 cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3674,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2838 nfs4_lock_state(); 3674 nfs4_lock_state();
2839 3675
2840 status = nfserr_stale_clientid; 3676 status = nfserr_stale_clientid;
2841 if (STALE_CLIENTID(&lockt->lt_clientid)) 3677 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
2842 goto out; 3678 goto out;
2843 3679
2844 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { 3680 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3747,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2911 3747
2912 nfs4_lock_state(); 3748 nfs4_lock_state();
2913 3749
2914 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3750 if ((status = nfs4_preprocess_seqid_op(cstate,
2915 locku->lu_seqid, 3751 locku->lu_seqid,
2916 &locku->lu_stateid, 3752 &locku->lu_stateid,
2917 LOCK_STATE, 3753 LOCK_STATE,
@@ -3037,7 +3873,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
3037 /* unhash_stateowner deletes so_perclient only 3873 /* unhash_stateowner deletes so_perclient only
3038 * for openowners. */ 3874 * for openowners. */
3039 list_del(&sop->so_perclient); 3875 list_del(&sop->so_perclient);
3040 release_stateowner(sop); 3876 release_lockowner(sop);
3041 } 3877 }
3042out: 3878out:
3043 nfs4_unlock_state(); 3879 nfs4_unlock_state();
@@ -3051,12 +3887,12 @@ alloc_reclaim(void)
3051} 3887}
3052 3888
3053int 3889int
3054nfs4_has_reclaimed_state(const char *name) 3890nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
3055{ 3891{
3056 unsigned int strhashval = clientstr_hashval(name); 3892 unsigned int strhashval = clientstr_hashval(name);
3057 struct nfs4_client *clp; 3893 struct nfs4_client *clp;
3058 3894
3059 clp = find_confirmed_client_by_str(name, strhashval); 3895 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
3060 return clp ? 1 : 0; 3896 return clp ? 1 : 0;
3061} 3897}
3062 3898
@@ -3153,6 +3989,8 @@ nfs4_state_init(void)
3153 INIT_LIST_HEAD(&unconf_str_hashtbl[i]); 3989 INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
3154 INIT_LIST_HEAD(&unconf_id_hashtbl[i]); 3990 INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
3155 } 3991 }
3992 for (i = 0; i < SESSION_HASH_SIZE; i++)
3993 INIT_LIST_HEAD(&sessionid_hashtbl[i]);
3156 for (i = 0; i < FILE_HASH_SIZE; i++) { 3994 for (i = 0; i < FILE_HASH_SIZE; i++) {
3157 INIT_LIST_HEAD(&file_hashtbl[i]); 3995 INIT_LIST_HEAD(&file_hashtbl[i]);
3158 } 3996 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d8..b820c311931c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
45#include <linux/fs.h> 45#include <linux/fs.h>
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/vfs.h> 47#include <linux/vfs.h>
48#include <linux/utsname.h>
48#include <linux/sunrpc/xdr.h> 49#include <linux/sunrpc/xdr.h>
49#include <linux/sunrpc/svc.h> 50#include <linux/sunrpc/svc.h>
50#include <linux/sunrpc/clnt.h> 51#include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
188 return p; 189 return p;
189} 190}
190 191
192static int zero_clientid(clientid_t *clid)
193{
194 return (clid->cl_boot == 0) && (clid->cl_id == 0);
195}
196
191static int 197static int
192defer_free(struct nfsd4_compoundargs *argp, 198defer_free(struct nfsd4_compoundargs *argp,
193 void (*release)(const void *), void *p) 199 void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
230 236
231 bmval[0] = 0; 237 bmval[0] = 0;
232 bmval[1] = 0; 238 bmval[1] = 0;
239 bmval[2] = 0;
233 240
234 READ_BUF(4); 241 READ_BUF(4);
235 READ32(bmlen); 242 READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
241 READ32(bmval[0]); 248 READ32(bmval[0]);
242 if (bmlen > 1) 249 if (bmlen > 1)
243 READ32(bmval[1]); 250 READ32(bmval[1]);
251 if (bmlen > 2)
252 READ32(bmval[2]);
244 253
245 DECODE_TAIL; 254 DECODE_TAIL;
246} 255}
247 256
257static u32 nfsd_attrmask[] = {
258 NFSD_WRITEABLE_ATTRS_WORD0,
259 NFSD_WRITEABLE_ATTRS_WORD1,
260 NFSD_WRITEABLE_ATTRS_WORD2
261};
262
263static u32 nfsd41_ex_attrmask[] = {
264 NFSD_SUPPATTR_EXCLCREAT_WORD0,
265 NFSD_SUPPATTR_EXCLCREAT_WORD1,
266 NFSD_SUPPATTR_EXCLCREAT_WORD2
267};
268
248static __be32 269static __be32
249nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, 270nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
250 struct nfs4_acl **acl) 271 struct iattr *iattr, struct nfs4_acl **acl)
251{ 272{
252 int expected_len, len = 0; 273 int expected_len, len = 0;
253 u32 dummy32; 274 u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
263 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; 284 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
264 * read-only attributes return ERR_INVAL. 285 * read-only attributes return ERR_INVAL.
265 */ 286 */
266 if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 287 if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
288 (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
289 (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
267 return nfserr_attrnotsupp; 290 return nfserr_attrnotsupp;
268 if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1)) 291 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
292 (bmval[2] & ~writable[2]))
269 return nfserr_inval; 293 return nfserr_inval;
270 294
271 READ_BUF(4); 295 READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
400 goto xdr_error; 424 goto xdr_error;
401 } 425 }
402 } 426 }
427 BUG_ON(bmval[2]); /* no such writeable attr supported yet */
403 if (len != expected_len) 428 if (len != expected_len)
404 goto xdr_error; 429 goto xdr_error;
405 430
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 518 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
494 return status; 519 return status;
495 520
496 if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) 521 status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
522 &create->cr_iattr, &create->cr_acl);
523 if (status)
497 goto out; 524 goto out;
498 525
499 DECODE_TAIL; 526 DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
583 READ_BUF(lockt->lt_owner.len); 610 READ_BUF(lockt->lt_owner.len);
584 READMEM(lockt->lt_owner.data, lockt->lt_owner.len); 611 READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
585 612
613 if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
614 return nfserr_inval;
586 DECODE_TAIL; 615 DECODE_TAIL;
587} 616}
588 617
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
652 switch (open->op_createmode) { 681 switch (open->op_createmode) {
653 case NFS4_CREATE_UNCHECKED: 682 case NFS4_CREATE_UNCHECKED:
654 case NFS4_CREATE_GUARDED: 683 case NFS4_CREATE_GUARDED:
655 if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) 684 status = nfsd4_decode_fattr(argp, open->op_bmval,
685 nfsd_attrmask, &open->op_iattr, &open->op_acl);
686 if (status)
656 goto out; 687 goto out;
657 break; 688 break;
658 case NFS4_CREATE_EXCLUSIVE: 689 case NFS4_CREATE_EXCLUSIVE:
659 READ_BUF(8); 690 READ_BUF(8);
660 COPYMEM(open->op_verf.data, 8); 691 COPYMEM(open->op_verf.data, 8);
661 break; 692 break;
693 case NFS4_CREATE_EXCLUSIVE4_1:
694 if (argp->minorversion < 1)
695 goto xdr_error;
696 READ_BUF(8);
697 COPYMEM(open->op_verf.data, 8);
698 status = nfsd4_decode_fattr(argp, open->op_bmval,
699 nfsd41_ex_attrmask, &open->op_iattr,
700 &open->op_acl);
701 if (status)
702 goto out;
703 break;
662 default: 704 default:
663 goto xdr_error; 705 goto xdr_error;
664 } 706 }
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
851 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); 893 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
852 if (status) 894 if (status)
853 return status; 895 return status;
854 return nfsd4_decode_fattr(argp, setattr->sa_bmval, 896 return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
855 &setattr->sa_iattr, &setattr->sa_acl); 897 &setattr->sa_iattr, &setattr->sa_acl);
856} 898}
857 899
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
993 READ_BUF(rlockowner->rl_owner.len); 1035 READ_BUF(rlockowner->rl_owner.len);
994 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); 1036 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
995 1037
1038 if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
1039 return nfserr_inval;
1040 DECODE_TAIL;
1041}
1042
1043static __be32
1044nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1045 struct nfsd4_exchange_id *exid)
1046{
1047 int dummy;
1048 DECODE_HEAD;
1049
1050 READ_BUF(NFS4_VERIFIER_SIZE);
1051 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
1052
1053 READ_BUF(4);
1054 READ32(exid->clname.len);
1055
1056 READ_BUF(exid->clname.len);
1057 SAVEMEM(exid->clname.data, exid->clname.len);
1058
1059 READ_BUF(4);
1060 READ32(exid->flags);
1061
1062 /* Ignore state_protect4_a */
1063 READ_BUF(4);
1064 READ32(exid->spa_how);
1065 switch (exid->spa_how) {
1066 case SP4_NONE:
1067 break;
1068 case SP4_MACH_CRED:
1069 /* spo_must_enforce */
1070 READ_BUF(4);
1071 READ32(dummy);
1072 READ_BUF(dummy * 4);
1073 p += dummy;
1074
1075 /* spo_must_allow */
1076 READ_BUF(4);
1077 READ32(dummy);
1078 READ_BUF(dummy * 4);
1079 p += dummy;
1080 break;
1081 case SP4_SSV:
1082 /* ssp_ops */
1083 READ_BUF(4);
1084 READ32(dummy);
1085 READ_BUF(dummy * 4);
1086 p += dummy;
1087
1088 READ_BUF(4);
1089 READ32(dummy);
1090 READ_BUF(dummy * 4);
1091 p += dummy;
1092
1093 /* ssp_hash_algs<> */
1094 READ_BUF(4);
1095 READ32(dummy);
1096 READ_BUF(dummy);
1097 p += XDR_QUADLEN(dummy);
1098
1099 /* ssp_encr_algs<> */
1100 READ_BUF(4);
1101 READ32(dummy);
1102 READ_BUF(dummy);
1103 p += XDR_QUADLEN(dummy);
1104
1105 /* ssp_window and ssp_num_gss_handles */
1106 READ_BUF(8);
1107 READ32(dummy);
1108 READ32(dummy);
1109 break;
1110 default:
1111 goto xdr_error;
1112 }
1113
1114 /* Ignore Implementation ID */
1115 READ_BUF(4); /* nfs_impl_id4 array length */
1116 READ32(dummy);
1117
1118 if (dummy > 1)
1119 goto xdr_error;
1120
1121 if (dummy == 1) {
1122 /* nii_domain */
1123 READ_BUF(4);
1124 READ32(dummy);
1125 READ_BUF(dummy);
1126 p += XDR_QUADLEN(dummy);
1127
1128 /* nii_name */
1129 READ_BUF(4);
1130 READ32(dummy);
1131 READ_BUF(dummy);
1132 p += XDR_QUADLEN(dummy);
1133
1134 /* nii_date */
1135 READ_BUF(12);
1136 p += 3;
1137 }
1138 DECODE_TAIL;
1139}
1140
1141static __be32
1142nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1143 struct nfsd4_create_session *sess)
1144{
1145 DECODE_HEAD;
1146
1147 u32 dummy;
1148 char *machine_name;
1149 int i;
1150 int nr_secflavs;
1151
1152 READ_BUF(16);
1153 COPYMEM(&sess->clientid, 8);
1154 READ32(sess->seqid);
1155 READ32(sess->flags);
1156
1157 /* Fore channel attrs */
1158 READ_BUF(28);
1159 READ32(dummy); /* headerpadsz is always 0 */
1160 READ32(sess->fore_channel.maxreq_sz);
1161 READ32(sess->fore_channel.maxresp_sz);
1162 READ32(sess->fore_channel.maxresp_cached);
1163 READ32(sess->fore_channel.maxops);
1164 READ32(sess->fore_channel.maxreqs);
1165 READ32(sess->fore_channel.nr_rdma_attrs);
1166 if (sess->fore_channel.nr_rdma_attrs == 1) {
1167 READ_BUF(4);
1168 READ32(sess->fore_channel.rdma_attrs);
1169 } else if (sess->fore_channel.nr_rdma_attrs > 1) {
1170 dprintk("Too many fore channel attr bitmaps!\n");
1171 goto xdr_error;
1172 }
1173
1174 /* Back channel attrs */
1175 READ_BUF(28);
1176 READ32(dummy); /* headerpadsz is always 0 */
1177 READ32(sess->back_channel.maxreq_sz);
1178 READ32(sess->back_channel.maxresp_sz);
1179 READ32(sess->back_channel.maxresp_cached);
1180 READ32(sess->back_channel.maxops);
1181 READ32(sess->back_channel.maxreqs);
1182 READ32(sess->back_channel.nr_rdma_attrs);
1183 if (sess->back_channel.nr_rdma_attrs == 1) {
1184 READ_BUF(4);
1185 READ32(sess->back_channel.rdma_attrs);
1186 } else if (sess->back_channel.nr_rdma_attrs > 1) {
1187 dprintk("Too many back channel attr bitmaps!\n");
1188 goto xdr_error;
1189 }
1190
1191 READ_BUF(8);
1192 READ32(sess->callback_prog);
1193
1194 /* callback_sec_params4 */
1195 READ32(nr_secflavs);
1196 for (i = 0; i < nr_secflavs; ++i) {
1197 READ_BUF(4);
1198 READ32(dummy);
1199 switch (dummy) {
1200 case RPC_AUTH_NULL:
1201 /* Nothing to read */
1202 break;
1203 case RPC_AUTH_UNIX:
1204 READ_BUF(8);
1205 /* stamp */
1206 READ32(dummy);
1207
1208 /* machine name */
1209 READ32(dummy);
1210 READ_BUF(dummy);
1211 SAVEMEM(machine_name, dummy);
1212
1213 /* uid, gid */
1214 READ_BUF(8);
1215 READ32(sess->uid);
1216 READ32(sess->gid);
1217
1218 /* more gids */
1219 READ_BUF(4);
1220 READ32(dummy);
1221 READ_BUF(dummy * 4);
1222 for (i = 0; i < dummy; ++i)
1223 READ32(dummy);
1224 break;
1225 case RPC_AUTH_GSS:
1226 dprintk("RPC_AUTH_GSS callback secflavor "
1227 "not supported!\n");
1228 READ_BUF(8);
1229 /* gcbp_service */
1230 READ32(dummy);
1231 /* gcbp_handle_from_server */
1232 READ32(dummy);
1233 READ_BUF(dummy);
1234 p += XDR_QUADLEN(dummy);
1235 /* gcbp_handle_from_client */
1236 READ_BUF(4);
1237 READ32(dummy);
1238 READ_BUF(dummy);
1239 p += XDR_QUADLEN(dummy);
1240 break;
1241 default:
1242 dprintk("Illegal callback secflavor\n");
1243 return nfserr_inval;
1244 }
1245 }
1246 DECODE_TAIL;
1247}
1248
1249static __be32
1250nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
1251 struct nfsd4_destroy_session *destroy_session)
1252{
1253 DECODE_HEAD;
1254 READ_BUF(NFS4_MAX_SESSIONID_LEN);
1255 COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1256
1257 DECODE_TAIL;
1258}
1259
1260static __be32
1261nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
1262 struct nfsd4_sequence *seq)
1263{
1264 DECODE_HEAD;
1265
1266 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
1267 COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1268 READ32(seq->seqid);
1269 READ32(seq->slotid);
1270 READ32(seq->maxslots);
1271 READ32(seq->cachethis);
1272
996 DECODE_TAIL; 1273 DECODE_TAIL;
997} 1274}
998 1275
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1005static __be32 1282static __be32
1006nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) 1283nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
1007{ 1284{
1008 return nfserr_opnotsupp; 1285 return nfserr_notsupp;
1009} 1286}
1010 1287
1011typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); 1288typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1031 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, 1308 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
1032 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, 1309 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1033 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, 1310 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1034 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, 1311 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop,
1035 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, 1312 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1036 [OP_READ] = (nfsd4_dec)nfsd4_decode_read, 1313 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1037 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, 1314 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1050 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, 1327 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
1051}; 1328};
1052 1329
1330static nfsd4_dec nfsd41_dec_ops[] = {
1331 [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access,
1332 [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close,
1333 [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit,
1334 [OP_CREATE] (nfsd4_dec)nfsd4_decode_create,
1335 [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp,
1336 [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn,
1337 [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr,
1338 [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop,
1339 [OP_LINK] (nfsd4_dec)nfsd4_decode_link,
1340 [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock,
1341 [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt,
1342 [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku,
1343 [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup,
1344 [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop,
1345 [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify,
1346 [OP_OPEN] (nfsd4_dec)nfsd4_decode_open,
1347 [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp,
1348 [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp,
1349 [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade,
1350 [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh,
1351 [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp,
1352 [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop,
1353 [OP_READ] (nfsd4_dec)nfsd4_decode_read,
1354 [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir,
1355 [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop,
1356 [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove,
1357 [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename,
1358 [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp,
1359 [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop,
1360 [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop,
1361 [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo,
1362 [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr,
1363 [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1364 [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp,
1365 [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify,
1366 [OP_WRITE] (nfsd4_dec)nfsd4_decode_write,
1367 [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp,
1368
1369 /* new operations for NFSv4.1 */
1370 [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp,
1371 [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp,
1372 [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id,
1373 [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session,
1374 [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session,
1375 [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1376 [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1377 [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp,
1378 [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp,
1379 [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp,
1380 [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp,
1381 [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp,
1382 [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp,
1383 [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence,
1384 [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp,
1385 [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp,
1386 [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp,
1387 [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp,
1388 [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp,
1389};
1390
1053struct nfsd4_minorversion_ops { 1391struct nfsd4_minorversion_ops {
1054 nfsd4_dec *decoders; 1392 nfsd4_dec *decoders;
1055 int nops; 1393 int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
1057 1395
1058static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { 1396static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
1059 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, 1397 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
1398 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1060}; 1399};
1061 1400
1062static __be32 1401static __be32
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1412{ 1751{
1413 u32 bmval0 = bmval[0]; 1752 u32 bmval0 = bmval[0];
1414 u32 bmval1 = bmval[1]; 1753 u32 bmval1 = bmval[1];
1754 u32 bmval2 = bmval[2];
1415 struct kstat stat; 1755 struct kstat stat;
1416 struct svc_fh tempfh; 1756 struct svc_fh tempfh;
1417 struct kstatfs statfs; 1757 struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1425 int err; 1765 int err;
1426 int aclsupport = 0; 1766 int aclsupport = 0;
1427 struct nfs4_acl *acl = NULL; 1767 struct nfs4_acl *acl = NULL;
1768 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1769 u32 minorversion = resp->cstate.minorversion;
1428 1770
1429 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); 1771 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
1430 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); 1772 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
1431 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); 1773 BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
1774 BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
1432 1775
1433 if (exp->ex_fslocs.migrated) { 1776 if (exp->ex_fslocs.migrated) {
1777 BUG_ON(bmval[2]);
1434 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); 1778 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
1435 if (status) 1779 if (status)
1436 goto out; 1780 goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1476 if ((buflen -= 16) < 0) 1820 if ((buflen -= 16) < 0)
1477 goto out_resource; 1821 goto out_resource;
1478 1822
1479 WRITE32(2); 1823 if (unlikely(bmval2)) {
1480 WRITE32(bmval0); 1824 WRITE32(3);
1481 WRITE32(bmval1); 1825 WRITE32(bmval0);
1826 WRITE32(bmval1);
1827 WRITE32(bmval2);
1828 } else if (likely(bmval1)) {
1829 WRITE32(2);
1830 WRITE32(bmval0);
1831 WRITE32(bmval1);
1832 } else {
1833 WRITE32(1);
1834 WRITE32(bmval0);
1835 }
1482 attrlenp = p++; /* to be backfilled later */ 1836 attrlenp = p++; /* to be backfilled later */
1483 1837
1484 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { 1838 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
1485 u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; 1839 u32 word0 = nfsd_suppattrs0(minorversion);
1840 u32 word1 = nfsd_suppattrs1(minorversion);
1841 u32 word2 = nfsd_suppattrs2(minorversion);
1842
1486 if ((buflen -= 12) < 0) 1843 if ((buflen -= 12) < 0)
1487 goto out_resource; 1844 goto out_resource;
1488 if (!aclsupport) 1845 if (!aclsupport)
1489 word0 &= ~FATTR4_WORD0_ACL; 1846 word0 &= ~FATTR4_WORD0_ACL;
1490 if (!exp->ex_fslocs.locations) 1847 if (!exp->ex_fslocs.locations)
1491 word0 &= ~FATTR4_WORD0_FS_LOCATIONS; 1848 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1492 WRITE32(2); 1849 if (!word2) {
1493 WRITE32(word0); 1850 WRITE32(2);
1494 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); 1851 WRITE32(word0);
1852 WRITE32(word1);
1853 } else {
1854 WRITE32(3);
1855 WRITE32(word0);
1856 WRITE32(word1);
1857 WRITE32(word2);
1858 }
1495 } 1859 }
1496 if (bmval0 & FATTR4_WORD0_TYPE) { 1860 if (bmval0 & FATTR4_WORD0_TYPE) {
1497 if ((buflen -= 4) < 0) 1861 if ((buflen -= 4) < 0)
@@ -1801,6 +2165,13 @@ out_acl:
1801 } 2165 }
1802 WRITE64(stat.ino); 2166 WRITE64(stat.ino);
1803 } 2167 }
2168 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2169 WRITE32(3);
2170 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
2171 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
2172 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
2173 }
2174
1804 *attrlenp = htonl((char *)p - (char *)attrlenp - 4); 2175 *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
1805 *countp = p - buffer; 2176 *countp = p - buffer;
1806 status = nfs_ok; 2177 status = nfs_ok;
@@ -2572,6 +2943,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
2572} 2943}
2573 2944
2574static __be32 2945static __be32
2946nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
2947 struct nfsd4_exchange_id *exid)
2948{
2949 ENCODE_HEAD;
2950 char *major_id;
2951 char *server_scope;
2952 int major_id_sz;
2953 int server_scope_sz;
2954 uint64_t minor_id = 0;
2955
2956 if (nfserr)
2957 return nfserr;
2958
2959 major_id = utsname()->nodename;
2960 major_id_sz = strlen(major_id);
2961 server_scope = utsname()->nodename;
2962 server_scope_sz = strlen(server_scope);
2963
2964 RESERVE_SPACE(
2965 8 /* eir_clientid */ +
2966 4 /* eir_sequenceid */ +
2967 4 /* eir_flags */ +
2968 4 /* spr_how (SP4_NONE) */ +
2969 8 /* so_minor_id */ +
2970 4 /* so_major_id.len */ +
2971 (XDR_QUADLEN(major_id_sz) * 4) +
2972 4 /* eir_server_scope.len */ +
2973 (XDR_QUADLEN(server_scope_sz) * 4) +
2974 4 /* eir_server_impl_id.count (0) */);
2975
2976 WRITEMEM(&exid->clientid, 8);
2977 WRITE32(exid->seqid);
2978 WRITE32(exid->flags);
2979
2980 /* state_protect4_r. Currently only support SP4_NONE */
2981 BUG_ON(exid->spa_how != SP4_NONE);
2982 WRITE32(exid->spa_how);
2983
2984 /* The server_owner struct */
2985 WRITE64(minor_id); /* Minor id */
2986 /* major id */
2987 WRITE32(major_id_sz);
2988 WRITEMEM(major_id, major_id_sz);
2989
2990 /* Server scope */
2991 WRITE32(server_scope_sz);
2992 WRITEMEM(server_scope, server_scope_sz);
2993
2994 /* Implementation id */
2995 WRITE32(0); /* zero length nfs_impl_id4 array */
2996 ADJUST_ARGS();
2997 return 0;
2998}
2999
3000static __be32
3001nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3002 struct nfsd4_create_session *sess)
3003{
3004 ENCODE_HEAD;
3005
3006 if (nfserr)
3007 return nfserr;
3008
3009 RESERVE_SPACE(24);
3010 WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3011 WRITE32(sess->seqid);
3012 WRITE32(sess->flags);
3013 ADJUST_ARGS();
3014
3015 RESERVE_SPACE(28);
3016 WRITE32(0); /* headerpadsz */
3017 WRITE32(sess->fore_channel.maxreq_sz);
3018 WRITE32(sess->fore_channel.maxresp_sz);
3019 WRITE32(sess->fore_channel.maxresp_cached);
3020 WRITE32(sess->fore_channel.maxops);
3021 WRITE32(sess->fore_channel.maxreqs);
3022 WRITE32(sess->fore_channel.nr_rdma_attrs);
3023 ADJUST_ARGS();
3024
3025 if (sess->fore_channel.nr_rdma_attrs) {
3026 RESERVE_SPACE(4);
3027 WRITE32(sess->fore_channel.rdma_attrs);
3028 ADJUST_ARGS();
3029 }
3030
3031 RESERVE_SPACE(28);
3032 WRITE32(0); /* headerpadsz */
3033 WRITE32(sess->back_channel.maxreq_sz);
3034 WRITE32(sess->back_channel.maxresp_sz);
3035 WRITE32(sess->back_channel.maxresp_cached);
3036 WRITE32(sess->back_channel.maxops);
3037 WRITE32(sess->back_channel.maxreqs);
3038 WRITE32(sess->back_channel.nr_rdma_attrs);
3039 ADJUST_ARGS();
3040
3041 if (sess->back_channel.nr_rdma_attrs) {
3042 RESERVE_SPACE(4);
3043 WRITE32(sess->back_channel.rdma_attrs);
3044 ADJUST_ARGS();
3045 }
3046 return 0;
3047}
3048
3049static __be32
3050nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
3051 struct nfsd4_destroy_session *destroy_session)
3052{
3053 return nfserr;
3054}
3055
3056__be32
3057nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3058 struct nfsd4_sequence *seq)
3059{
3060 ENCODE_HEAD;
3061
3062 if (nfserr)
3063 return nfserr;
3064
3065 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
3066 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3067 WRITE32(seq->seqid);
3068 WRITE32(seq->slotid);
3069 WRITE32(seq->maxslots);
3070 /*
3071 * FIXME: for now:
3072 * target_maxslots = maxslots
3073 * status_flags = 0
3074 */
3075 WRITE32(seq->maxslots);
3076 WRITE32(0);
3077
3078 ADJUST_ARGS();
3079 return 0;
3080}
3081
3082static __be32
2575nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3083nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2576{ 3084{
2577 return nfserr; 3085 return nfserr;
@@ -2579,6 +3087,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2579 3087
2580typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); 3088typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
2581 3089
3090/*
3091 * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
3092 * since we don't need to filter out obsolete ops as this is
3093 * done in the decoding phase.
3094 */
2582static nfsd4_enc nfsd4_enc_ops[] = { 3095static nfsd4_enc nfsd4_enc_ops[] = {
2583 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, 3096 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
2584 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, 3097 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
@@ -2617,8 +3130,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
2617 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, 3130 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
2618 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, 3131 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
2619 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, 3132 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
3133
3134 /* NFSv4.1 operations */
3135 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3136 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3137 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3138 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3139 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
3140 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3141 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3142 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3143 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3144 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3145 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3146 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3147 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
3148 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3149 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3150 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3151 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3152 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3153 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
2620}; 3154};
2621 3155
3156/*
3157 * Calculate the total amount of memory that the compound response has taken
3158 * after encoding the current operation.
3159 *
3160 * pad: add on 8 bytes for the next operation's op_code and status so that
3161 * there is room to cache a failure on the next operation.
3162 *
3163 * Compare this length to the session se_fmaxresp_cached.
3164 *
3165 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3166 * will be at least a page and will therefore hold the xdr_buf head.
3167 */
3168static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3169{
3170 int status = 0;
3171 struct xdr_buf *xb = &resp->rqstp->rq_res;
3172 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
3173 struct nfsd4_session *session = NULL;
3174 struct nfsd4_slot *slot = resp->cstate.slot;
3175 u32 length, tlen = 0, pad = 8;
3176
3177 if (!nfsd4_has_session(&resp->cstate))
3178 return status;
3179
3180 session = resp->cstate.session;
3181 if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
3182 return status;
3183
3184 if (resp->opcnt >= args->opcnt)
3185 pad = 0; /* this is the last operation */
3186
3187 if (xb->page_len == 0) {
3188 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
3189 } else {
3190 if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
3191 tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
3192
3193 length = xb->head[0].iov_len + xb->page_len + tlen + pad;
3194 }
3195 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3196 length, xb->page_len, tlen, pad);
3197
3198 if (length <= session->se_fmaxresp_cached)
3199 return status;
3200 else
3201 return nfserr_rep_too_big_to_cache;
3202}
3203
2622void 3204void
2623nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3205nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2624{ 3206{
@@ -2635,6 +3217,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2635 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 3217 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
2636 !nfsd4_enc_ops[op->opnum]); 3218 !nfsd4_enc_ops[op->opnum]);
2637 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3219 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3220 /* nfsd4_check_drc_limit guarantees enough room for error status */
3221 if (!op->status && nfsd4_check_drc_limit(resp))
3222 op->status = nfserr_rep_too_big_to_cache;
2638status: 3223status:
2639 /* 3224 /*
2640 * Note: We write the status directly, instead of using WRITE32(), 3225 * Note: We write the status directly, instead of using WRITE32(),
@@ -2735,6 +3320,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
2735 iov = &rqstp->rq_res.head[0]; 3320 iov = &rqstp->rq_res.head[0];
2736 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3321 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
2737 BUG_ON(iov->iov_len > PAGE_SIZE); 3322 BUG_ON(iov->iov_len > PAGE_SIZE);
3323 if (nfsd4_has_session(&resp->cstate)) {
3324 if (resp->cstate.status == nfserr_replay_cache &&
3325 !nfsd4_not_cached(resp)) {
3326 iov->iov_len = resp->cstate.iovlen;
3327 } else {
3328 nfsd4_store_cache_entry(resp);
3329 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3330 resp->cstate.slot->sl_inuse = 0;
3331 }
3332 if (resp->cstate.session)
3333 nfsd4_put_session(resp->cstate.session);
3334 }
2738 return 1; 3335 return 1;
2739} 3336}
2740 3337
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a4ed8644d69c..af16849d243a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
60 NFSD_FO_UnlockFS, 60 NFSD_FO_UnlockFS,
61 NFSD_Threads, 61 NFSD_Threads,
62 NFSD_Pool_Threads, 62 NFSD_Pool_Threads,
63 NFSD_Pool_Stats,
63 NFSD_Versions, 64 NFSD_Versions,
64 NFSD_Ports, 65 NFSD_Ports,
65 NFSD_MaxBlkSize, 66 NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
172 .owner = THIS_MODULE, 173 .owner = THIS_MODULE,
173}; 174};
174 175
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177
178static struct file_operations pool_stats_operations = {
179 .open = nfsd_pool_stats_open,
180 .read = seq_read,
181 .llseek = seq_lseek,
182 .release = seq_release,
183 .owner = THIS_MODULE,
184};
185
175/*----------------------------------------------------------------------------*/ 186/*----------------------------------------------------------------------------*/
176/* 187/*
177 * payload - write methods 188 * payload - write methods
@@ -781,8 +792,9 @@ out_free:
781static ssize_t __write_versions(struct file *file, char *buf, size_t size) 792static ssize_t __write_versions(struct file *file, char *buf, size_t size)
782{ 793{
783 char *mesg = buf; 794 char *mesg = buf;
784 char *vers, sign; 795 char *vers, *minorp, sign;
785 int len, num; 796 int len, num;
797 unsigned minor;
786 ssize_t tlen = 0; 798 ssize_t tlen = 0;
787 char *sep; 799 char *sep;
788 800
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
803 do { 815 do {
804 sign = *vers; 816 sign = *vers;
805 if (sign == '+' || sign == '-') 817 if (sign == '+' || sign == '-')
806 num = simple_strtol((vers+1), NULL, 0); 818 num = simple_strtol((vers+1), &minorp, 0);
807 else 819 else
808 num = simple_strtol(vers, NULL, 0); 820 num = simple_strtol(vers, &minorp, 0);
821 if (*minorp == '.') {
822 if (num < 4)
823 return -EINVAL;
824 minor = simple_strtoul(minorp+1, NULL, 0);
825 if (minor == 0)
826 return -EINVAL;
827 if (nfsd_minorversion(minor, sign == '-' ?
828 NFSD_CLEAR : NFSD_SET) < 0)
829 return -EINVAL;
830 goto next;
831 }
809 switch(num) { 832 switch(num) {
810 case 2: 833 case 2:
811 case 3: 834 case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
815 default: 838 default:
816 return -EINVAL; 839 return -EINVAL;
817 } 840 }
841 next:
818 vers += len + 1; 842 vers += len + 1;
819 tlen += len; 843 tlen += len;
820 } while ((len = qword_get(&mesg, vers, size)) > 0); 844 } while ((len = qword_get(&mesg, vers, size)) > 0);
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
833 num); 857 num);
834 sep = " "; 858 sep = " ";
835 } 859 }
860 if (nfsd_vers(4, NFSD_AVAIL))
861 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
862 len += sprintf(buf+len, " %c4.%u",
863 (nfsd_vers(4, NFSD_TEST) &&
864 nfsd_minorversion(minor, NFSD_TEST)) ?
865 '+' : '-',
866 minor);
836 len += sprintf(buf+len, "\n"); 867 len += sprintf(buf+len, "\n");
837 return len; 868 return len;
838} 869}
@@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1248 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, 1279 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
1249 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1280 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
1250 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1281 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
1282 [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
1251 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1283 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1252 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1284 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1253 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1285 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f26351227..e298e260b5f1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
180{ 180{
181 __be32 nfserr; 181 __be32 nfserr;
182 int stable = 1; 182 int stable = 1;
183 unsigned long cnt = argp->len;
183 184
184 dprintk("nfsd: WRITE %s %d bytes at %d\n", 185 dprintk("nfsd: WRITE %s %d bytes at %d\n",
185 SVCFH_fmt(&argp->fh), 186 SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
188 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, 189 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
189 argp->offset, 190 argp->offset,
190 rqstp->rq_vec, argp->vlen, 191 rqstp->rq_vec, argp->vlen,
191 argp->len, 192 &cnt,
192 &stable); 193 &stable);
193 return nfsd_return_attrs(nfserr, resp); 194 return nfsd_return_attrs(nfserr, resp);
194} 195}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 7c09852be713..cbba4a935786 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/swap.h>
25 26
26#include <linux/sunrpc/types.h> 27#include <linux/sunrpc/types.h>
27#include <linux/sunrpc/stats.h> 28#include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
40extern struct svc_program nfsd_program; 41extern struct svc_program nfsd_program;
41static int nfsd(void *vrqstp); 42static int nfsd(void *vrqstp);
42struct timeval nfssvc_boot; 43struct timeval nfssvc_boot;
43static atomic_t nfsd_busy;
44static unsigned long nfsd_last_call;
45static DEFINE_SPINLOCK(nfsd_call_lock);
46 44
47/* 45/*
48 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members 46 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program nfsd_program = {
123 121
124}; 122};
125 123
124u32 nfsd_supported_minorversion;
125
126int nfsd_vers(int vers, enum vers_op change) 126int nfsd_vers(int vers, enum vers_op change)
127{ 127{
128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
149 } 149 }
150 return 0; 150 return 0;
151} 151}
152
153int nfsd_minorversion(u32 minorversion, enum vers_op change)
154{
155 if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
156 return -1;
157 switch(change) {
158 case NFSD_SET:
159 nfsd_supported_minorversion = minorversion;
160 break;
161 case NFSD_CLEAR:
162 if (minorversion == 0)
163 return -1;
164 nfsd_supported_minorversion = minorversion - 1;
165 break;
166 case NFSD_TEST:
167 return minorversion <= nfsd_supported_minorversion;
168 case NFSD_AVAIL:
169 return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
170 }
171 return 0;
172}
173
152/* 174/*
153 * Maximum number of nfsd processes 175 * Maximum number of nfsd processes
154 */ 176 */
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
200 } 222 }
201} 223}
202 224
225/*
226 * Each session guarantees a negotiated per slot memory cache for replies
227 * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated
228 * NFSv4.1 server might want to use more memory for a DRC than a machine
229 * with mutiple services.
230 *
231 * Impose a hard limit on the number of pages for the DRC which varies
232 * according to the machines free pages. This is of course only a default.
233 *
234 * For now this is a #defined shift which could be under admin control
235 * in the future.
236 */
237static void set_max_drc(void)
238{
239 /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
240 #define NFSD_DRC_SIZE_SHIFT 7
241 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
242 >> NFSD_DRC_SIZE_SHIFT;
243 nfsd_serv->sv_drc_pages_used = 0;
244 dprintk("%s svc_drc_max_pages %u\n", __func__,
245 nfsd_serv->sv_drc_max_pages);
246}
203 247
204int nfsd_create_serv(void) 248int nfsd_create_serv(void)
205{ 249{
@@ -227,11 +271,12 @@ int nfsd_create_serv(void)
227 nfsd_max_blksize /= 2; 271 nfsd_max_blksize /= 2;
228 } 272 }
229 273
230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 274 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 nfsd_last_thread, nfsd, THIS_MODULE); 275 nfsd_last_thread, nfsd, THIS_MODULE);
233 if (nfsd_serv == NULL) 276 if (nfsd_serv == NULL)
234 err = -ENOMEM; 277 err = -ENOMEM;
278 else
279 set_max_drc();
235 280
236 do_gettimeofday(&nfssvc_boot); /* record boot time */ 281 do_gettimeofday(&nfssvc_boot); /* record boot time */
237 return err; 282 return err;
@@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
375 return error; 420 return error;
376} 421}
377 422
378static inline void
379update_thread_usage(int busy_threads)
380{
381 unsigned long prev_call;
382 unsigned long diff;
383 int decile;
384
385 spin_lock(&nfsd_call_lock);
386 prev_call = nfsd_last_call;
387 nfsd_last_call = jiffies;
388 decile = busy_threads*10/nfsdstats.th_cnt;
389 if (decile>0 && decile <= 10) {
390 diff = nfsd_last_call - prev_call;
391 if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
392 nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
393 if (decile == 10)
394 nfsdstats.th_fullcnt++;
395 }
396 spin_unlock(&nfsd_call_lock);
397}
398 423
399/* 424/*
400 * This is the NFS server kernel thread 425 * This is the NFS server kernel thread
@@ -460,8 +485,6 @@ nfsd(void *vrqstp)
460 continue; 485 continue;
461 } 486 }
462 487
463 update_thread_usage(atomic_read(&nfsd_busy));
464 atomic_inc(&nfsd_busy);
465 488
466 /* Lock the export hash tables for reading. */ 489 /* Lock the export hash tables for reading. */
467 exp_readlock(); 490 exp_readlock();
@@ -470,8 +493,6 @@ nfsd(void *vrqstp)
470 493
471 /* Unlock export hash tables */ 494 /* Unlock export hash tables */
472 exp_readunlock(); 495 exp_readunlock();
473 update_thread_usage(atomic_read(&nfsd_busy));
474 atomic_dec(&nfsd_busy);
475 } 496 }
476 497
477 /* Clear signals before calling svc_exit_thread() */ 498 /* Clear signals before calling svc_exit_thread() */
@@ -539,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
539 + rqstp->rq_res.head[0].iov_len; 560 + rqstp->rq_res.head[0].iov_len;
540 rqstp->rq_res.head[0].iov_len += sizeof(__be32); 561 rqstp->rq_res.head[0].iov_len += sizeof(__be32);
541 562
563 /* NFSv4.1 DRC requires statp */
564 if (rqstp->rq_vers == 4)
565 nfsd4_set_statp(rqstp, statp);
566
542 /* Now call the procedure handler, and encode NFS status. */ 567 /* Now call the procedure handler, and encode NFS status. */
543 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 568 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
544 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 569 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -570,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
570 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 595 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
571 return 1; 596 return 1;
572} 597}
598
599int nfsd_pool_stats_open(struct inode *inode, struct file *file)
600{
601 if (nfsd_serv == NULL)
602 return -ENODEV;
603 return svc_pool_stats_open(nfsd_serv, file);
604}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 78376b6c0236..ab93fcfef254 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
366 } 366 }
367 367
368 /* Revoke setuid/setgid on chown */ 368 /* Revoke setuid/setgid on chown */
369 if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || 369 if (!S_ISDIR(inode->i_mode) &&
370 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { 370 (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
371 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
371 iap->ia_valid |= ATTR_KILL_PRIV; 372 iap->ia_valid |= ATTR_KILL_PRIV;
372 if (iap->ia_valid & ATTR_MODE) { 373 if (iap->ia_valid & ATTR_MODE) {
373 /* we're setting mode too, just clear the s*id bits */ 374 /* we're setting mode too, just clear the s*id bits */
@@ -960,7 +961,7 @@ static void kill_suid(struct dentry *dentry)
960static __be32 961static __be32
961nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 962nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
962 loff_t offset, struct kvec *vec, int vlen, 963 loff_t offset, struct kvec *vec, int vlen,
963 unsigned long cnt, int *stablep) 964 unsigned long *cnt, int *stablep)
964{ 965{
965 struct svc_export *exp; 966 struct svc_export *exp;
966 struct dentry *dentry; 967 struct dentry *dentry;
@@ -974,7 +975,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
974 err = nfserr_perm; 975 err = nfserr_perm;
975 976
976 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 977 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
977 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) 978 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
978 goto out; 979 goto out;
979#endif 980#endif
980 981
@@ -1009,7 +1010,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1009 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 1010 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
1010 set_fs(oldfs); 1011 set_fs(oldfs);
1011 if (host_err >= 0) { 1012 if (host_err >= 0) {
1012 nfsdstats.io_write += cnt; 1013 nfsdstats.io_write += host_err;
1013 fsnotify_modify(file->f_path.dentry); 1014 fsnotify_modify(file->f_path.dentry);
1014 } 1015 }
1015 1016
@@ -1054,9 +1055,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1054 } 1055 }
1055 1056
1056 dprintk("nfsd: write complete host_err=%d\n", host_err); 1057 dprintk("nfsd: write complete host_err=%d\n", host_err);
1057 if (host_err >= 0) 1058 if (host_err >= 0) {
1058 err = 0; 1059 err = 0;
1059 else 1060 *cnt = host_err;
1061 } else
1060 err = nfserrno(host_err); 1062 err = nfserrno(host_err);
1061out: 1063out:
1062 return err; 1064 return err;
@@ -1098,7 +1100,7 @@ out:
1098 */ 1100 */
1099__be32 1101__be32
1100nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1102nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1101 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1103 loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
1102 int *stablep) 1104 int *stablep)
1103{ 1105{
1104 __be32 err = 0; 1106 __be32 err = 0;
@@ -1179,6 +1181,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1179 return 0; 1181 return 0;
1180} 1182}
1181 1183
1184/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
1185 * setting size to 0 may fail for some specific file systems by the permission
1186 * checking which requires WRITE permission but the mode is 000.
1187 * we ignore the resizing(to 0) on the just new created file, since the size is
1188 * 0 after file created.
1189 *
1190 * call this only after vfs_create() is called.
1191 * */
1192static void
1193nfsd_check_ignore_resizing(struct iattr *iap)
1194{
1195 if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
1196 iap->ia_valid &= ~ATTR_SIZE;
1197}
1198
1182/* 1199/*
1183 * Create a file (regular, directory, device, fifo); UNIX sockets 1200 * Create a file (regular, directory, device, fifo); UNIX sockets
1184 * not yet implemented. 1201 * not yet implemented.
@@ -1274,6 +1291,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1274 switch (type) { 1291 switch (type) {
1275 case S_IFREG: 1292 case S_IFREG:
1276 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1293 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1294 if (!host_err)
1295 nfsd_check_ignore_resizing(iap);
1277 break; 1296 break;
1278 case S_IFDIR: 1297 case S_IFDIR:
1279 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1298 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1427,6 +1446,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1427 /* setattr will sync the child (or not) */ 1446 /* setattr will sync the child (or not) */
1428 } 1447 }
1429 1448
1449 nfsd_check_ignore_resizing(iap);
1450
1430 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1451 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1431 /* Cram the verifier into atime/mtime */ 1452 /* Cram the verifier into atime/mtime */
1432 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1453 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9faf..ce2d6bcc6266 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
1config ROMFS_FS 1config ROMFS_FS
2 tristate "ROM file system support" 2 tristate "ROM file system support"
3 depends on BLOCK 3 depends on BLOCK || MTD
4 ---help--- 4 ---help---
5 This is a very small read-only file system mainly intended for 5 This is a very small read-only file system mainly intended for
6 initial ram disks of installation disks, but it could be used for 6 initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
14 14
15 If you don't know whether you need it, then you don't need it: 15 If you don't know whether you need it, then you don't need it:
16 answer N. 16 answer N.
17
18#
19# Select the backing stores to be supported
20#
21choice
22 prompt "RomFS backing stores"
23 depends on ROMFS_FS
24 default ROMFS_BACKED_BY_BLOCK
25 help
26 Select the backing stores to be supported.
27
28config ROMFS_BACKED_BY_BLOCK
29 bool "Block device-backed ROM file system support"
30 depends on BLOCK
31 help
32 This permits ROMFS to use block devices buffered through the page
33 cache as the medium from which to retrieve data. It does not allow
34 direct mapping of the medium.
35
36 If unsure, answer Y.
37
38config ROMFS_BACKED_BY_MTD
39 bool "MTD-backed ROM file system support"
40 depends on MTD=y || (ROMFS_FS=m && MTD)
41 help
42 This permits ROMFS to use MTD based devices directly, without the
43 intercession of the block layer (which may have been disabled). It
44 also allows direct mapping of MTD devices through romfs files under
45 NOMMU conditions if the underlying device is directly addressable by
46 the CPU.
47
48 If unsure, answer Y.
49
50config ROMFS_BACKED_BY_BOTH
51 bool "Both the above"
52 depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
53endchoice
54
55
56config ROMFS_ON_BLOCK
57 bool
58 default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
59
60config ROMFS_ON_MTD
61 bool
62 default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a3..420beb7d495c 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
1# 1#
2# Makefile for the linux romfs filesystem routines. 2# Makefile for the linux RomFS filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_ROMFS_FS) += romfs.o 5obj-$(CONFIG_ROMFS_FS) += romfs.o
6 6
7romfs-objs := inode.o 7romfs-y := storage.o super.o
8
9ifneq ($(CONFIG_MMU),y)
10romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
11endif
12
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196b..000000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
1/*
2 * ROMFS file system, Linux implementation
3 *
4 * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
5 *
6 * Using parts of the minix filesystem
7 * Copyright (C) 1991, 1992 Linus Torvalds
8 *
9 * and parts of the affs filesystem additionally
10 * Copyright (C) 1993 Ray Burr
11 * Copyright (C) 1996 Hans-Joachim Widmaier
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes
19 * Changed for 2.1.19 modules
20 * Jan 1997 Initial release
21 * Jun 1997 2.1.43+ changes
22 * Proper page locking in readpage
23 * Changed to work with 2.1.45+ fs
24 * Jul 1997 Fixed follow_link
25 * 2.1.47
26 * lookup shouldn't return -ENOENT
27 * from Horst von Brand:
28 * fail on wrong checksum
29 * double unlock_super was possible
30 * correct namelen for statfs
31 * spotted by Bill Hawes:
32 * readlink shouldn't iput()
33 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
34 * exposed a problem in readdir
35 * 2.1.107 code-freeze spellchecker run
36 * Aug 1998 2.1.118+ VFS changes
37 * Sep 1998 2.1.122 another VFS change (follow_link)
38 * Apr 1999 2.2.7 no more EBADF checking in
39 * lookup/readdir, use ERR_PTR
40 * Jun 1999 2.3.6 d_alloc_root use changed
41 * 2.3.9 clean up usage of ENOENT/negative
42 * dentries in lookup
43 * clean up page flags setting
44 * (error, uptodate, locking) in
45 * in readpage
46 * use init_special_inode for
47 * fifos/sockets (and streamline) in
48 * read_inode, fix _ops table order
49 * Aug 1999 2.3.16 __initfunc() => __init change
50 * Oct 1999 2.3.24 page->owner hack obsoleted
51 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
52 */
53
54/* todo:
55 * - see Documentation/filesystems/romfs.txt
56 * - use allocated, not stack memory for file names?
57 * - considering write access...
58 * - network (tftp) files?
59 * - merge back some _op tables
60 */
61
62/*
63 * Sorry about some optimizations and for some goto's. I just wanted
64 * to squeeze some more bytes out of this code.. :)
65 */
66
67#include <linux/module.h>
68#include <linux/types.h>
69#include <linux/errno.h>
70#include <linux/slab.h>
71#include <linux/romfs_fs.h>
72#include <linux/fs.h>
73#include <linux/init.h>
74#include <linux/pagemap.h>
75#include <linux/smp_lock.h>
76#include <linux/buffer_head.h>
77#include <linux/vfs.h>
78
79#include <asm/uaccess.h>
80
81struct romfs_inode_info {
82 unsigned long i_metasize; /* size of non-data area */
83 unsigned long i_dataoffset; /* from the start of fs */
84 struct inode vfs_inode;
85};
86
87static struct inode *romfs_iget(struct super_block *, unsigned long);
88
89/* instead of private superblock data */
90static inline unsigned long romfs_maxsize(struct super_block *sb)
91{
92 return (unsigned long)sb->s_fs_info;
93}
94
95static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
96{
97 return container_of(inode, struct romfs_inode_info, vfs_inode);
98}
99
100static __u32
101romfs_checksum(void *data, int size)
102{
103 __u32 sum;
104 __be32 *ptr;
105
106 sum = 0; ptr = data;
107 size>>=2;
108 while (size>0) {
109 sum += be32_to_cpu(*ptr++);
110 size--;
111 }
112 return sum;
113}
114
115static const struct super_operations romfs_ops;
116
117static int romfs_fill_super(struct super_block *s, void *data, int silent)
118{
119 struct buffer_head *bh;
120 struct romfs_super_block *rsb;
121 struct inode *root;
122 int sz, ret = -EINVAL;
123
124 /* I would parse the options here, but there are none.. :) */
125
126 sb_set_blocksize(s, ROMBSIZE);
127 s->s_maxbytes = 0xFFFFFFFF;
128
129 bh = sb_bread(s, 0);
130 if (!bh) {
131 /* XXX merge with other printk? */
132 printk ("romfs: unable to read superblock\n");
133 goto outnobh;
134 }
135
136 rsb = (struct romfs_super_block *)bh->b_data;
137 sz = be32_to_cpu(rsb->size);
138 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
139 || sz < ROMFH_SIZE) {
140 if (!silent)
141 printk ("VFS: Can't find a romfs filesystem on dev "
142 "%s.\n", s->s_id);
143 goto out;
144 }
145 if (romfs_checksum(rsb, min_t(int, sz, 512))) {
146 printk ("romfs: bad initial checksum on dev "
147 "%s.\n", s->s_id);
148 goto out;
149 }
150
151 s->s_magic = ROMFS_MAGIC;
152 s->s_fs_info = (void *)(long)sz;
153
154 s->s_flags |= MS_RDONLY;
155
156 /* Find the start of the fs */
157 sz = (ROMFH_SIZE +
158 strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
159 & ROMFH_MASK;
160
161 s->s_op = &romfs_ops;
162 root = romfs_iget(s, sz);
163 if (IS_ERR(root)) {
164 ret = PTR_ERR(root);
165 goto out;
166 }
167
168 ret = -ENOMEM;
169 s->s_root = d_alloc_root(root);
170 if (!s->s_root)
171 goto outiput;
172
173 brelse(bh);
174 return 0;
175
176outiput:
177 iput(root);
178out:
179 brelse(bh);
180outnobh:
181 return ret;
182}
183
184/* That's simple too. */
185
186static int
187romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
188{
189 buf->f_type = ROMFS_MAGIC;
190 buf->f_bsize = ROMBSIZE;
191 buf->f_bfree = buf->f_bavail = buf->f_ffree;
192 buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
193 buf->f_namelen = ROMFS_MAXFN;
194 return 0;
195}
196
197/* some helper routines */
198
199static int
200romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
201{
202 struct buffer_head *bh;
203 unsigned long avail, maxsize, res;
204
205 maxsize = romfs_maxsize(i->i_sb);
206 if (offset >= maxsize)
207 return -1;
208
209 /* strnlen is almost always valid */
210 if (count > maxsize || offset+count > maxsize)
211 count = maxsize-offset;
212
213 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
214 if (!bh)
215 return -1; /* error */
216
217 avail = ROMBSIZE - (offset & ROMBMASK);
218 maxsize = min_t(unsigned long, count, avail);
219 res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
220 brelse(bh);
221
222 if (res < maxsize)
223 return res; /* found all of it */
224
225 while (res < count) {
226 offset += maxsize;
227
228 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
229 if (!bh)
230 return -1;
231 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
232 avail = strnlen(bh->b_data, maxsize);
233 res += avail;
234 brelse(bh);
235 if (avail < maxsize)
236 return res;
237 }
238 return res;
239}
240
241static int
242romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
243{
244 struct buffer_head *bh;
245 unsigned long avail, maxsize, res;
246
247 maxsize = romfs_maxsize(i->i_sb);
248 if (offset >= maxsize || count > maxsize || offset+count>maxsize)
249 return -1;
250
251 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
252 if (!bh)
253 return -1; /* error */
254
255 avail = ROMBSIZE - (offset & ROMBMASK);
256 maxsize = min_t(unsigned long, count, avail);
257 memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
258 brelse(bh);
259
260 res = maxsize; /* all of it */
261
262 while (res < count) {
263 offset += maxsize;
264 dest += maxsize;
265
266 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
267 if (!bh)
268 return -1;
269 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
270 memcpy(dest, bh->b_data, maxsize);
271 brelse(bh);
272 res += maxsize;
273 }
274 return res;
275}
276
277static unsigned char romfs_dtype_table[] = {
278 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
279};
280
281static int
282romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283{
284 struct inode *i = filp->f_path.dentry->d_inode;
285 struct romfs_inode ri;
286 unsigned long offset, maxoff;
287 int j, ino, nextfh;
288 int stored = 0;
289 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
290
291 lock_kernel();
292
293 maxoff = romfs_maxsize(i->i_sb);
294
295 offset = filp->f_pos;
296 if (!offset) {
297 offset = i->i_ino & ROMFH_MASK;
298 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
299 goto out;
300 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
301 }
302
303 /* Not really failsafe, but we are read-only... */
304 for(;;) {
305 if (!offset || offset >= maxoff) {
306 offset = maxoff;
307 filp->f_pos = offset;
308 goto out;
309 }
310 filp->f_pos = offset;
311
312 /* Fetch inode info */
313 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
314 goto out;
315
316 j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
317 if (j < 0)
318 goto out;
319
320 fsname[j]=0;
321 romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
322
323 ino = offset;
324 nextfh = be32_to_cpu(ri.next);
325 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
326 ino = be32_to_cpu(ri.spec);
327 if (filldir(dirent, fsname, j, offset, ino,
328 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
329 goto out;
330 }
331 stored++;
332 offset = nextfh & ROMFH_MASK;
333 }
334out:
335 unlock_kernel();
336 return stored;
337}
338
339static struct dentry *
340romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
341{
342 unsigned long offset, maxoff;
343 long res;
344 int fslen;
345 struct inode *inode = NULL;
346 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
347 struct romfs_inode ri;
348 const char *name; /* got from dentry */
349 int len;
350
351 res = -EACCES; /* placeholder for "no data here" */
352 offset = dir->i_ino & ROMFH_MASK;
353 lock_kernel();
354 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
355 goto error;
356
357 maxoff = romfs_maxsize(dir->i_sb);
358 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
359
360 /* OK, now find the file whose name is in "dentry" in the
361 * directory specified by "dir". */
362
363 name = dentry->d_name.name;
364 len = dentry->d_name.len;
365
366 for(;;) {
367 if (!offset || offset >= maxoff)
368 goto success; /* negative success */
369 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
370 goto error;
371
372 /* try to match the first 16 bytes of name */
373 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
374 if (len < ROMFH_SIZE) {
375 if (len == fslen) {
376 /* both are shorter, and same size */
377 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
378 if (strncmp (name, fsname, len) == 0)
379 break;
380 }
381 } else if (fslen >= ROMFH_SIZE) {
382 /* both are longer; XXX optimize max size */
383 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
384 if (len == fslen) {
385 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
386 if (strncmp(name, fsname, len) == 0)
387 break;
388 }
389 }
390 /* next entry */
391 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
392 }
393
394 /* Hard link handling */
395 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
396 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
397
398 inode = romfs_iget(dir->i_sb, offset);
399 if (IS_ERR(inode)) {
400 res = PTR_ERR(inode);
401 goto error;
402 }
403
404success:
405 d_add(dentry, inode);
406 res = 0;
407error:
408 unlock_kernel();
409 return ERR_PTR(res);
410}
411
412/*
413 * Ok, we do readpage, to be able to execute programs. Unfortunately,
414 * we can't use bmap, since we may have looser alignments.
415 */
416
417static int
418romfs_readpage(struct file *file, struct page * page)
419{
420 struct inode *inode = page->mapping->host;
421 loff_t offset, size;
422 unsigned long filled;
423 void *buf;
424 int result = -EIO;
425
426 page_cache_get(page);
427 lock_kernel();
428 buf = kmap(page);
429 if (!buf)
430 goto err_out;
431
432 /* 32 bit warning -- but not for us :) */
433 offset = page_offset(page);
434 size = i_size_read(inode);
435 filled = 0;
436 result = 0;
437 if (offset < size) {
438 unsigned long readlen;
439
440 size -= offset;
441 readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
442
443 filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
444
445 if (filled != readlen) {
446 SetPageError(page);
447 filled = 0;
448 result = -EIO;
449 }
450 }
451
452 if (filled < PAGE_SIZE)
453 memset(buf + filled, 0, PAGE_SIZE-filled);
454
455 if (!result)
456 SetPageUptodate(page);
457 flush_dcache_page(page);
458
459 unlock_page(page);
460
461 kunmap(page);
462err_out:
463 page_cache_release(page);
464 unlock_kernel();
465
466 return result;
467}
468
469/* Mapping from our types to the kernel */
470
471static const struct address_space_operations romfs_aops = {
472 .readpage = romfs_readpage
473};
474
475static const struct file_operations romfs_dir_operations = {
476 .read = generic_read_dir,
477 .readdir = romfs_readdir,
478};
479
480static const struct inode_operations romfs_dir_inode_operations = {
481 .lookup = romfs_lookup,
482};
483
484static mode_t romfs_modemap[] =
485{
486 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
487 S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
488};
489
490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino)
492{
493 int nextfh, ret;
494 struct romfs_inode ri;
495 struct inode *i;
496
497 ino &= ROMFH_MASK;
498 i = iget_locked(sb, ino);
499 if (!i)
500 return ERR_PTR(-ENOMEM);
501 if (!(i->i_state & I_NEW))
502 return i;
503
504 i->i_mode = 0;
505
506 /* Loop for finding the real hard link */
507 for(;;) {
508 if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
509 printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
510 ino);
511 iget_failed(i);
512 return ERR_PTR(-EIO);
513 }
514 /* XXX: do romfs_checksum here too (with name) */
515
516 nextfh = be32_to_cpu(ri.next);
517 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
518 break;
519
520 ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
521 }
522
523 i->i_nlink = 1; /* Hard to decide.. */
524 i->i_size = be32_to_cpu(ri.size);
525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527
528 /* Precalculate the data offset */
529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
530 if (ret >= 0)
531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
532 else
533 ino = 0;
534
535 ROMFS_I(i)->i_metasize = ino;
536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
537
538 /* Compute permissions */
539 ino = romfs_modemap[nextfh & ROMFH_TYPE];
540 /* only "normal" files have ops */
541 switch (nextfh & ROMFH_TYPE) {
542 case 1:
543 i->i_size = ROMFS_I(i)->i_metasize;
544 i->i_op = &romfs_dir_inode_operations;
545 i->i_fop = &romfs_dir_operations;
546 if (nextfh & ROMFH_EXEC)
547 ino |= S_IXUGO;
548 i->i_mode = ino;
549 break;
550 case 2:
551 i->i_fop = &generic_ro_fops;
552 i->i_data.a_ops = &romfs_aops;
553 if (nextfh & ROMFH_EXEC)
554 ino |= S_IXUGO;
555 i->i_mode = ino;
556 break;
557 case 3:
558 i->i_op = &page_symlink_inode_operations;
559 i->i_data.a_ops = &romfs_aops;
560 i->i_mode = ino | S_IRWXUGO;
561 break;
562 default:
563 /* depending on MBZ for sock/fifos */
564 nextfh = be32_to_cpu(ri.spec);
565 init_special_inode(i, ino,
566 MKDEV(nextfh>>16,nextfh&0xffff));
567 }
568 unlock_new_inode(i);
569 return i;
570}
571
572static struct kmem_cache * romfs_inode_cachep;
573
574static struct inode *romfs_alloc_inode(struct super_block *sb)
575{
576 struct romfs_inode_info *ei;
577 ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
578 if (!ei)
579 return NULL;
580 return &ei->vfs_inode;
581}
582
583static void romfs_destroy_inode(struct inode *inode)
584{
585 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
586}
587
588static void init_once(void *foo)
589{
590 struct romfs_inode_info *ei = foo;
591
592 inode_init_once(&ei->vfs_inode);
593}
594
595static int init_inodecache(void)
596{
597 romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
598 sizeof(struct romfs_inode_info),
599 0, (SLAB_RECLAIM_ACCOUNT|
600 SLAB_MEM_SPREAD),
601 init_once);
602 if (romfs_inode_cachep == NULL)
603 return -ENOMEM;
604 return 0;
605}
606
607static void destroy_inodecache(void)
608{
609 kmem_cache_destroy(romfs_inode_cachep);
610}
611
612static int romfs_remount(struct super_block *sb, int *flags, char *data)
613{
614 *flags |= MS_RDONLY;
615 return 0;
616}
617
618static const struct super_operations romfs_ops = {
619 .alloc_inode = romfs_alloc_inode,
620 .destroy_inode = romfs_destroy_inode,
621 .statfs = romfs_statfs,
622 .remount_fs = romfs_remount,
623};
624
625static int romfs_get_sb(struct file_system_type *fs_type,
626 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
627{
628 return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
629 mnt);
630}
631
632static struct file_system_type romfs_fs_type = {
633 .owner = THIS_MODULE,
634 .name = "romfs",
635 .get_sb = romfs_get_sb,
636 .kill_sb = kill_block_super,
637 .fs_flags = FS_REQUIRES_DEV,
638};
639
640static int __init init_romfs_fs(void)
641{
642 int err = init_inodecache();
643 if (err)
644 goto out1;
645 err = register_filesystem(&romfs_fs_type);
646 if (err)
647 goto out;
648 return 0;
649out:
650 destroy_inodecache();
651out1:
652 return err;
653}
654
655static void __exit exit_romfs_fs(void)
656{
657 unregister_filesystem(&romfs_fs_type);
658 destroy_inodecache();
659}
660
661/* Yes, works even as a module... :) */
662
663module_init(init_romfs_fs)
664module_exit(exit_romfs_fs)
665MODULE_LICENSE("GPL");
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 000000000000..06044a9dc62d
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
1/* RomFS internal definitions
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/romfs_fs.h>
13
14struct romfs_inode_info {
15 struct inode vfs_inode;
16 unsigned long i_metasize; /* size of non-data area */
17 unsigned long i_dataoffset; /* from the start of fs */
18};
19
20static inline size_t romfs_maxsize(struct super_block *sb)
21{
22 return (size_t) (unsigned long) sb->s_fs_info;
23}
24
25static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
26{
27 return container_of(inode, struct romfs_inode_info, vfs_inode);
28}
29
30/*
31 * mmap-nommu.c
32 */
33#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
34extern const struct file_operations romfs_ro_fops;
35#else
36#define romfs_ro_fops generic_ro_fops
37#endif
38
39/*
40 * storage.c
41 */
42extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
43 void *buf, size_t buflen);
44extern ssize_t romfs_dev_strnlen(struct super_block *sb,
45 unsigned long pos, size_t maxlen);
46extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
47 const char *str, size_t size);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000000..f0511e816967
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
1/* NOMMU mmap support for RomFS on MTD devices
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/mtd/super.h>
14#include "internal.h"
15
16/*
17 * try to determine where a shared mapping can be made
18 * - only supported for NOMMU at the moment (MMU can't doesn't copy private
19 * mappings)
20 * - attempts to map through to the underlying MTD device
21 */
22static unsigned long romfs_get_unmapped_area(struct file *file,
23 unsigned long addr,
24 unsigned long len,
25 unsigned long pgoff,
26 unsigned long flags)
27{
28 struct inode *inode = file->f_mapping->host;
29 struct mtd_info *mtd = inode->i_sb->s_mtd;
30 unsigned long isize, offset;
31
32 if (!mtd)
33 goto cant_map_directly;
34
35 isize = i_size_read(inode);
36 offset = pgoff << PAGE_SHIFT;
37 if (offset > isize || len > isize || offset > isize - len)
38 return (unsigned long) -EINVAL;
39
40 /* we need to call down to the MTD layer to do the actual mapping */
41 if (mtd->get_unmapped_area) {
42 if (addr != 0)
43 return (unsigned long) -EINVAL;
44
45 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
46 return (unsigned long) -EINVAL;
47
48 offset += ROMFS_I(inode)->i_dataoffset;
49 if (offset > mtd->size - len)
50 return (unsigned long) -EINVAL;
51
52 return mtd->get_unmapped_area(mtd, len, offset, flags);
53 }
54
55cant_map_directly:
56 return (unsigned long) -ENOSYS;
57}
58
59/*
60 * permit a R/O mapping to be made directly through onto an MTD device if
61 * possible
62 */
63static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
64{
65 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
66}
67
68const struct file_operations romfs_ro_fops = {
69 .llseek = generic_file_llseek,
70 .read = do_sync_read,
71 .aio_read = generic_file_aio_read,
72 .splice_read = generic_file_splice_read,
73 .mmap = romfs_mmap,
74 .get_unmapped_area = romfs_get_unmapped_area,
75};
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 000000000000..7e3e1e12a081
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,261 @@
1/* RomFS storage access routines
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/mtd/super.h>
14#include <linux/buffer_head.h>
15#include "internal.h"
16
17#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
18#error no ROMFS backing store interface configured
19#endif
20
21#ifdef CONFIG_ROMFS_ON_MTD
22#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
23
24/*
25 * read data from an romfs image on an MTD device
26 */
27static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
28 void *buf, size_t buflen)
29{
30 size_t rlen;
31 int ret;
32
33 ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
34 return (ret < 0 || rlen != buflen) ? -EIO : 0;
35}
36
37/*
38 * determine the length of a string in a romfs image on an MTD device
39 */
40static ssize_t romfs_mtd_strnlen(struct super_block *sb,
41 unsigned long pos, size_t maxlen)
42{
43 ssize_t n = 0;
44 size_t segment;
45 u_char buf[16], *p;
46 size_t len;
47 int ret;
48
49 /* scan the string up to 16 bytes at a time */
50 while (maxlen > 0) {
51 segment = min_t(size_t, maxlen, 16);
52 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
53 if (ret < 0)
54 return ret;
55 p = memchr(buf, 0, len);
56 if (p)
57 return n + (p - buf);
58 maxlen -= len;
59 pos += len;
60 n += len;
61 }
62
63 return n;
64}
65
66/*
67 * compare a string to one in a romfs image on MTD
68 * - return 1 if matched, 0 if differ, -ve if error
69 */
70static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos,
71 const char *str, size_t size)
72{
73 u_char buf[16];
74 size_t len, segment;
75 int ret;
76
77 /* scan the string up to 16 bytes at a time */
78 while (size > 0) {
79 segment = min_t(size_t, size, 16);
80 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
81 if (ret < 0)
82 return ret;
83 if (memcmp(buf, str, len) != 0)
84 return 0;
85 size -= len;
86 pos += len;
87 str += len;
88 }
89
90 return 1;
91}
92#endif /* CONFIG_ROMFS_ON_MTD */
93
94#ifdef CONFIG_ROMFS_ON_BLOCK
95/*
96 * read data from an romfs image on a block device
97 */
98static int romfs_blk_read(struct super_block *sb, unsigned long pos,
99 void *buf, size_t buflen)
100{
101 struct buffer_head *bh;
102 unsigned long offset;
103 size_t segment;
104
105 /* copy the string up to blocksize bytes at a time */
106 while (buflen > 0) {
107 offset = pos & (ROMBSIZE - 1);
108 segment = min_t(size_t, buflen, ROMBSIZE - offset);
109 bh = sb_bread(sb, pos >> ROMBSBITS);
110 if (!bh)
111 return -EIO;
112 memcpy(buf, bh->b_data + offset, segment);
113 brelse(bh);
114 buflen -= segment;
115 pos += segment;
116 }
117
118 return 0;
119}
120
121/*
122 * determine the length of a string in romfs on a block device
123 */
124static ssize_t romfs_blk_strnlen(struct super_block *sb,
125 unsigned long pos, size_t limit)
126{
127 struct buffer_head *bh;
128 unsigned long offset;
129 ssize_t n = 0;
130 size_t segment;
131 u_char *buf, *p;
132
133 /* scan the string up to blocksize bytes at a time */
134 while (limit > 0) {
135 offset = pos & (ROMBSIZE - 1);
136 segment = min_t(size_t, limit, ROMBSIZE - offset);
137 bh = sb_bread(sb, pos >> ROMBSBITS);
138 if (!bh)
139 return -EIO;
140 buf = bh->b_data + offset;
141 p = memchr(buf, 0, segment);
142 brelse(bh);
143 if (p)
144 return n + (p - buf);
145 limit -= segment;
146 pos += segment;
147 n += segment;
148 }
149
150 return n;
151}
152
153/*
154 * compare a string to one in a romfs image on a block device
155 * - return 1 if matched, 0 if differ, -ve if error
156 */
157static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos,
158 const char *str, size_t size)
159{
160 struct buffer_head *bh;
161 unsigned long offset;
162 size_t segment;
163 bool x;
164
165 /* scan the string up to 16 bytes at a time */
166 while (size > 0) {
167 offset = pos & (ROMBSIZE - 1);
168 segment = min_t(size_t, size, ROMBSIZE - offset);
169 bh = sb_bread(sb, pos >> ROMBSBITS);
170 if (!bh)
171 return -EIO;
172 x = (memcmp(bh->b_data + offset, str, segment) != 0);
173 brelse(bh);
174 if (x)
175 return 0;
176 size -= segment;
177 pos += segment;
178 str += segment;
179 }
180
181 return 1;
182}
183#endif /* CONFIG_ROMFS_ON_BLOCK */
184
185/*
186 * read data from the romfs image
187 */
188int romfs_dev_read(struct super_block *sb, unsigned long pos,
189 void *buf, size_t buflen)
190{
191 size_t limit;
192
193 limit = romfs_maxsize(sb);
194 if (pos >= limit)
195 return -EIO;
196 if (buflen > limit - pos)
197 buflen = limit - pos;
198
199#ifdef CONFIG_ROMFS_ON_MTD
200 if (sb->s_mtd)
201 return romfs_mtd_read(sb, pos, buf, buflen);
202#endif
203#ifdef CONFIG_ROMFS_ON_BLOCK
204 if (sb->s_bdev)
205 return romfs_blk_read(sb, pos, buf, buflen);
206#endif
207 return -EIO;
208}
209
210/*
211 * determine the length of a string in romfs
212 */
213ssize_t romfs_dev_strnlen(struct super_block *sb,
214 unsigned long pos, size_t maxlen)
215{
216 size_t limit;
217
218 limit = romfs_maxsize(sb);
219 if (pos >= limit)
220 return -EIO;
221 if (maxlen > limit - pos)
222 maxlen = limit - pos;
223
224#ifdef CONFIG_ROMFS_ON_MTD
225 if (sb->s_mtd)
226 return romfs_mtd_strnlen(sb, pos, limit);
227#endif
228#ifdef CONFIG_ROMFS_ON_BLOCK
229 if (sb->s_bdev)
230 return romfs_blk_strnlen(sb, pos, limit);
231#endif
232 return -EIO;
233}
234
235/*
236 * compare a string to one in romfs
237 * - return 1 if matched, 0 if differ, -ve if error
238 */
239int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
240 const char *str, size_t size)
241{
242 size_t limit;
243
244 limit = romfs_maxsize(sb);
245 if (pos >= limit)
246 return -EIO;
247 if (size > ROMFS_MAXFN)
248 return -ENAMETOOLONG;
249 if (size > limit - pos)
250 return -EIO;
251
252#ifdef CONFIG_ROMFS_ON_MTD
253 if (sb->s_mtd)
254 return romfs_mtd_strncmp(sb, pos, str, size);
255#endif
256#ifdef CONFIG_ROMFS_ON_BLOCK
257 if (sb->s_bdev)
258 return romfs_blk_strncmp(sb, pos, str, size);
259#endif
260 return -EIO;
261}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 000000000000..1e548a4975ba
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,648 @@
1/* Block- or MTD-based romfs
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * Derived from: ROMFS file system, Linux implementation
7 *
8 * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
9 *
10 * Using parts of the minix filesystem
11 * Copyright © 1991, 1992 Linus Torvalds
12 *
13 * and parts of the affs filesystem additionally
14 * Copyright © 1993 Ray Burr
15 * Copyright © 1996 Hans-Joachim Widmaier
16 *
17 * Changes
18 * Changed for 2.1.19 modules
19 * Jan 1997 Initial release
20 * Jun 1997 2.1.43+ changes
21 * Proper page locking in readpage
22 * Changed to work with 2.1.45+ fs
23 * Jul 1997 Fixed follow_link
24 * 2.1.47
25 * lookup shouldn't return -ENOENT
26 * from Horst von Brand:
27 * fail on wrong checksum
28 * double unlock_super was possible
29 * correct namelen for statfs
30 * spotted by Bill Hawes:
31 * readlink shouldn't iput()
32 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
33 * exposed a problem in readdir
34 * 2.1.107 code-freeze spellchecker run
35 * Aug 1998 2.1.118+ VFS changes
36 * Sep 1998 2.1.122 another VFS change (follow_link)
37 * Apr 1999 2.2.7 no more EBADF checking in
38 * lookup/readdir, use ERR_PTR
39 * Jun 1999 2.3.6 d_alloc_root use changed
40 * 2.3.9 clean up usage of ENOENT/negative
41 * dentries in lookup
42 * clean up page flags setting
43 * (error, uptodate, locking) in
44 * in readpage
45 * use init_special_inode for
46 * fifos/sockets (and streamline) in
47 * read_inode, fix _ops table order
48 * Aug 1999 2.3.16 __initfunc() => __init change
49 * Oct 1999 2.3.24 page->owner hack obsoleted
50 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
51 *
52 *
53 * This program is free software; you can redistribute it and/or
54 * modify it under the terms of the GNU General Public Licence
55 * as published by the Free Software Foundation; either version
56 * 2 of the Licence, or (at your option) any later version.
57 */
58
59#include <linux/module.h>
60#include <linux/string.h>
61#include <linux/fs.h>
62#include <linux/time.h>
63#include <linux/slab.h>
64#include <linux/init.h>
65#include <linux/blkdev.h>
66#include <linux/parser.h>
67#include <linux/mount.h>
68#include <linux/namei.h>
69#include <linux/statfs.h>
70#include <linux/mtd/super.h>
71#include <linux/ctype.h>
72#include <linux/highmem.h>
73#include <linux/pagemap.h>
74#include <linux/uaccess.h>
75#include "internal.h"
76
/* slab cache for romfs's in-memory inode objects */
static struct kmem_cache *romfs_inode_cachep;

/* map the on-disk ROMFH_TYPE field (3 bits) to a base VFS file mode */
static const umode_t romfs_modemap[8] = {
	0,			/* hard link */
	S_IFDIR  | 0644,	/* directory */
	S_IFREG  | 0644,	/* regular file */
	S_IFLNK  | 0777,	/* symlink */
	S_IFBLK  | 0600,	/* blockdev */
	S_IFCHR  | 0600,	/* chardev */
	S_IFSOCK | 0644,	/* socket */
	S_IFIFO  | 0644		/* FIFO */
};

/* map the ROMFH_TYPE field to the d_type value reported by readdir */
static const unsigned char romfs_dtype_table[] = {
	DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
};

static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
95
96/*
97 * read a page worth of data from the image
98 */
99static int romfs_readpage(struct file *file, struct page *page)
100{
101 struct inode *inode = page->mapping->host;
102 loff_t offset, size;
103 unsigned long fillsize, pos;
104 void *buf;
105 int ret;
106
107 buf = kmap(page);
108 if (!buf)
109 return -ENOMEM;
110
111 /* 32 bit warning -- but not for us :) */
112 offset = page_offset(page);
113 size = i_size_read(inode);
114 fillsize = 0;
115 ret = 0;
116 if (offset < size) {
117 size -= offset;
118 fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
119
120 pos = ROMFS_I(inode)->i_dataoffset + offset;
121
122 ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
123 if (ret < 0) {
124 SetPageError(page);
125 fillsize = 0;
126 ret = -EIO;
127 }
128 }
129
130 if (fillsize < PAGE_SIZE)
131 memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
132 if (ret == 0)
133 SetPageUptodate(page);
134
135 flush_dcache_page(page);
136 kunmap(page);
137 unlock_page(page);
138 return ret;
139}
140
/* address-space operations: romfs is read-only, so readpage suffices */
static const struct address_space_operations romfs_aops = {
	.readpage	= romfs_readpage
};
144
145/*
146 * read the entries from a directory
147 */
148static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
149{
150 struct inode *i = filp->f_dentry->d_inode;
151 struct romfs_inode ri;
152 unsigned long offset, maxoff;
153 int j, ino, nextfh;
154 int stored = 0;
155 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
156 int ret;
157
158 maxoff = romfs_maxsize(i->i_sb);
159
160 offset = filp->f_pos;
161 if (!offset) {
162 offset = i->i_ino & ROMFH_MASK;
163 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
164 if (ret < 0)
165 goto out;
166 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
167 }
168
169 /* Not really failsafe, but we are read-only... */
170 for (;;) {
171 if (!offset || offset >= maxoff) {
172 offset = maxoff;
173 filp->f_pos = offset;
174 goto out;
175 }
176 filp->f_pos = offset;
177
178 /* Fetch inode info */
179 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
180 if (ret < 0)
181 goto out;
182
183 j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
184 sizeof(fsname) - 1);
185 if (j < 0)
186 goto out;
187
188 ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
189 if (ret < 0)
190 goto out;
191 fsname[j] = '\0';
192
193 ino = offset;
194 nextfh = be32_to_cpu(ri.next);
195 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
196 ino = be32_to_cpu(ri.spec);
197 if (filldir(dirent, fsname, j, offset, ino,
198 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
199 goto out;
200
201 stored++;
202 offset = nextfh & ROMFH_MASK;
203 }
204
205out:
206 return stored;
207}
208
209/*
210 * look up an entry in a directory
211 */
212static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
213 struct nameidata *nd)
214{
215 unsigned long offset, maxoff;
216 struct inode *inode;
217 struct romfs_inode ri;
218 const char *name; /* got from dentry */
219 int len, ret;
220
221 offset = dir->i_ino & ROMFH_MASK;
222 ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
223 if (ret < 0)
224 goto error;
225
226 /* search all the file entries in the list starting from the one
227 * pointed to by the directory's special data */
228 maxoff = romfs_maxsize(dir->i_sb);
229 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
230
231 name = dentry->d_name.name;
232 len = dentry->d_name.len;
233
234 for (;;) {
235 if (!offset || offset >= maxoff)
236 goto out0;
237
238 ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
239 if (ret < 0)
240 goto error;
241
242 /* try to match the first 16 bytes of name */
243 ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name,
244 len);
245 if (ret < 0)
246 goto error;
247 if (ret == 1)
248 break;
249
250 /* next entry */
251 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
252 }
253
254 /* Hard link handling */
255 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
256 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
257
258 inode = romfs_iget(dir->i_sb, offset);
259 if (IS_ERR(inode)) {
260 ret = PTR_ERR(inode);
261 goto error;
262 }
263 goto outi;
264
265 /*
266 * it's a bit funky, _lookup needs to return an error code
267 * (negative) or a NULL, both as a dentry. ENOENT should not
268 * be returned, instead we need to create a negative dentry by
269 * d_add(dentry, NULL); and return 0 as no error.
270 * (Although as I see, it only matters on writable file
271 * systems).
272 */
273out0:
274 inode = NULL;
275outi:
276 d_add(dentry, inode);
277 ret = 0;
278error:
279 return ERR_PTR(ret);
280}
281
282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir,
284 .readdir = romfs_readdir,
285};
286
287static struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup,
289};
290
291/*
292 * get a romfs inode based on its position in the image (which doubles as the
293 * inode number)
294 */
295static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
296{
297 struct romfs_inode_info *inode;
298 struct romfs_inode ri;
299 struct inode *i;
300 unsigned long nlen;
301 unsigned nextfh, ret;
302 umode_t mode;
303
304 /* we might have to traverse a chain of "hard link" file entries to get
305 * to the actual file */
306 for (;;) {
307 ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
308 if (ret < 0)
309 goto error;
310
311 /* XXX: do romfs_checksum here too (with name) */
312
313 nextfh = be32_to_cpu(ri.next);
314 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
315 break;
316
317 pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
318 }
319
320 /* determine the length of the filename */
321 nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
322 if (IS_ERR_VALUE(nlen))
323 goto eio;
324
325 /* get an inode for this image position */
326 i = iget_locked(sb, pos);
327 if (!i)
328 return ERR_PTR(-ENOMEM);
329
330 if (!(i->i_state & I_NEW))
331 return i;
332
333 /* precalculate the data offset */
334 inode = ROMFS_I(i);
335 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
336 inode->i_dataoffset = pos + inode->i_metasize;
337
338 i->i_nlink = 1; /* Hard to decide.. */
339 i->i_size = be32_to_cpu(ri.size);
340 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
341 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
342
343 /* set up mode and ops */
344 mode = romfs_modemap[nextfh & ROMFH_TYPE];
345
346 switch (nextfh & ROMFH_TYPE) {
347 case ROMFH_DIR:
348 i->i_size = ROMFS_I(i)->i_metasize;
349 i->i_op = &romfs_dir_inode_operations;
350 i->i_fop = &romfs_dir_operations;
351 if (nextfh & ROMFH_EXEC)
352 mode |= S_IXUGO;
353 break;
354 case ROMFH_REG:
355 i->i_fop = &romfs_ro_fops;
356 i->i_data.a_ops = &romfs_aops;
357 if (i->i_sb->s_mtd)
358 i->i_data.backing_dev_info =
359 i->i_sb->s_mtd->backing_dev_info;
360 if (nextfh & ROMFH_EXEC)
361 mode |= S_IXUGO;
362 break;
363 case ROMFH_SYM:
364 i->i_op = &page_symlink_inode_operations;
365 i->i_data.a_ops = &romfs_aops;
366 mode |= S_IRWXUGO;
367 break;
368 default:
369 /* depending on MBZ for sock/fifos */
370 nextfh = be32_to_cpu(ri.spec);
371 init_special_inode(i, mode, MKDEV(nextfh >> 16,
372 nextfh & 0xffff));
373 break;
374 }
375
376 i->i_mode = mode;
377
378 unlock_new_inode(i);
379 return i;
380
381eio:
382 ret = -EIO;
383error:
384 printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
385 return ERR_PTR(ret);
386}
387
388/*
389 * allocate a new inode
390 */
391static struct inode *romfs_alloc_inode(struct super_block *sb)
392{
393 struct romfs_inode_info *inode;
394 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
395 return inode ? &inode->vfs_inode : NULL;
396}
397
398/*
399 * return a spent inode to the slab cache
400 */
401static void romfs_destroy_inode(struct inode *inode)
402{
403 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
404}
405
406/*
407 * get filesystem statistics
408 */
409static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
410{
411 buf->f_type = ROMFS_MAGIC;
412 buf->f_namelen = ROMFS_MAXFN;
413 buf->f_bsize = ROMBSIZE;
414 buf->f_bfree = buf->f_bavail = buf->f_ffree;
415 buf->f_blocks =
416 (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
417 return 0;
418}
419
420/*
421 * remounting must involve read-only
422 */
423static int romfs_remount(struct super_block *sb, int *flags, char *data)
424{
425 *flags |= MS_RDONLY;
426 return 0;
427}
428
/* superblock operations: read-only fs, so no write_inode/put_super etc. */
static const struct super_operations romfs_super_ops = {
	.alloc_inode	= romfs_alloc_inode,
	.destroy_inode	= romfs_destroy_inode,
	.statfs		= romfs_statfs,
	.remount_fs	= romfs_remount,
};
435
436/*
437 * checksum check on part of a romfs filesystem
438 */
439static __u32 romfs_checksum(const void *data, int size)
440{
441 const __be32 *ptr = data;
442 __u32 sum;
443
444 sum = 0;
445 size >>= 2;
446 while (size > 0) {
447 sum += be32_to_cpu(*ptr++);
448 size--;
449 }
450 return sum;
451}
452
453/*
454 * fill in the superblock
455 */
456static int romfs_fill_super(struct super_block *sb, void *data, int silent)
457{
458 struct romfs_super_block *rsb;
459 struct inode *root;
460 unsigned long pos, img_size;
461 const char *storage;
462 size_t len;
463 int ret;
464
465#ifdef CONFIG_BLOCK
466 if (!sb->s_mtd) {
467 sb_set_blocksize(sb, ROMBSIZE);
468 } else {
469 sb->s_blocksize = ROMBSIZE;
470 sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
471 }
472#endif
473
474 sb->s_maxbytes = 0xFFFFFFFF;
475 sb->s_magic = ROMFS_MAGIC;
476 sb->s_flags |= MS_RDONLY | MS_NOATIME;
477 sb->s_op = &romfs_super_ops;
478
479 /* read the image superblock and check it */
480 rsb = kmalloc(512, GFP_KERNEL);
481 if (!rsb)
482 return -ENOMEM;
483
484 sb->s_fs_info = (void *) 512;
485 ret = romfs_dev_read(sb, 0, rsb, 512);
486 if (ret < 0)
487 goto error_rsb;
488
489 img_size = be32_to_cpu(rsb->size);
490
491 if (sb->s_mtd && img_size > sb->s_mtd->size)
492 goto error_rsb_inval;
493
494 sb->s_fs_info = (void *) img_size;
495
496 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
497 img_size < ROMFH_SIZE) {
498 if (!silent)
499 printk(KERN_WARNING "VFS:"
500 " Can't find a romfs filesystem on dev %s.\n",
501 sb->s_id);
502 goto error_rsb_inval;
503 }
504
505 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
506 printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
507 sb->s_id);
508 goto error_rsb_inval;
509 }
510
511 storage = sb->s_mtd ? "MTD" : "the block layer";
512
513 len = strnlen(rsb->name, ROMFS_MAXFN);
514 if (!silent)
515 printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
516 (unsigned) len, (unsigned) len, rsb->name, storage);
517
518 kfree(rsb);
519 rsb = NULL;
520
521 /* find the root directory */
522 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
523
524 root = romfs_iget(sb, pos);
525 if (!root)
526 goto error;
527
528 sb->s_root = d_alloc_root(root);
529 if (!sb->s_root)
530 goto error_i;
531
532 return 0;
533
534error_i:
535 iput(root);
536error:
537 return -EINVAL;
538error_rsb_inval:
539 ret = -EINVAL;
540error_rsb:
541 return ret;
542}
543
544/*
545 * get a superblock for mounting
546 */
547static int romfs_get_sb(struct file_system_type *fs_type,
548 int flags, const char *dev_name,
549 void *data, struct vfsmount *mnt)
550{
551 int ret = -EINVAL;
552
553#ifdef CONFIG_ROMFS_ON_MTD
554 ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
555 mnt);
556#endif
557#ifdef CONFIG_ROMFS_ON_BLOCK
558 if (ret == -EINVAL)
559 ret = get_sb_bdev(fs_type, flags, dev_name, data,
560 romfs_fill_super, mnt);
561#endif
562 return ret;
563}
564
565/*
566 * destroy a romfs superblock in the appropriate manner
567 */
568static void romfs_kill_sb(struct super_block *sb)
569{
570#ifdef CONFIG_ROMFS_ON_MTD
571 if (sb->s_mtd) {
572 kill_mtd_super(sb);
573 return;
574 }
575#endif
576#ifdef CONFIG_ROMFS_ON_BLOCK
577 if (sb->s_bdev) {
578 kill_block_super(sb);
579 return;
580 }
581#endif
582}
583
/* filesystem registration: romfs always needs a backing device */
static struct file_system_type romfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "romfs",
	.get_sb		= romfs_get_sb,
	.kill_sb	= romfs_kill_sb,
	.fs_flags	= FS_REQUIRES_DEV,
};
591
592/*
593 * inode storage initialiser
594 */
595static void romfs_i_init_once(void *_inode)
596{
597 struct romfs_inode_info *inode = _inode;
598
599 inode_init_once(&inode->vfs_inode);
600}
601
602/*
603 * romfs module initialisation
604 */
605static int __init init_romfs_fs(void)
606{
607 int ret;
608
609 printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
610
611 romfs_inode_cachep =
612 kmem_cache_create("romfs_i",
613 sizeof(struct romfs_inode_info), 0,
614 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
615 romfs_i_init_once);
616
617 if (!romfs_inode_cachep) {
618 printk(KERN_ERR
619 "ROMFS error: Failed to initialise inode cache\n");
620 return -ENOMEM;
621 }
622 ret = register_filesystem(&romfs_fs_type);
623 if (ret) {
624 printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
625 goto error_register;
626 }
627 return 0;
628
629error_register:
630 kmem_cache_destroy(romfs_inode_cachep);
631 return ret;
632}
633
634/*
635 * romfs module removal
636 */
637static void __exit exit_romfs_fs(void)
638{
639 unregister_filesystem(&romfs_fs_type);
640 kmem_cache_destroy(romfs_inode_cachep);
641}
642
643module_init(init_romfs_fs);
644module_exit(exit_romfs_fs);
645
646MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
647MODULE_AUTHOR("Red Hat, Inc.");
648MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc1..2b1b8fe5e037 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h> 42#include <linux/zlib.h>
43#include <linux/slab.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
194} 194}
195 195
196/** 196/**
197 * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. 197 * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
198 * @c: UBIFS file-system description object 198 * @c: UBIFS file-system description object
199 * 199 *
200 * This function calculates and returns the number of eraseblocks which should 200 * This function calculates and returns the number of LEBs which should be kept
201 * be kept for index usage. 201 * for index usage.
202 */ 202 */
203int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
204{ 204{
205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; 205 int idx_lebs;
206 long long idx_size; 206 long long idx_size;
207 207
208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
209
210 /* And make sure we have thrice the index size of space reserved */ 209 /* And make sure we have thrice the index size of space reserved */
211 idx_size = idx_size + (idx_size << 1); 210 idx_size += idx_size << 1;
212
213 /* 211 /*
214 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 212 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
215 * pair, nor similarly the two variables for the new index size, so we 213 * pair, nor similarly the two variables for the new index size, so we
216 * have to do this costly 64-bit division on fast-path. 214 * have to do this costly 64-bit division on fast-path.
217 */ 215 */
218 idx_size += eff_leb_size - 1; 216 idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
219 idx_lebs = div_u64(idx_size, eff_leb_size);
220 /* 217 /*
221 * The index head is not available for the in-the-gaps method, so add an 218 * The index head is not available for the in-the-gaps method, so add an
222 * extra LEB to compensate. 219 * extra LEB to compensate.
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
310 * do_budget_space - reserve flash space for index and data growth. 307 * do_budget_space - reserve flash space for index and data growth.
311 * @c: UBIFS file-system description object 308 * @c: UBIFS file-system description object
312 * 309 *
313 * This function makes sure UBIFS has enough free eraseblocks for index growth 310 * This function makes sure UBIFS has enough free LEBs for index growth and
314 * and data. 311 * data.
315 * 312 *
316 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index 313 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
317 * would take if it was consolidated and written to the flash. This guarantees 314 * would take if it was consolidated and written to the flash. This guarantees
318 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 315 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
319 * be able to commit dirty index. So this function basically adds amount of 316 * be able to commit dirty index. So this function basically adds amount of
320 * budgeted index space to the size of the current index, multiplies this by 3, 317 * budgeted index space to the size of the current index, multiplies this by 3,
321 * and makes sure this does not exceed the amount of free eraseblocks. 318 * and makes sure this does not exceed the amount of free LEBs.
322 * 319 *
323 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 320 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
324 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 321 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
325 * be large, because UBIFS does not do any index consolidation as long as 322 * be large, because UBIFS does not do any index consolidation as long as
326 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 323 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
327 * will contain a lot of dirt. 324 * will contain a lot of dirt.
328 * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be 325 * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW,
329 * consolidated to take up to @c->min_idx_lebs LEBs. 326 * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
330 * 327 *
331 * This function returns zero in case of success, and %-ENOSPC in case of 328 * This function returns zero in case of success, and %-ENOSPC in case of
332 * failure. 329 * failure.
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
695 * This function calculates amount of free space to report to user-space. 692 * This function calculates amount of free space to report to user-space.
696 * 693 *
697 * Because UBIFS may introduce substantial overhead (the index, node headers, 694 * Because UBIFS may introduce substantial overhead (the index, node headers,
698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real 695 * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
699 * amount of free flash space it has (well, because not all dirty space is 696 * free flash space it has (well, because not all dirty space is reclaimable,
700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, 697 * UBIFS does not actually know the real amount). If UBIFS did so, it would
701 * it would bread user expectations about what free space is. Users seem to 698 * bread user expectations about what free space is. Users seem to accustomed
702 * accustomed to assume that if the file-system reports N bytes of free space, 699 * to assume that if the file-system reports N bytes of free space, they would
703 * they would be able to fit a file of N bytes to the FS. This almost works for 700 * be able to fit a file of N bytes to the FS. This almost works for
704 * traditional file-systems, because they have way less overhead than UBIFS. 701 * traditional file-systems, because they have way less overhead than UBIFS.
705 * So, to keep users happy, UBIFS tries to take the overhead into account. 702 * So, to keep users happy, UBIFS tries to take the overhead into account.
706 */ 703 */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
479 "bad or corrupted node)"); 479 "bad or corrupted node)");
480 else { 480 else {
481 for (i = 0; i < nlen && dent->name[i]; i++) 481 for (i = 0; i < nlen && dent->name[i]; i++)
482 printk("%c", dent->name[i]); 482 printk(KERN_CONT "%c", dent->name[i]);
483 } 483 }
484 printk("\n"); 484 printk(KERN_CONT "\n");
485 485
486 break; 486 break;
487 } 487 }
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
1214 1214
1215 /* 1215 /*
1216 * Make sure the last key in our znode is less or 1216 * Make sure the last key in our znode is less or
1217 * equivalent than the the key in zbranch which goes 1217 * equivalent than the key in the zbranch which goes
1218 * after our pointing zbranch. 1218 * after our pointing zbranch.
1219 */ 1219 */
1220 cmp = keys_cmp(c, max, 1220 cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0ff89fe71e51..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
430 struct ubifs_inode *ui = ubifs_inode(inode); 430 struct ubifs_inode *ui = ubifs_inode(inode);
431 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 431 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
433 int skipped_read = 0;
433 struct page *page; 434 struct page *page;
434 435
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
444 445
445 if (!PageUptodate(page)) { 446 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 447 /* The page is not loaded from the flash */
447 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
448 /* 449 /*
449 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 451 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
453 * the media. 454 * the media.
454 */ 455 */
455 SetPageChecked(page); 456 SetPageChecked(page);
456 else { 457 skipped_read = 1;
458 } else {
457 err = do_readpage(page); 459 err = do_readpage(page);
458 if (err) { 460 if (err) {
459 unlock_page(page); 461 unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
470 if (unlikely(err)) { 472 if (unlikely(err)) {
471 ubifs_assert(err == -ENOSPC); 473 ubifs_assert(err == -ENOSPC);
472 /* 474 /*
475 * If we skipped reading the page because we were going to
476 * write all of it, then it is not up to date.
477 */
478 if (skipped_read) {
479 ClearPageChecked(page);
480 ClearPageUptodate(page);
481 }
482 /*
473 * Budgeting failed which means it would have to force 483 * Budgeting failed which means it would have to force
474 * write-back but didn't, because we set the @fast flag in the 484 * write-back but didn't, because we set the @fast flag in the
475 * request. Write-back cannot be done now, while we have the 485 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
949 * whole index and correct all inode sizes, which is long an unacceptable. 959 * whole index and correct all inode sizes, which is long an unacceptable.
950 * 960 *
951 * To prevent situations like this, UBIFS writes pages back only if they are 961 * To prevent situations like this, UBIFS writes pages back only if they are
952 * within last synchronized inode size, i.e. the the size which has been 962 * within the last synchronized inode size, i.e. the size which has been
953 * written to the flash media last time. Otherwise, UBIFS forces inode 963 * written to the flash media last time. Otherwise, UBIFS forces inode
954 * write-back, thus making sure the on-flash inode contains current inode size, 964 * write-back, thus making sure the on-flash inode contains current inode size,
955 * and then keeps writing pages back. 965 * and then keeps writing pages back.
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
478 * ubifs_find_free_space - find a data LEB with free space. 478 * ubifs_find_free_space - find a data LEB with free space.
479 * @c: the UBIFS file-system description object 479 * @c: the UBIFS file-system description object
480 * @min_space: minimum amount of required free space 480 * @min_space: minimum amount of required free space
481 * @free: contains amount of free space in the LEB on exit 481 * @offs: contains offset of where free space starts on exit
482 * @squeeze: whether to try to find space in a non-empty LEB first 482 * @squeeze: whether to try to find space in a non-empty LEB first
483 * 483 *
484 * This function looks for an LEB with at least @min_space bytes of free space. 484 * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
490 * failed to find a LEB with @min_space bytes of free space and other a negative 490 * failed to find a LEB with @min_space bytes of free space and other a negative
491 * error codes in case of failure. 491 * error codes in case of failure.
492 */ 492 */
493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
494 int squeeze) 494 int squeeze)
495{ 495{
496 const struct ubifs_lprops *lprops; 496 const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
558 spin_unlock(&c->space_lock); 558 spin_unlock(&c->space_lock);
559 } 559 }
560 560
561 *free = lprops->free; 561 *offs = c->leb_size - lprops->free;
562 ubifs_release_lprops(c); 562 ubifs_release_lprops(c);
563 563
564 if (*free == c->leb_size) { 564 if (*offs == 0) {
565 /* 565 /*
566 * Ensure that empty LEBs have been unmapped. They may not have 566 * Ensure that empty LEBs have been unmapped. They may not have
567 * been, for example, because of an unclean unmount. Also 567 * been, for example, because of an unclean unmount. Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
573 return err; 573 return err;
574 } 574 }
575 575
576 dbg_find("found LEB %d, free %d", lnum, *free); 576 dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
577 ubifs_assert(*free >= min_space); 577 ubifs_assert(*offs <= c->leb_size - min_space);
578 return lnum; 578 return lnum;
579 579
580out: 580out:
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
47 * have to waste large pieces of free space at the end of LEB B, because nodes 47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of 48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB 49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh 50 * which are guaranteed to be reclaimable. If LEB has less space, the GC might
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark 51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
52 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so 52 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
57#include "ubifs.h" 57#include "ubifs.h"
58 58
59/* 59/*
60 * GC tries to optimize the way it fit nodes to available space, and it sorts
61 * nodes a little. The below constants are watermarks which define "large",
62 * "medium", and "small" nodes.
63 */
64#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
65#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
66
67/*
68 * GC may need to move more than one LEB to make progress. The below constants 60 * GC may need to move more than one LEB to make progress. The below constants
69 * define "soft" and "hard" limits on the number of LEBs the garbage collector 61 * define "soft" and "hard" limits on the number of LEBs the garbage collector
70 * may move. 62 * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
116} 108}
117 109
118/** 110/**
119 * joinup - bring data nodes for an inode together. 111 * list_sort - sort a list.
120 * @c: UBIFS file-system description object 112 * @priv: private data, passed to @cmp
121 * @sleb: describes scanned LEB 113 * @head: the list to sort
122 * @inum: inode number 114 * @cmp: the elements comparison function
123 * @blk: block number
124 * @data: list to which to add data nodes
125 * 115 *
126 * This function looks at the first few nodes in the scanned LEB @sleb and adds 116 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
127 * them to @data if they are data nodes from @inum and have a larger block 117 * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
128 * number than @blk. This function returns %0 on success and a negative error 118 * in ascending order.
129 * code on failure. 119 *
 120 * The comparison function @cmp is supposed to return a negative value if @a is
 121 * less than @b, and a positive value if @a is greater than @b. If @a and @b are
122 * equivalent, then it does not matter what this function returns.
130 */ 123 */
131static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, 124static void list_sort(void *priv, struct list_head *head,
132 unsigned int blk, struct list_head *data) 125 int (*cmp)(void *priv, struct list_head *a,
126 struct list_head *b))
133{ 127{
134 int err, cnt = 6, lnum = sleb->lnum, offs; 128 struct list_head *p, *q, *e, *list, *tail, *oldhead;
135 struct ubifs_scan_node *snod, *tmp; 129 int insize, nmerges, psize, qsize, i;
136 union ubifs_key *key; 130
131 if (list_empty(head))
132 return;
133
134 list = head->next;
135 list_del(head);
136 insize = 1;
137 for (;;) {
138 p = oldhead = list;
139 list = tail = NULL;
140 nmerges = 0;
141
142 while (p) {
143 nmerges++;
144 q = p;
145 psize = 0;
146 for (i = 0; i < insize; i++) {
147 psize++;
148 q = q->next == oldhead ? NULL : q->next;
149 if (!q)
150 break;
151 }
137 152
138 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 153 qsize = insize;
139 key = &snod->key; 154 while (psize > 0 || (qsize > 0 && q)) {
140 if (key_inum(c, key) == inum && 155 if (!psize) {
141 key_type(c, key) == UBIFS_DATA_KEY && 156 e = q;
142 key_block(c, key) > blk) { 157 q = q->next;
143 offs = snod->offs; 158 qsize--;
144 err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); 159 if (q == oldhead)
145 if (err < 0) 160 q = NULL;
146 return err; 161 } else if (!qsize || !q) {
147 list_del(&snod->list); 162 e = p;
148 if (err) { 163 p = p->next;
149 list_add_tail(&snod->list, data); 164 psize--;
150 blk = key_block(c, key); 165 if (p == oldhead)
151 } else 166 p = NULL;
152 kfree(snod); 167 } else if (cmp(priv, p, q) <= 0) {
153 cnt = 6; 168 e = p;
154 } else if (--cnt == 0) 169 p = p->next;
170 psize--;
171 if (p == oldhead)
172 p = NULL;
173 } else {
174 e = q;
175 q = q->next;
176 qsize--;
177 if (q == oldhead)
178 q = NULL;
179 }
180 if (tail)
181 tail->next = e;
182 else
183 list = e;
184 e->prev = tail;
185 tail = e;
186 }
187 p = q;
188 }
189
190 tail->next = list;
191 list->prev = tail;
192
193 if (nmerges <= 1)
155 break; 194 break;
195
196 insize *= 2;
156 } 197 }
157 return 0; 198
199 head->next = list;
200 head->prev = list->prev;
201 list->prev->next = head;
202 list->prev = head;
158} 203}
159 204
160/** 205/**
161 * move_nodes - move nodes. 206 * data_nodes_cmp - compare 2 data nodes.
207 * @priv: UBIFS file-system description object
208 * @a: first data node
 209 * @b: second data node
210 *
211 * This function compares data nodes @a and @b. Returns %1 if @a has greater
212 * inode or block number, and %-1 otherwise.
213 */
214int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
215{
216 ino_t inuma, inumb;
217 struct ubifs_info *c = priv;
218 struct ubifs_scan_node *sa, *sb;
219
220 cond_resched();
221 sa = list_entry(a, struct ubifs_scan_node, list);
222 sb = list_entry(b, struct ubifs_scan_node, list);
223 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
224 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
225
226 inuma = key_inum(c, &sa->key);
227 inumb = key_inum(c, &sb->key);
228
229 if (inuma == inumb) {
230 unsigned int blka = key_block(c, &sa->key);
231 unsigned int blkb = key_block(c, &sb->key);
232
233 if (blka <= blkb)
234 return -1;
235 } else if (inuma <= inumb)
236 return -1;
237
238 return 1;
239}
240
241/*
242 * nondata_nodes_cmp - compare 2 non-data nodes.
243 * @priv: UBIFS file-system description object
244 * @a: first node
 245 * @b: second node
246 *
247 * This function compares nodes @a and @b. It makes sure that inode nodes go
248 * first and sorted by length in descending order. Directory entry nodes go
 249 * after inode nodes and are sorted in ascending hash value order.
250 */
251int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
252{
253 int typea, typeb;
254 ino_t inuma, inumb;
255 struct ubifs_info *c = priv;
256 struct ubifs_scan_node *sa, *sb;
257
258 cond_resched();
259 sa = list_entry(a, struct ubifs_scan_node, list);
260 sb = list_entry(b, struct ubifs_scan_node, list);
261 typea = key_type(c, &sa->key);
262 typeb = key_type(c, &sb->key);
263 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
264
265 /* Inodes go before directory entries */
266 if (typea == UBIFS_INO_KEY) {
267 if (typeb == UBIFS_INO_KEY)
268 return sb->len - sa->len;
269 return -1;
270 }
271 if (typeb == UBIFS_INO_KEY)
272 return 1;
273
274 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
275 inuma = key_inum(c, &sa->key);
276 inumb = key_inum(c, &sb->key);
277
278 if (inuma == inumb) {
279 uint32_t hasha = key_hash(c, &sa->key);
280 uint32_t hashb = key_hash(c, &sb->key);
281
282 if (hasha <= hashb)
283 return -1;
284 } else if (inuma <= inumb)
285 return -1;
286
287 return 1;
288}
289
290/**
291 * sort_nodes - sort nodes for GC.
162 * @c: UBIFS file-system description object 292 * @c: UBIFS file-system description object
163 * @sleb: describes nodes to move 293 * @sleb: describes nodes to sort and contains the result on exit
294 * @nondata: contains non-data nodes on exit
295 * @min: minimum node size is returned here
164 * 296 *
165 * This function moves valid nodes from data LEB described by @sleb to the GC 297 * This function sorts the list of inodes to garbage collect. First of all, it
166 * journal head. The obsolete nodes are dropped. 298 * kills obsolete nodes and separates data and non-data nodes to the
299 * @sleb->nodes and @nondata lists correspondingly.
300 *
301 * Data nodes are then sorted in block number order - this is important for
302 * bulk-read; data nodes with lower inode number go before data nodes with
303 * higher inode number, and data nodes with lower block number go before data
304 * nodes with higher block number;
167 * 305 *
168 * When moving nodes we have to deal with classical bin-packing problem: the 306 * Non-data nodes are sorted as follows.
169 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", 307 * o First go inode nodes - they are sorted in descending length order.
170 * where the nodes in the @sleb->nodes list are the elements which should be 308 * o Then go directory entry nodes - they are sorted in hash order, which
171 * fit optimally to the bins. This function uses the "first fit decreasing" 309 * should supposedly optimize 'readdir()'. Direntry nodes with lower parent
172 * strategy, although it does not really sort the nodes but just split them on 310 * inode number go before direntry nodes with higher parent inode number,
173 * 3 classes - large, medium, and small, so they are roughly sorted. 311 * and direntry nodes with lower name hash values go before direntry nodes
312 * with higher name hash values.
174 * 313 *
175 * This function returns zero in case of success, %-EAGAIN if commit is 314 * This function returns zero in case of success and a negative error code in
176 * required, and other negative error codes in case of other failures. 315 * case of failure.
177 */ 316 */
178static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) 317static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
318 struct list_head *nondata, int *min)
179{ 319{
180 struct ubifs_scan_node *snod, *tmp; 320 struct ubifs_scan_node *snod, *tmp;
181 struct list_head data, large, medium, small;
182 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
183 int avail, err, min = INT_MAX;
184 unsigned int blk = 0;
185 ino_t inum = 0;
186 321
187 INIT_LIST_HEAD(&data); 322 *min = INT_MAX;
188 INIT_LIST_HEAD(&large);
189 INIT_LIST_HEAD(&medium);
190 INIT_LIST_HEAD(&small);
191 323
192 while (!list_empty(&sleb->nodes)) { 324 /* Separate data nodes and non-data nodes */
193 struct list_head *lst = sleb->nodes.next; 325 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
194 326 int err;
195 snod = list_entry(lst, struct ubifs_scan_node, list);
196 327
197 ubifs_assert(snod->type != UBIFS_IDX_NODE); 328 ubifs_assert(snod->type != UBIFS_IDX_NODE);
198 ubifs_assert(snod->type != UBIFS_REF_NODE); 329 ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
201 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 332 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
202 snod->offs, 0); 333 snod->offs, 0);
203 if (err < 0) 334 if (err < 0)
204 goto out; 335 return err;
205 336
206 list_del(lst);
207 if (!err) { 337 if (!err) {
208 /* The node is obsolete, remove it from the list */ 338 /* The node is obsolete, remove it from the list */
339 list_del(&snod->list);
209 kfree(snod); 340 kfree(snod);
210 continue; 341 continue;
211 } 342 }
212 343
213 /* 344 if (snod->len < *min)
214 * Sort the list of nodes so that data nodes go first, large 345 *min = snod->len;
215 * nodes go second, and small nodes go last. 346
216 */ 347 if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
217 if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { 348 list_move_tail(&snod->list, nondata);
218 if (inum != key_inum(c, &snod->key)) {
219 if (inum) {
220 /*
221 * Try to move data nodes from the same
222 * inode together.
223 */
224 err = joinup(c, sleb, inum, blk, &data);
225 if (err)
226 goto out;
227 }
228 inum = key_inum(c, &snod->key);
229 blk = key_block(c, &snod->key);
230 }
231 list_add_tail(lst, &data);
232 } else if (snod->len > MEDIUM_NODE_WM)
233 list_add_tail(lst, &large);
234 else if (snod->len > SMALL_NODE_WM)
235 list_add_tail(lst, &medium);
236 else
237 list_add_tail(lst, &small);
238
239 /* And find the smallest node */
240 if (snod->len < min)
241 min = snod->len;
242 } 349 }
243 350
244 /* 351 /* Sort data and non-data nodes */
245 * Join the tree lists so that we'd have one roughly sorted list 352 list_sort(c, &sleb->nodes, &data_nodes_cmp);
246 * ('large' will be the head of the joined list). 353 list_sort(c, nondata, &nondata_nodes_cmp);
247 */ 354 return 0;
248 list_splice(&data, &large); 355}
249 list_splice(&medium, large.prev); 356
250 list_splice(&small, large.prev); 357/**
358 * move_node - move a node.
359 * @c: UBIFS file-system description object
360 * @sleb: describes the LEB to move nodes from
 361 * @snod: the node to move
362 * @wbuf: write-buffer to move node to
363 *
364 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
365 * destroys @snod. Returns zero in case of success and a negative error code in
366 * case of failure.
367 */
368static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
369 struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
370{
371 int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
372
373 cond_resched();
374 err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
375 if (err)
376 return err;
377
378 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
379 snod->offs, new_lnum, new_offs,
380 snod->len);
381 list_del(&snod->list);
382 kfree(snod);
383 return err;
384}
385
386/**
387 * move_nodes - move nodes.
388 * @c: UBIFS file-system description object
389 * @sleb: describes the LEB to move nodes from
390 *
391 * This function moves valid nodes from data LEB described by @sleb to the GC
392 * journal head. This function returns zero in case of success, %-EAGAIN if
393 * commit is required, and other negative error codes in case of other
394 * failures.
395 */
396static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
397{
398 int err, min;
399 LIST_HEAD(nondata);
400 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
251 401
252 if (wbuf->lnum == -1) { 402 if (wbuf->lnum == -1) {
253 /* 403 /*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
256 */ 406 */
257 err = switch_gc_head(c); 407 err = switch_gc_head(c);
258 if (err) 408 if (err)
259 goto out; 409 return err;
260 } 410 }
261 411
412 err = sort_nodes(c, sleb, &nondata, &min);
413 if (err)
414 goto out;
415
262 /* Write nodes to their new location. Use the first-fit strategy */ 416 /* Write nodes to their new location. Use the first-fit strategy */
263 while (1) { 417 while (1) {
264 avail = c->leb_size - wbuf->offs - wbuf->used; 418 int avail;
265 list_for_each_entry_safe(snod, tmp, &large, list) { 419 struct ubifs_scan_node *snod, *tmp;
266 int new_lnum, new_offs; 420
421 /* Move data nodes */
422 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
423 avail = c->leb_size - wbuf->offs - wbuf->used;
424 if (snod->len > avail)
425 /*
426 * Do not skip data nodes in order to optimize
427 * bulk-read.
428 */
429 break;
430
431 err = move_node(c, sleb, snod, wbuf);
432 if (err)
433 goto out;
434 }
267 435
436 /* Move non-data nodes */
437 list_for_each_entry_safe(snod, tmp, &nondata, list) {
438 avail = c->leb_size - wbuf->offs - wbuf->used;
268 if (avail < min) 439 if (avail < min)
269 break; 440 break;
270 441
271 if (snod->len > avail) 442 if (snod->len > avail) {
272 /* This node does not fit */ 443 /*
444 * Keep going only if this is an inode with
445 * some data. Otherwise stop and switch the GC
446 * head. IOW, we assume that data-less inode
447 * nodes and direntry nodes are roughly of the
448 * same size.
449 */
450 if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
451 snod->len == UBIFS_INO_NODE_SZ)
452 break;
273 continue; 453 continue;
454 }
274 455
275 cond_resched(); 456 err = move_node(c, sleb, snod, wbuf);
276
277 new_lnum = wbuf->lnum;
278 new_offs = wbuf->offs + wbuf->used;
279 err = ubifs_wbuf_write_nolock(wbuf, snod->node,
280 snod->len);
281 if (err) 457 if (err)
282 goto out; 458 goto out;
283 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
284 snod->offs, new_lnum, new_offs,
285 snod->len);
286 if (err)
287 goto out;
288
289 avail = c->leb_size - wbuf->offs - wbuf->used;
290 list_del(&snod->list);
291 kfree(snod);
292 } 459 }
293 460
294 if (list_empty(&large)) 461 if (list_empty(&sleb->nodes) && list_empty(&nondata))
295 break; 462 break;
296 463
297 /* 464 /*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
306 return 0; 473 return 0;
307 474
308out: 475out:
309 list_for_each_entry_safe(snod, tmp, &large, list) { 476 list_splice_tail(&nondata, &sleb->nodes);
310 list_del(&snod->list);
311 kfree(snod);
312 }
313 return err; 477 return err;
314} 478}
315 479
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
114 */ 114 */
115static int reserve_space(struct ubifs_info *c, int jhead, int len) 115static int reserve_space(struct ubifs_info *c, int jhead, int len)
116{ 116{
117 int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; 117 int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; 118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
119 119
120 /* 120 /*
@@ -139,10 +139,9 @@ again:
 139 * Write buffer wasn't seek'ed or there is not enough space - look for an 139 * Write buffer wasn't seek'ed or there is not enough space - look for an
140 * LEB with some empty space. 140 * LEB with some empty space.
141 */ 141 */
142 lnum = ubifs_find_free_space(c, len, &free, squeeze); 142 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
143 if (lnum >= 0) { 143 if (lnum >= 0) {
144 /* Found an LEB, add it to the journal head */ 144 /* Found an LEB, add it to the journal head */
145 offs = c->leb_size - free;
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs); 145 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err) 146 if (err)
148 goto out_return; 147 goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
1366 * @host: host inode 1365 * @host: host inode
1367 * 1366 *
1368 * This function writes the updated version of an extended attribute inode and 1367 * This function writes the updated version of an extended attribute inode and
1369 * the host inode tho the journal (to the base head). The host inode is written 1368 * the host inode to the journal (to the base head). The host inode is written
1370 * after the extended attribute inode in order to guarantee that the extended 1369 * after the extended attribute inode in order to guarantee that the extended
1371 * attribute will be flushed when the inode is synchronized by 'fsync()' and 1370 * attribute will be flushed when the inode is synchronized by 'fsync()' and
1372 * consequently, the write-buffer is synchronized. This function returns zero 1371 * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
381 * @c: UBIFS file-system description object 381 * @c: UBIFS file-system description object
382 * @key: the key to get hash from 382 * @key: the key to get hash from
383 */ 383 */
384static inline int key_hash(const struct ubifs_info *c, 384static inline uint32_t key_hash(const struct ubifs_info *c,
385 const union ubifs_key *key) 385 const union ubifs_key *key)
386{ 386{
387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK; 387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
388} 388}
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
392 * @c: UBIFS file-system description object 392 * @c: UBIFS file-system description object
393 * @k: the key to get hash from 393 * @k: the key to get hash from
394 */ 394 */
395static inline int key_hash_flash(const struct ubifs_info *c, const void *k) 395static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
396{ 396{
397 const union ubifs_key *key = k; 397 const union ubifs_key *key = k;
398 398
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
239 } 239 }
240 240
241 /* 241 /*
242 * Make sure the the amount of space in buds will not exceed 242 * Make sure the amount of space in buds will not exceed the
243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time 243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
244 * limits. 244 * limits.
245 * 245 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
367 bud->jhead, c->leb_size - bud->start, 367 bud->jhead, c->leb_size - bud->start,
368 c->cmt_bud_bytes); 368 c->cmt_bud_bytes);
369 rb_erase(p1, &c->buds); 369 rb_erase(p1, &c->buds);
370 list_del(&bud->list);
371 /* 370 /*
372 * If the commit does not finish, the recovery will need 371 * If the commit does not finish, the recovery will need
373 * to replay the journal, in which case the old buds 372 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
375 * commit i.e. do not allow them to be garbage 374 * commit i.e. do not allow them to be garbage
376 * collected. 375 * collected.
377 */ 376 */
378 list_add(&bud->list, &c->old_buds); 377 list_move(&bud->list, &c->old_buds);
379 } 378 }
380 } 379 }
381 spin_unlock(&c->buds_lock); 380 spin_unlock(&c->buds_lock);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
229 while (offs + len > c->leb_size) { 229 while (offs + len > c->leb_size) {
230 alen = ALIGN(offs, c->min_io_size); 230 alen = ALIGN(offs, c->min_io_size);
231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
232 dbg_chk_lpt_sz(c, 2, alen - offs); 232 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
233 err = alloc_lpt_leb(c, &lnum); 233 err = alloc_lpt_leb(c, &lnum);
234 if (err) 234 if (err)
235 goto no_space; 235 goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
272 if (offs + c->lsave_sz > c->leb_size) { 272 if (offs + c->lsave_sz > c->leb_size) {
273 alen = ALIGN(offs, c->min_io_size); 273 alen = ALIGN(offs, c->min_io_size);
274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
275 dbg_chk_lpt_sz(c, 2, alen - offs); 275 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
276 err = alloc_lpt_leb(c, &lnum); 276 err = alloc_lpt_leb(c, &lnum);
277 if (err) 277 if (err)
278 goto no_space; 278 goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
292 if (offs + c->ltab_sz > c->leb_size) { 292 if (offs + c->ltab_sz > c->leb_size) {
293 alen = ALIGN(offs, c->min_io_size); 293 alen = ALIGN(offs, c->min_io_size);
294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
295 dbg_chk_lpt_sz(c, 2, alen - offs); 295 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
296 err = alloc_lpt_leb(c, &lnum); 296 err = alloc_lpt_leb(c, &lnum);
297 if (err) 297 if (err)
298 goto no_space; 298 goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
416 alen, UBI_SHORTTERM); 416 alen, UBI_SHORTTERM);
417 if (err) 417 if (err)
418 return err; 418 return err;
419 dbg_chk_lpt_sz(c, 4, alen - wlen);
420 } 419 }
421 dbg_chk_lpt_sz(c, 2, 0); 420 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
422 err = realloc_lpt_leb(c, &lnum); 421 err = realloc_lpt_leb(c, &lnum);
423 if (err) 422 if (err)
424 goto no_space; 423 goto no_space;
425 offs = 0; 424 offs = from = 0;
426 from = 0;
427 ubifs_assert(lnum >= c->lpt_first && 425 ubifs_assert(lnum >= c->lpt_first &&
428 lnum <= c->lpt_last); 426 lnum <= c->lpt_last);
429 err = ubifs_leb_unmap(c, lnum); 427 err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
477 UBI_SHORTTERM); 475 UBI_SHORTTERM);
478 if (err) 476 if (err)
479 return err; 477 return err;
480 dbg_chk_lpt_sz(c, 2, alen - wlen); 478 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
481 err = realloc_lpt_leb(c, &lnum); 479 err = realloc_lpt_leb(c, &lnum);
482 if (err) 480 if (err)
483 goto no_space; 481 goto no_space;
484 offs = 0; 482 offs = from = 0;
485 ubifs_assert(lnum >= c->lpt_first && 483 ubifs_assert(lnum >= c->lpt_first &&
486 lnum <= c->lpt_last); 484 lnum <= c->lpt_last);
487 err = ubifs_leb_unmap(c, lnum); 485 err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
504 UBI_SHORTTERM); 502 UBI_SHORTTERM);
505 if (err) 503 if (err)
506 return err; 504 return err;
507 dbg_chk_lpt_sz(c, 2, alen - wlen); 505 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
508 err = realloc_lpt_leb(c, &lnum); 506 err = realloc_lpt_leb(c, &lnum);
509 if (err) 507 if (err)
510 goto no_space; 508 goto no_space;
511 offs = 0; 509 offs = from = 0;
512 ubifs_assert(lnum >= c->lpt_first && 510 ubifs_assert(lnum >= c->lpt_first &&
513 lnum <= c->lpt_last); 511 lnum <= c->lpt_last);
514 err = ubifs_leb_unmap(c, lnum); 512 err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1756/** 1754/**
1757 * dbg_chk_lpt_sz - check LPT does not write more than LPT size. 1755 * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
1758 * @c: the UBIFS file-system description object 1756 * @c: the UBIFS file-system description object
1759 * @action: action 1757 * @action: what to do
1760 * @len: length written 1758 * @len: length written
1761 * 1759 *
1762 * This function returns %0 on success and a negative error code on failure. 1760 * This function returns %0 on success and a negative error code on failure.
1761 * The @action argument may be one of:
1762 * o %0 - LPT debugging checking starts, initialize debugging variables;
1763 * o %1 - wrote an LPT node, increase LPT size by @len bytes;
1764 * o %2 - switched to a different LEB and wasted @len bytes;
1765 * o %3 - check that we've written the right number of bytes.
1766 * o %4 - wasted @len bytes;
1763 */ 1767 */
1764int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1768int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1765{ 1769{
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1917 lnum, offs); 1921 lnum, offs);
1918 err = ubifs_unpack_nnode(c, buf, &nnode); 1922 err = ubifs_unpack_nnode(c, buf, &nnode);
1919 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1923 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1920 printk("%d:%d", nnode.nbranch[i].lnum, 1924 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1921 nnode.nbranch[i].offs); 1925 nnode.nbranch[i].offs);
1922 if (i != UBIFS_LPT_FANOUT - 1) 1926 if (i != UBIFS_LPT_FANOUT - 1)
1923 printk(", "); 1927 printk(KERN_CONT ", ");
1924 } 1928 }
1925 printk("\n"); 1929 printk(KERN_CONT "\n");
1926 break; 1930 break;
1927 } 1931 }
1928 case UBIFS_LPT_LTAB: 1932 case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e63..10662975d2ef 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
425 * @lnum: LEB number of the LEB from which @buf was read 425 * @lnum: LEB number of the LEB from which @buf was read
426 * @offs: offset from which @buf was read 426 * @offs: offset from which @buf was read
427 * 427 *
428 * This function scans @buf for more nodes and returns %0 is a node is found and 428 * This function ensures that the corrupted node at @offs is the last thing
429 * %1 if no more nodes are found. 429 * written to a LEB. This function returns %1 if more data is not found and
430 * %0 if more data is found.
430 */ 431 */
431static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, 432static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
432 int lnum, int offs) 433 int lnum, int offs)
433{ 434{
434 int skip, next_offs = 0; 435 struct ubifs_ch *ch = buf;
436 int skip, dlen = le32_to_cpu(ch->len);
435 437
436 if (len > UBIFS_DATA_NODE_SZ) { 438 /* Check for empty space after the corrupt node's common header */
437 struct ubifs_ch *ch = buf; 439 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
438 int dlen = le32_to_cpu(ch->len); 440 if (is_empty(buf + skip, len - skip))
439 441 return 1;
440 if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && 442 /*
441 dlen <= UBIFS_MAX_DATA_NODE_SZ) 443 * The area after the common header size is not empty, so the common
442 /* The corrupt node looks like a data node */ 444 * header must be intact. Check it.
443 next_offs = ALIGN(offs + dlen, 8); 445 */
444 } 446 if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
445 447 dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
446 if (c->min_io_size == 1) 448 return 0;
447 skip = 8;
448 else
449 skip = ALIGN(offs + 1, c->min_io_size) - offs;
450
451 offs += skip;
452 buf += skip;
453 len -= skip;
454 while (len > 8) {
455 struct ubifs_ch *ch = buf;
456 uint32_t magic = le32_to_cpu(ch->magic);
457 int ret;
458
459 if (magic == UBIFS_NODE_MAGIC) {
460 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
461 if (ret == SCANNED_A_NODE || ret > 0) {
462 /*
463 * There is a small chance this is just data in
464 * a data node, so check that possibility. e.g.
465 * this is part of a file that itself contains
466 * a UBIFS image.
467 */
468 if (next_offs && offs + le32_to_cpu(ch->len) <=
469 next_offs)
470 continue;
471 dbg_rcvry("unexpected node at %d:%d", lnum,
472 offs);
473 return 0;
474 }
475 }
476 offs += 8;
477 buf += 8;
478 len -= 8;
479 } 449 }
480 return 1; 450 /* Now we know the corrupt node's length we can skip over it */
451 skip = ALIGN(offs + dlen, c->min_io_size) - offs;
452 /* After which there should be empty space */
453 if (is_empty(buf + skip, len - skip))
454 return 1;
455 dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
456 return 0;
481} 457}
482 458
483/** 459/**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5a..11cc80125a49 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
143 dirty -= c->leb_size - lp->free; 143 dirty -= c->leb_size - lp->free;
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads 146 * zero. The order is not perfect because the journal heads
 147 * race with each other. This is not a problem but it does mean 147 * race with each other. This is not a problem but it does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1bb..57085e43320f 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
193 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
194 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
195 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
196 sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
196 197
197 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); 198 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
198 kfree(sup); 199 kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
532 if (IS_ERR(sup)) 533 if (IS_ERR(sup))
533 return PTR_ERR(sup); 534 return PTR_ERR(sup);
534 535
536 c->fmt_version = le32_to_cpu(sup->fmt_version);
537 c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
538
535 /* 539 /*
536 * The software supports all previous versions but not future versions, 540 * The software supports all previous versions but not future versions,
537 * due to the unavailability of time-travelling equipment. 541 * due to the unavailability of time-travelling equipment.
538 */ 542 */
539 c->fmt_version = le32_to_cpu(sup->fmt_version);
540 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 543 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
541 ubifs_err("on-flash format version is %d, but software only " 544 struct super_block *sb = c->vfs_sb;
542 "supports up to version %d", c->fmt_version, 545 int mounting_ro = sb->s_flags & MS_RDONLY;
543 UBIFS_FORMAT_VERSION); 546
544 err = -EINVAL; 547 ubifs_assert(!c->ro_media || mounting_ro);
545 goto out; 548 if (!mounting_ro ||
549 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
550 ubifs_err("on-flash format version is w%d/r%d, but "
551 "software only supports up to version "
552 "w%d/r%d", c->fmt_version,
553 c->ro_compat_version, UBIFS_FORMAT_VERSION,
554 UBIFS_RO_COMPAT_VERSION);
555 if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
556 ubifs_msg("only R/O mounting is possible");
557 err = -EROFS;
558 } else
559 err = -EINVAL;
560 goto out;
561 }
562
563 /*
564 * The FS is mounted R/O, and the media format is
565 * R/O-compatible with the UBIFS implementation, so we can
566 * mount.
567 */
568 c->rw_incompat = 1;
546 } 569 }
547 570
548 if (c->fmt_version < 3) { 571 if (c->fmt_version < 3) {
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
623 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; 646 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
624 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; 647 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
625 c->main_first = c->leb_cnt - c->main_lebs; 648 c->main_first = c->leb_cnt - c->main_lebs;
626 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
627 649
628 err = validate_sb(c, sup); 650 err = validate_sb(c, sup);
629out: 651out:
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a1410..02feb59cefca 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
206 * Move this one to the end of the list to provide some 206 * Move this one to the end of the list to provide some
207 * fairness. 207 * fairness.
208 */ 208 */
209 list_del(&c->infos_list); 209 list_move_tail(&c->infos_list, &ubifs_infos);
210 list_add_tail(&c->infos_list, &ubifs_infos);
211 mutex_unlock(&c->umount_mutex); 210 mutex_unlock(&c->umount_mutex);
212 if (freed >= nr) 211 if (freed >= nr)
213 break; 212 break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
263 } 262 }
264 263
265 if (i == 1) { 264 if (i == 1) {
266 list_del(&c->infos_list); 265 list_move_tail(&c->infos_list, &ubifs_infos);
267 list_add_tail(&c->infos_list, &ubifs_infos);
268 spin_unlock(&ubifs_infos_lock); 266 spin_unlock(&ubifs_infos_lock);
269 267
270 ubifs_request_bg_commit(c); 268 ubifs_request_bg_commit(c);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c5c98355459a..faa44f90608a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
421 seq_printf(s, ",no_chk_data_crc"); 421 seq_printf(s, ",no_chk_data_crc");
422 422
423 if (c->mount_opts.override_compr) { 423 if (c->mount_opts.override_compr) {
424 seq_printf(s, ",compr="); 424 seq_printf(s, ",compr=%s",
425 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); 425 ubifs_compr_name(c->mount_opts.compr_type));
426 } 426 }
427 427
428 return 0; 428 return 0;
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
700 if (err) 700 if (err)
701 return err; 701 return err;
702 702
703 /* Initialize effective LEB size used in budgeting calculations */
704 c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
703 return 0; 705 return 0;
704} 706}
705 707
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
716 long long tmp64; 718 long long tmp64;
717 719
718 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 720 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
721 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
719 722
720 /* 723 /*
721 * Calculate total amount of FS blocks. This number is not used 724 * Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
1201 goto out_cbuf; 1204 goto out_cbuf;
1202 1205
1203 /* Create background thread */ 1206 /* Create background thread */
1204 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1207 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1205 if (IS_ERR(c->bgt)) { 1208 if (IS_ERR(c->bgt)) {
1206 err = PTR_ERR(c->bgt); 1209 err = PTR_ERR(c->bgt);
1207 c->bgt = NULL; 1210 c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
1318 else { 1321 else {
1319 c->need_recovery = 0; 1322 c->need_recovery = 0;
1320 ubifs_msg("recovery completed"); 1323 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */ 1324 /*
1322 ubifs_assert(c->lst.taken_empty_lebs == 1); 1325 * GC LEB has to be empty and taken at this point. But
1326 * the journal head LEBs may also be accounted as
1327 * "empty taken" if they are empty.
1328 */
1329 ubifs_assert(c->lst.taken_empty_lebs > 0);
1323 } 1330 }
1324 } else 1331 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1); 1332 ubifs_assert(c->lst.taken_empty_lebs > 0);
1326 1333
1327 err = dbg_check_filesystem(c); 1334 err = dbg_check_filesystem(c);
1328 if (err) 1335 if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
1344 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1351 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1345 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1352 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
1346 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1353 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
1347 ubifs_msg("media format: %d (latest is %d)", 1354 ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)",
1348 c->fmt_version, UBIFS_FORMAT_VERSION); 1355 c->fmt_version, c->ro_compat_version,
1356 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
1349 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1357 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
1350 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1358 ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
1351 c->report_rp_size, c->report_rp_size >> 10); 1359 c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1485{ 1493{
1486 int err, lnum; 1494 int err, lnum;
1487 1495
1496 if (c->rw_incompat) {
1497 ubifs_err("the file-system is not R/W-compatible");
1498 ubifs_msg("on-flash format version is w%d/r%d, but software "
1499 "only supports up to version w%d/r%d", c->fmt_version,
1500 c->ro_compat_version, UBIFS_FORMAT_VERSION,
1501 UBIFS_RO_COMPAT_VERSION);
1502 return -EROFS;
1503 }
1504
1488 mutex_lock(&c->umount_mutex); 1505 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c); 1506 dbg_save_space_info(c);
1490 c->remounting_rw = 1; 1507 c->remounting_rw = 1;
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1554 ubifs_create_buds_lists(c); 1571 ubifs_create_buds_lists(c);
1555 1572
1556 /* Create background thread */ 1573 /* Create background thread */
1557 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1574 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1558 if (IS_ERR(c->bgt)) { 1575 if (IS_ERR(c->bgt)) {
1559 err = PTR_ERR(c->bgt); 1576 err = PTR_ERR(c->bgt);
1560 c->bgt = NULL; 1577 c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1775 c->bu.buf = NULL; 1792 c->bu.buf = NULL;
1776 } 1793 }
1777 1794
1778 ubifs_assert(c->lst.taken_empty_lebs == 1); 1795 ubifs_assert(c->lst.taken_empty_lebs > 0);
1779 return 0; 1796 return 0;
1780} 1797}
1781 1798
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1b..f249f7b0d656 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1252 * splitting in the middle of the colliding sequence. Also, when 1252 * splitting in the middle of the colliding sequence. Also, when
1253 * removing the leftmost key, we would have to correct the key of the 1253 * removing the leftmost key, we would have to correct the key of the
1254 * parent node, which would introduce additional complications. Namely, 1254 * parent node, which would introduce additional complications. Namely,
1255 * if we changed the the leftmost key of the parent znode, the garbage 1255 * if we changed the leftmost key of the parent znode, the garbage
1256 * collector would be unable to find it (GC is doing this when GC'ing 1256 * collector would be unable to find it (GC is doing this when GC'ing
1257 * indexing LEBs). Although we already have an additional RB-tree where 1257 * indexing LEBs). Although we already have an additional RB-tree where
1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until 1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72f..3eee07e0c495 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
36/* UBIFS node magic number (must not have the padding byte first or last) */ 36/* UBIFS node magic number (must not have the padding byte first or last) */
37#define UBIFS_NODE_MAGIC 0x06101831 37#define UBIFS_NODE_MAGIC 0x06101831
38 38
39/* UBIFS on-flash format version */ 39/*
40 * UBIFS on-flash format version. This version is increased when the on-flash
41 * format is changing. If this happens, UBIFS is will support older versions as
42 * well. But older UBIFS code will not support newer formats. Format changes
43 * will be rare and only when absolutely necessary, e.g. to fix a bug or to add
44 * a new feature.
45 *
46 * UBIFS went into mainline kernel with format version 4. The older formats
47 * were development formats.
48 */
40#define UBIFS_FORMAT_VERSION 4 49#define UBIFS_FORMAT_VERSION 4
41 50
51/*
52 * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
53 * implementations will not be able to mount newer formats in read-write mode.
54 * However, depending on the change, it may be possible to mount newer formats
55 * in R/O mode. This is indicated by the R/O compatibility version which is
56 * stored in the super-block.
57 *
58 * This is needed to support boot-loaders which only need R/O mounting. With
59 * this flag it is possible to do UBIFS format changes without a need to update
60 * boot-loaders.
61 */
62#define UBIFS_RO_COMPAT_VERSION 0
63
42/* Minimum logical eraseblock size in bytes */ 64/* Minimum logical eraseblock size in bytes */
43#define UBIFS_MIN_LEB_SZ (15*1024) 65#define UBIFS_MIN_LEB_SZ (15*1024)
44 66
@@ -53,7 +75,7 @@
53 75
54/* 76/*
55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes 77 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
56 * shorter than uncompressed data length, UBIFS preferes to leave this data 78 * shorter than uncompressed data length, UBIFS prefers to leave this data
 57 * node uncompressed, because it'll be read faster. 79 * node uncompressed, because it'll be read faster.
58 */ 80 */
59#define UBIFS_MIN_COMPRESS_DIFF 64 81#define UBIFS_MIN_COMPRESS_DIFF 64
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
586 * @padding2: reserved for future, zeroes 608 * @padding2: reserved for future, zeroes
587 * @time_gran: time granularity in nanoseconds 609 * @time_gran: time granularity in nanoseconds
588 * @uuid: UUID generated when the file system image was created 610 * @uuid: UUID generated when the file system image was created
611 * @ro_compat_version: UBIFS R/O compatibility version
589 */ 612 */
590struct ubifs_sb_node { 613struct ubifs_sb_node {
591 struct ubifs_ch ch; 614 struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
612 __le64 rp_size; 635 __le64 rp_size;
613 __le32 time_gran; 636 __le32 time_gran;
614 __u8 uuid[16]; 637 __u8 uuid[16];
615 __u8 padding2[3972]; 638 __le32 ro_compat_version;
639 __u8 padding2[3968];
616} __attribute__ ((packed)); 640} __attribute__ ((packed));
617 641
618/** 642/**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29a..0a8341e14088 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
934 * by @commit_sem 934 * by @commit_sem
935 * @cnt_lock: protects @highest_inum and @max_sqnum counters 935 * @cnt_lock: protects @highest_inum and @max_sqnum counters
936 * @fmt_version: UBIFS on-flash format version 936 * @fmt_version: UBIFS on-flash format version
937 * @ro_compat_version: R/O compatibility version
937 * @uuid: UUID from super block 938 * @uuid: UUID from super block
938 * 939 *
939 * @lhead_lnum: log head logical eraseblock number 940 * @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
966 * recovery) 967 * recovery)
967 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
968 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) 969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
970 * @rw_incompat: the media is not R/W compatible
969 * 971 *
970 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 972 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
971 * @calc_idx_sz 973 * @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
1015 * @min_io_shift: number of bits in @min_io_size minus one 1017 * @min_io_shift: number of bits in @min_io_size minus one
1016 * @leb_size: logical eraseblock size in bytes 1018 * @leb_size: logical eraseblock size in bytes
1017 * @half_leb_size: half LEB size 1019 * @half_leb_size: half LEB size
1020 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1021 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
1018 * @leb_cnt: count of logical eraseblocks 1022 * @leb_cnt: count of logical eraseblocks
1019 * @max_leb_cnt: maximum count of logical eraseblocks 1023 * @max_leb_cnt: maximum count of logical eraseblocks
1020 * @old_leb_cnt: count of logical eraseblocks before re-size 1024 * @old_leb_cnt: count of logical eraseblocks before re-size
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
1132 * previous commit start 1136 * previous commit start
1133 * @uncat_list: list of un-categorized LEBs 1137 * @uncat_list: list of un-categorized LEBs
1134 * @empty_list: list of empty LEBs 1138 * @empty_list: list of empty LEBs
1135 * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) 1139 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1136 * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) 1140 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1137 * @freeable_cnt: number of freeable LEBs in @freeable_list 1141 * @freeable_cnt: number of freeable LEBs in @freeable_list
1138 * 1142 *
1139 * @ltab_lnum: LEB number of LPT's own lprops table 1143 * @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
1177 unsigned long long cmt_no; 1181 unsigned long long cmt_no;
1178 spinlock_t cnt_lock; 1182 spinlock_t cnt_lock;
1179 int fmt_version; 1183 int fmt_version;
1184 int ro_compat_version;
1180 unsigned char uuid[16]; 1185 unsigned char uuid[16];
1181 1186
1182 int lhead_lnum; 1187 int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
1205 unsigned int no_chk_data_crc:1; 1210 unsigned int no_chk_data_crc:1;
1206 unsigned int bulk_read:1; 1211 unsigned int bulk_read:1;
1207 unsigned int default_compr:2; 1212 unsigned int default_compr:2;
1213 unsigned int rw_incompat:1;
1208 1214
1209 struct mutex tnc_mutex; 1215 struct mutex tnc_mutex;
1210 struct ubifs_zbranch zroot; 1216 struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
1253 int min_io_shift; 1259 int min_io_shift;
1254 int leb_size; 1260 int leb_size;
1255 int half_leb_size; 1261 int half_leb_size;
1262 int idx_leb_size;
1256 int leb_cnt; 1263 int leb_cnt;
1257 int max_leb_cnt; 1264 int max_leb_cnt;
1258 int old_leb_cnt; 1265 int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1500long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1507long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1501 1508
1502/* find.c */ 1509/* find.c */
1503int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 1510int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
1504 int squeeze); 1511 int squeeze);
1505int ubifs_find_free_leb_for_idx(struct ubifs_info *c); 1512int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
1506int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 1513int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,